%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{dsfont}

% FONTS
\usepackage[T1]{fontenc}

\usepackage{tgtermes}
\usepackage{amsmath}
% \usepackage[subscriptcorrection,
%             amssymbols,
%             mtpbb,
%             mtpcal,
%             nofontinfo  % suppresses all warnings
%            ]{mtpro2}
% Enlarged small caps: save the original \textsc as \oldtextsc, then redefine
% \textsc so that its argument is additionally scaled by a factor of 1.10
% (via \scalefont) before being set in small caps.
\usepackage{scalefnt,letltxmacro}
\LetLtxMacro{\oldtextsc}{\textsc}
\renewcommand{\textsc}[1]{\oldtextsc{\scalefont{1.10}#1}}
\usepackage[scaled=0.92]{PTSans}
\usepackage{inconsolata}
\usepackage{mathbbol}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{thmtools,thm-restate}


% COLOR
% \usepackage[usenames,dvipsnames]{xcolor}
% \definecolor{shadecolor}{gray}{0.9}

% SPACING and TEXT
% \usepackage[final, expansion=alltext]{microtype}
% \usepackage[english]{babel}
% \usepackage[parfill]{parskip}
\usepackage{afterpage}
\usepackage{framed}
\usepackage{nicefrac}

% Redefine the leftbar environment so that it takes one optional argument:
% the width assigned to \hsize for the framed material (default: the current
% \hsize). The frame is a 3pt vertical rule drawn in the color `Gray`,
% followed by 10pt of horizontal space; there is no color argument.
% NOTE(review): `Gray` (capitalized) is a dvipsnames color, and the explicit
% xcolor load with that option is commented out in the COLOR section --
% confirm that the class or another package provides it.
\renewenvironment{leftbar}[1][\hsize] {%
  \def\FrameCommand {%
    {\color{Gray}\vrule width 3pt}%
    \hspace{10pt}%
    %\hspace{0pt}\fboxsep=\FrameSep\colorbox{black!10}%
  }%
  \MakeFramed{\hsize#1\advance\hsize-\width\FrameRestore}%
}%
{\endMakeFramed}

% \parhead{<title>}: run-in paragraph heading; typesets <title> in bold,
% followed by a non-breaking space.
\DeclareRobustCommand{\parhead}[1]{\textbf{#1}~}

% Paragraph markers: \PP and \pp have identical definitions; each prints a
% typewriter pilcrow in the color `Plum`, followed by a non-breaking space.
% NOTE(review): `Plum` is a dvipsnames color -- confirm it is available.
\DeclareRobustCommand{\PP}{\textcolor{Plum}{\texttt{\P}}~}
\DeclareRobustCommand{\pp}{\textcolor{Plum}{\texttt{\P}}~}

% COUNTERS
\renewcommand{\labelenumi}{\color{black!67}{\arabic{enumi}.}}
\renewcommand{\labelenumii}{{\color{black!67}(\alph{enumii})}}
\renewcommand{\labelitemi}{{\color{black!67}\textbullet}}

% FIGURES
\usepackage{graphicx}
\usepackage[labelfont=bf]{caption}
\usepackage[format=hang]{subcaption}

% TABLES
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{longtable}
\usepackage{etoolbox,siunitx}
\robustify\bfseries
\sisetup{detect-weight=true, detect-shape=true, detect-mode=true,
table-format=5.1, table-number-alignment=center}

% BIBLIOGRAPHY
\usepackage{natbib}
\usepackage{bibunits}

% ALGORITHMS
% \usepackage[algoruled]{algorithm2e}
\usepackage{listings}
\usepackage{fancyvrb}
\fvset{fontsize=\normalsize}
\usepackage{algorithm}
\usepackage{algorithmic}

% HYPERREF
% \usepackage[colorlinks, linktoc=all, hidelinks]{hyperref}
% \usepackage[all]{hypcap}
% \hypersetup{citecolor=Violet}
% \hypersetup{linkcolor=black}
% \hypersetup{urlcolor=MidnightBlue}

% CLEVEREF must come after HYPERREF
\usepackage[nameinlink]{cleveref}

% ACRONYMS
\usepackage[acronym,smallcaps,nowarn]{glossaries}
\glsdisablehyper{}
% \makeglossaries

% COLOR DEFINITIONS
\newcommand{\red}[1]{\textcolor{BrickRed}{#1}}
\newcommand{\orange}[1]{\textcolor{BurntOrange}{#1}}
\newcommand{\green}[1]{\textcolor{OliveGreen}{#1}}
\newcommand{\blue}[1]{\textcolor{MidnightBlue}{#1}}
\newcommand{\gray}[1]{\textcolor{black!60}{#1}}

% LISTINGS DEFINITIONS
% Defines the listings style `mystyle` and makes it the default via \lstset:
% scriptsize typewriter text, no line numbers, long lines broken (not only at
% whitespace), captions at the bottom, tabs rendered as 2 columns, and no
% visible markers for spaces or tabs.
% (listings is also loaded in the ALGORITHMS section; this second load is
% redundant but harmless.)
% NOTE(review): OliveGreen and BrickRed are dvipsnames colors -- confirm they
% are available.
\usepackage{listings}
\lstdefinestyle{mystyle}{
    commentstyle=\color{OliveGreen},
    numberstyle=\tiny\color{black!60},
    stringstyle=\color{BrickRed},
    basicstyle=\ttfamily\scriptsize,
    breakatwhitespace=false,
    breaklines=true,
    captionpos=b,
    keepspaces=true,
    numbers=none,
    numbersep=5pt,
    showspaces=false,
    showstringspaces=false,
    showtabs=false,
    tabsize=2
}
\lstset{style=mystyle}




\DeclareRobustCommand{\mb}[1]{\ensuremath{\mathbf{\boldsymbol{#1}}}}
% \DeclareRobustCommand{\mb}[1]{\mathbold{#1}}

% \KL{p}{q}: Kullback-Leibler divergence, typeset as KL(p || q) with
% auto-sized parentheses. \ensuremath makes it usable in text and math mode.
% The name is set with \mathrm rather than \textrm: \textrm inside math
% inherits the shape of the surrounding text, so "KL" would turn italic in
% italic contexts (e.g. theorem-like environments); \mathrm is always upright.
\DeclareRobustCommand{\KL}[2]{\ensuremath{\mathrm{KL}\left(#1\;\|\;#2\right)}}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Reference names used by cleveref's \cref (lowercase) and \Cref (capitalized)
% for the theorem-like environments declared with \newtheorem in this
% preamble. Every declared environment (lemma, thm, prop, defn, exmp,
% assumption, cor) gets an entry; the short environment names are not among
% cleveref's built-in type names, so a missing entry would leave references
% to that environment without a name.
\crefname{lemma}{lemma}{lemmas}
\Crefname{lemma}{Lemma}{Lemmas}
\crefname{thm}{theorem}{theorems}
\Crefname{thm}{Theorem}{Theorems}
\crefname{prop}{proposition}{propositions}
\Crefname{prop}{Proposition}{Propositions}
\crefname{defn}{definition}{definitions}
\Crefname{defn}{Definition}{Definitions}
\crefname{exmp}{example}{examples}
\Crefname{exmp}{Example}{Examples}
\crefname{assumption}{assumption}{assumptions}
\Crefname{assumption}{Assumption}{Assumptions}
\crefname{cor}{corollary}{corollaries}
\Crefname{cor}{Corollary}{Corollaries}


% Theorem-like environments. thm owns its counter; no parent counter is given,
% so the numbering is never reset. defn, prop, exmp, lemma and cor share thm's
% counter; assumption is numbered independently.
\newtheorem{thm}{Theorem} % own counter, no parent counter
\newtheorem{defn}[thm]{Definition} % shares the thm counter
\newtheorem{prop}[thm]{Proposition}
\newtheorem{exmp}[thm]{Example} % shares the thm counter
\newtheorem{lemma}[thm]{Lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{cor}[thm]{Corollary}
% \independent: relation symbol built from two \perp glyphs, the second
% shifted right by 2mu over the first; \mathpalette passes the current math
% style to the helper \independenT so the symbol scales in sub/superscripts.
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}


\renewcommand{\mid}{~\vert~}
\newcommand{\prm}{\:;\:}

\newcommand{\mbw}{\mb{w}}
\newcommand{\mbW}{\mb{W}}

\newcommand{\mbx}{\mb{x}}
\newcommand{\mbX}{\mb{X}}

\newcommand{\mby}{\mb{y}}
\newcommand{\mbY}{\mb{Y}}

\newcommand{\mbz}{\mb{z}}
\newcommand{\mbZ}{\mb{Z}}

\newcommand{\mbI}{\mb{I}}
\newcommand{\mbone}{\mb{1}}

\newcommand{\mbL}{\mb{L}}

\newcommand{\mbtheta}{\mb{\theta}}
\newcommand{\mbTheta}{\mb{\Theta}}
\newcommand{\mbomega}{\mb{\omega}}
\newcommand{\mbOmega}{\mb{\Omega}}
\newcommand{\mbsigma}{\mb{\sigma}}
\newcommand{\mbSigma}{\mb{\Sigma}}
\newcommand{\mbphi}{\mb{\phi}}
\newcommand{\mbPhi}{\mb{\Phi}}

\newcommand{\mbalpha}{\mb{\alpha}}
\newcommand{\mbbeta}{\mb{\beta}}
\newcommand{\mbgamma}{\mb{\gamma}}
\newcommand{\mbeta}{\mb{\eta}}
\newcommand{\mbmu}{\mb{\mu}}
\newcommand{\mbrho}{\mb{\rho}}
\newcommand{\mblambda}{\mb{\lambda}}
\newcommand{\mbzeta}{\mb{\zeta}}

\newcommand\dif{\mathop{}\!\mathrm{d}}
\newcommand{\diag}{\textrm{diag}}
\newcommand{\supp}{\textrm{supp}}

\newcommand{\E}{\mathbb{E}}
\newcommand{\V}{\mathbb{V}}
\newcommand{\bbH}{\mathbb{H}}

\newcommand{\bbN}{\mathbb{N}}
\newcommand{\bbZ}{\mathbb{Z}}
\newcommand{\bbR}{\mathbb{R}}
\newcommand{\bbS}{\mathbb{S}}

\newcommand{\cL}{\mathcal{L}}

\newcommand{\cN}{\mathcal{N}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\Gam}{\textrm{Gam}}
\newcommand{\InvGam}{\textrm{InvGam}}

% \newcommand{\qedsymbol}{\rule{0.7em}{0.7em}}

\newcommand{\g}{\, | \,}
\newcommand{\s}{\, ; \,}




\newacronym{KL}{kl}{Kullback-Leibler}
\newacronym{ELBO}{elbo}{\emph{evidence lower bound}}
\newacronym{POPELBO}{pop-elbo}{\emph{population evidence lower bound}}
\newacronym{PROELBO}{pro-elbo}{\emph{profile evidence lower bound}}

\newacronym{SVI}{svi}{stochastic variational inference}
\newacronym{VI}{vi}{variational inference}

\newacronym{ADVI}{advi}{automatic differentiation variational inference}

\newacronym{GMM}{gmm}{Gaussian mixture model}
\newacronym{LDA}{lda}{latent Dirichlet allocation}

\newacronym{SMC}{smc}{Sequential Monte Carlo}
\newacronym{VB}{vb}{variational Bayes}

\newacronym{TDVI}{tdvi}{transdimensional variational inference}
\newacronym{STDVI}{stdvi}{sequential transdimensional variational inference}
\newacronym{MCMC}{mcmc}{Markov chain Monte Carlo}
\newacronym{RJMCMC}{rjmcmc}{reversible jump Markov chain Monte Carlo}
\newacronym{TDMCMC}{tdmcmc}{transdimensional Markov chain Monte Carlo}

\newacronym{SLDS}{slds}{switching linear dynamical system}
\newacronym{HDP-SLDS}{hdp-slds}{hierarchical Dirichlet process switching linear dynamical system}
\newacronym{MDP}{mdp}{Markov decision process}


\newacronym{MLM}{mlm}{masked language model}
\newacronym{CBOW}{cbow}{continuous bag of words}
\newacronym{MoE}{moe}{mixture-of-experts}
\newacronym{SGNS}{sgns}{skip-gram with negative sampling}
\newacronym{LLM}{llm}{large language model}
\newacronym{OOD}{ood}{out-of-distribution}
\newacronym{AWE}{awe}{attention word embedding}





\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usepackage{pgfplots}
\pgfplotsset{compat=newest}
\pgfplotsset{plot coordinates/math parser=false}
\usepgfplotslibrary{statistics}

\pgfdeclarelayer{edgelayer}
\pgfdeclarelayer{nodelayer}
\pgfsetlayers{edgelayer,nodelayer,main}

\definecolor{hexcolor0xbfbfbf}{rgb}{0.749,0.749,0.749}

% TikZ defaults and named styles.
% Arrow tips default to `latex`. Edge styles: line (no tip), arrow (->),
% ardash (dotted ->); all are thick and shortened by 1pt at both ends.
% Node styles: none, empty, box, filled, hollow, param, paramhollow.
% NOTE(review): White and Black (capitalized) are dvipsnames colors -- confirm
% they are available.
\tikzset{>=latex}
\tikzstyle{none}   = [inner sep=0pt]
\tikzstyle{line}   = [ thick, -, shorten <=1pt, shorten >=1pt ]
\tikzstyle{arrow}  = [ thick,  ->, shorten <=1pt, shorten >=1pt ]
% `thick` and `dotted` are separate keys; TikZ has no `thick dotted` key, so
% they must be comma-separated for the style to be usable.
\tikzstyle{ardash} = [ thick, dotted, ->, shorten <=1pt, shorten >=1pt ]

\tikzstyle{empty}=[circle,opacity=0.0,text opacity=1.0,minimum width=4pt,minimum height=4pt]
\tikzstyle{box}=[rectangle,fill=White,draw=Black]
\tikzstyle{filled}=[circle,fill=hexcolor0xbfbfbf,draw=Black]
\tikzstyle{hollow}=[circle,fill=White,draw=Black]
\tikzstyle{param}=[rectangle,fill=Black,draw=Black,inner sep=0pt,minimum width=4pt,minimum height=4pt]
\tikzstyle{paramhollow}=[rectangle,fill=White,draw=Black,inner sep=0pt,minimum
width=4pt,minimum height=4pt]



\usepackage{xr}
\externaldocument{wibisono_672-supp}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\title{Bidirectional Attention as a Mixture of Continuous Word Experts}

% yw / i changed the title a little for grammatical reasons

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1]{Kevin Christian Wibisono}
\author[1]{Yixin Wang}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    University of Michigan\\
    Ann Arbor, MI, USA~\thanks{Correspondence to: \{kwib,yixinw\}@umich.edu; Software that
  replicates the empirical studies is at
  \texttt{https://github.com/yixinw-lab/attention-uai}.}
}

% yw / please fix the capitalization in the bibs. (e.g. bert needs to
% be capitalized; first word after colon in the title needs to be
% capitalized; the journal name should be properly capitalized etc
% etc) and go through all the submission checklist in the github repo
% https://github.com/yixinw-lab/yixinw-lab/blob/main/checklist-before-submitting-papers.md
  
\begin{document}
\maketitle


\begin{abstract}
Bidirectional attention---composed of the neural network architecture
of self-attention with positional encodings, together with the
\gls{MLM} objective---has emerged as a key component of modern
\glspl{LLM}. Despite its empirical success, few studies have examined
its statistical underpinnings: What statistical model is bidirectional
attention implicitly fitting? What sets it apart from its
non-attention predecessors? We explore these questions in this paper.
The key observation is that fitting a single-layer single-head
bidirectional attention, upon reparameterization, is equivalent to
fitting a \gls{CBOW} model with \gls{MoE} weights. Further,
bidirectional attention with multiple heads and multiple layers is
equivalent to stacked \glspl{MoE} and a mixture of
\glspl{MoE}, respectively. This statistical viewpoint reveals the
distinct use of \gls{MoE} in bidirectional attention, which aligns
with its practical effectiveness in handling heterogeneous data. It
also suggests an immediate extension to categorical tabular data, if
we view each word location in a sentence as a tabular feature. Across
empirical studies, we find that this extension outperforms existing
tabular extensions of transformers in \gls{OOD} generalization.
Finally, this statistical perspective of bidirectional attention
enables us to theoretically characterize when linear word analogies
are present in its word embeddings. These analyses show that
bidirectional attention can require much stronger assumptions to
exhibit linear word analogies than its non-attention predecessors.
\end{abstract}

\begin{bibunit}[abbrvnat]



\section{Introduction}
\glsresetall

\label{sec:intro}

Bidirectional attention has recently emerged as a cornerstone in the
construction of \glspl{LLM}. It is composed of the self-attention
mechanism with positional encodings, and is trained with the \gls{MLM}
objective. First introduced by \citet{vaswani2017attention}, the
attention-based architecture represents a departure from the
traditional recurrent or convolutional neural networks in language
modeling. This architecture has since become the backbone of many
large language models, including BERT~\citep{devlin2018bert},
RoBERTa~\citep{liu2019roberta}, and GPT-2~\citep{radford2019language};
all of them have achieved exceptional performance in natural language
processing benchmarks.

At the heart of bidirectional attention lies the self-attention
mechanism; it creates a holistic representation of a sentence by
capturing pairwise relationships between tokens in each sentence.
Equally important to bidirectional attention are positional encodings,
supplying word ordering information that allows bidirectional
attention to move beyond bag-of-words. Finally, bidirectional
attention employs the \gls{MLM} objective. It is a self-supervised
learning objective for unlabelled text data, optimizing the model's
predictive accuracy on randomly masked words within each sentence.

Despite the empirical success of attention-based language models, few
works have examined their statistical underpinnings: What statistical
models are these attention-based models implicitly fitting? What sets
these models apart from their non-attention predecessors like
\gls{CBOW}~\citep{mikolov2013distributed}? How does the use of the
self-attention mechanism contribute to their empirical success? We
explore these questions in this work.

\parhead{Main idea.} We theoretically study bidirectional attention,
i.e., the self-attention module that is accompanied by positional
encodings and is trained using
\gls{MLM}. The key observation is that fitting a single-head and
single-layer bidirectional attention, upon reparametrization, is
equivalent to fitting a
\gls{CBOW} model with \gls{MoE} weights~\citep{jacobs1991adaptive}.
Moreover, bidirectional attention with multiple heads and multiple
layers is equivalent to stacked \glspl{MoE} and a mixture of
\glspl{MoE}, respectively. These analyses reveal the distinct use of
\gls{MoE} in bidirectional attention as compared with its
non-attention predecessor; they partially explain its practical
effectiveness in capturing heterogeneous patterns in natural
language~\citep{devlin2018bert,liu2019roberta}.

This statistical interpretation of bidirectional attention suggests an
immediate extension of bidirectional attention to (categorical)
tabular data: one can view each word location in a sentence as a
tabular feature, and each word as the value that the feature takes.
Across empirical studies, we find that this tabular extension of
attention improves \gls{OOD} generalization, compared with existing
tabular data algorithms or tabular extensions of attention. Moreover,
this tabular extension of attention facilitates the integration of
heterogeneous datasets with partially overlapping features: the
learned feature encodings (akin to the positional encodings in the
original attention module) bring all features into the same embedding
space.


Finally, this connection between bidirectional attention and
\gls{CBOW}+\gls{MoE} empowers us to theoretically characterize when linear
word analogies (e.g., $\mathrm{king} - \mathrm{man} +
\mathrm{woman} \approx
\mathrm{queen}$) can be present in its word embeddings. We draw on a
classical finding in \citet{levy2014neural}: the similarity between
two tokens from word2vec embeddings is equal to their pointwise mutual
information~\citep{church1990word}, provided that the embeddings have
sufficient dimensionality and the models were trained using the
\gls{SGNS} objective. This result enables us to analyze the embeddings
of bidirectional attention, given their connections to \gls{CBOW}.
Adopting the paraphrasing argument of \citet{allen2019analogies} for
\gls{SGNS}, we characterize the conditions under which both \gls{CBOW}
and attention-based embeddings exhibit linear word analogies. We show
that bidirectional attention can require much stronger assumptions to
exhibit linear word analogies than its non-attention predecessors.
These results partially explain the empirical observations that
bidirectional attention may not always achieve meaningful improvements
over classical word embeddings in capturing abstract and complex
relationships~\citep{ushio2021bert}.


\parhead{Contributions.} We prove that bidirectional attention, upon
reparametrization, is equivalent to \gls{CBOW} with \gls{MoE} weights.
Moreover, bidirectional attention with multiple heads and multiple
layers is equivalent to stacked MoEs and a mixture of MoEs,
respectively. This statistical interpretation with \gls{MoE} partially
explains the power of bidirectional attention in handling
heterogeneous data. Further, it suggests an immediate extension of
bidirectional attention to categorical tabular data. Across empirical
studies, it outperforms existing tabular algorithms or tabular
extensions of attention in \gls{OOD} generalization. Finally, we
leverage this statistical perspective of attention to characterize the
presence of linear word analogies in word embeddings. We show that
bidirectional attention can require much stronger assumptions to
exhibit linear word analogies than its non-attention predecessors.
These results align with the empirical observations that bidirectional
attention can sometimes perform worse in complex analogy tasks than
classical word embeddings.




\parhead{Related work.} Our work draws on three themes around
attention-based models.

The first is a body of work on the theoretical foundations of
attention-based models. \citet{elhage2021mathematical} analyzed how
the different components of decoder-only attention-based architectures
relate to each other. \citet{edelman2022inductive} provided a rigorous
justification of the ability of attention-based architectures to
represent sparse functions. \citet{tsai2019transformer} viewed
attention through the perspective of kernels. \citet{peng2020a}
established a connection between the use of multiple heads in
transformers and \gls{MoE}. \citet{li2023transformers} showed that the
embedding and self-attention layers in a transformer architecture are
capable of capturing topic structures.
\citet{bai2023transformers,bietti2023birth,xie2022explanation,han2023incontext}
provided theoretical analyses about the in-context learning ability of
attention-based models. In contrast to these works, we provide a
statistical interpretation of the bidirectional attention objective,
showing that fitting a single-layer single-head attention-based
architecture is equivalent to fitting a \gls{CBOW} model with
\gls{MoE} weights; this statistical interpretation provides a
theoretical basis for the empirical effectiveness of bidirectional
attention in handling heterogeneous
data~\citep{devlin2018bert,liu2019roberta}.

The second theme is the extension of attention-based models to
tabular data. One prominent work along this line is TabTransformer
\citep{huang2020tab}, which utilizes a concatenation of token embeddings and unique feature
identifiers---in lieu of positional encodings---to learn contextual
embeddings for categorical features with self-attention. Different from TabTransformer, we
view each word location in a sentence as a tabular feature; our
extension thus represents each feature in tabular data via an encoding
akin to the positional encodings. Other tabular extensions of
self-attention include FTTransformer (tokenizing each feature,
applying transformer layers, and using the \texttt{[CLS]} token for
prediction)~\citep{gorishniy2021revisiting}, AutoInt (mapping all
features into the same space and applying self-attention to model
between-feature interactions)~\citep{song2019autoint} and TabNet
(utilizing sequential attention for feature selection in different
learning steps) \citep{arik2020tabnet}. Compared with these existing
approaches, our approach is more robust to covariate shifts across
empirical studies; it also facilitates the integration of heterogeneous
datasets with partially overlapping features.


The third theme relates to linear word analogy structures in word
embeddings. Neural word embeddings such as word2vec
\citep{mikolov2013distributed} and GloVe \citep{pennington2014glove}
have been empirically shown to exhibit linear structures, often
manifested through analogies. Concretely, given an analogy
``\textit{a} is to \textit{b} as \textit{c} is to \textit{d}'', we
often find $w_b + w_c - w_a \approx w_d$, where $w_i$ denotes the
embedding of word $i \in \{a,b,c,d\}$. Many works provide theoretical
justifications for this phenomenon. \citet{arora2016a} offered a
latent variable argument, assuming that texts are generated from
random walks of discourse vectors and word vectors are spatially
isotropic. \citet{ethayarajh2018towards} introduced the
\textit{co-occurrence shifted PMI} concept which characterizes when
linear analogy holds in
\gls{SGNS} and GloVe. \citet{allen2019analogies} adopted the
paraphrasing framework of \citet{gittens2017skip} and used
\textit{word transformation} to connect linear analogy in \gls{SGNS}
with paraphrases. In contrast to these existing works, our work moves
beyond  \gls{SGNS} and GloVe; we characterize when linear word
analogies may be present in \gls{CBOW} and attention-based embeddings.



\section{Bidirectional attention as a mixture of continuous word
experts}
\label{sec:bid-attn-CBOW}

In this section, we first review bidirectional attention, a language
model composed of the self-attention architecture, positional
encodings, and the use of the \gls{MLM} training objective.  En route, we
derive an explicit form of the \gls{MLM} objective for a single-layer
single-head attention-based architecture in \Cref{sec:der-mlm-obj}. We
then formally establish the equivalence between fitting bidirectional
attention and fitting the \gls{CBOW} model with \gls{MoE} weights in
\Cref{sec:mlm-as-cbow}, with extensions to multi-head and multi-layer
attention-based architectures.


\subsection{Bidirectional attention: Self-attention, positional
encodings, and the \gls{MLM} objective}
\label{sec:der-mlm-obj}


We begin with describing the structure of bidirectional
attention---self-attention, positional encodings, and the \gls{MLM}
objective---in the context of language modeling.
(\Cref{sec:app-sum-not} contains a summary of the notations used in
this section.)

\parhead{Building blocks of bidirectional attention.} Consider a
corpus that consists of sentences of length $S$, with a vocabulary
size of $|V|$. The self-attention mechanism takes sentences and
outputs their sentence embeddings, by transforming the token
embeddings and positional encodings of each token in the sentence. We
denote $C \in \mathbb{R}^{(|V| + 1) \times p}$ as the matrix such that
each row $c_i^\top$ corresponds to the token embedding of the $i$-th
token in the vocabulary. The $(|V|+1)$-th token is the \texttt{[MASK]}
token, representing a token in the training corpus that is masked.
Further denote $P \in \mathbb{R}^{S \times p}$ as the positional
encoding matrix.

To learn these token embeddings and positional encodings,
bidirectional attention employs an \gls{MLM} objective: it
masks a random subset of the tokens in the training corpus; then it
aims to predict these masked tokens from the sentence embeddings,
which are produced by transforming the token embeddings and
positional encodings through the attention mechanisms. To
operationalize the \gls{MLM} objective, we use $\overline{X} \in
\{0,1\}^{S \times (|V| + 1)}$ to denote the one-hot encoding matrix of
the $S$ tokens (including the masked tokens) in each sentence. For
notational simplicity, we consider a simple masking strategy: each
sentence produces $S$ prediction tasks in the \gls{MLM} objective,
each of which involves masking exactly one of the $S$ positions in the
sentence and predicting the token in that position. (Results in this
section can be easily generalized to general masking strategies.)

\parhead{Predicting masked tokens with self-attention.} We next
describe how the self-attention mechanism (with positional encodings)
produces predictions of masked tokens. For ease of exposition, we
focus on a single-head single-layer attention module. It takes in
$\overline{X}$, the one-hot encoding matrix of the $S$ tokens in a
sentence (including the masked tokens); it then outputs a probability
vector $\hat{y}\in
\Delta^{|V|}$ as a prediction of the masked token, indicating the probability of the
masked token being each of the $|V|$ words in the vocabulary.

The self-attention architecture transforms $\overline{X}$ into the
prediction $\hat{y}$ through the following steps:
\begin{enumerate}[leftmargin=*]
    \item \textbf{Token embeddings with positional encodings}: Produce
    a matrix consisting of the token embeddings of all the tokens in the
    masked sentence: $X = \overline{X} C \in \mathbb{R}^{S \times p}$.
    Then add positional encodings to the matrix: $X' = X + P$.
    % \in
    % \mathbb{R}^{S \times p}$.
    \item \textbf{Sentence embeddings with attention weight matrices:}
    Employing value mapping $W^{V} \in \mathbb{R}^{d \times p}$, query
    mapping $W^{Q} \in
    \mathbb{R}^{d_w \times p}$, and key mapping $W^{K} \in
    \mathbb{R}^{d_w \times p}$, we obtain the sentence embedding
     $X^{\textrm{attn}}\in \mathbb{R}^{S \times d}$ after applying the
     attention weights:
    \[X^{\textrm{attn}} = \textrm{softmax}\left( \frac{X' (W^{Q})^\top W^{K} (X')^\top}{\sqrt{d_w}} \right) X' (W^{V})^\top ,\] where the softmax is taken row-wise.
    \item \textbf{Intermediate representations with residual
    connections:} Obtain an intermediate representation  with
    coefficient matrix $W^O
    \in \mathbb{R}^{d \times p}$ and a residual connection: $Z =
    X^{\textrm{attn}} W^O
    \in
    \mathbb{R}^{S \times p}$; then $Z' = X' + Z \in \mathbb{R}^{S
    \times p}$. 
    \item \textbf{Final predictions with linear layer and residual
    connections.} For each position $i \in [S]$ of the sentence, apply
    a linear layer $\mathrm{LIN}_1(Z_i') = W' Z_i' \in \mathbb{R}^{p}$
    with a weight matrix $W' \in \mathbb{R}^{p \times p}$; then
    another residual connection $Z'' = Z' + \mathrm{LIN}_1(Z') \in
    \mathbb{R}^{S \times p}$; finally another linear layer and softmax
    operation \[\hat{y} = \mathrm{softmax}(\mathrm{LIN}_2(Z''_i)),\] where $\mathrm{LIN}_2(Z''_i) = W''
    Z''_i \in \mathbb{R}^{|V|}$ with weight matrix $W'' \in
    \mathbb{R}^{|V| \times p}$.
\end{enumerate}

Given the self-attention transformations from input sentences
$\overline{X}$ to masked token predictions $\hat{y}$, bidirectional
attention learns the token embeddings, positional encodings, and
weight matrices by optimizing the cross entropy loss of $\hat{y}$ in
predicting the masked tokens. This loss objective is also known as the
\gls{MLM} objective.



% \label{sec:app-simple-attn}


\parhead{The loss objective of bidirectional attention.} We next
derive an explicit form for the loss objective of bidirectional
attention. This derivation will pave the road for the statistical
interpretations of bidirectional attention.

In more detail, we consider an input-output pair $(\overline{X},
\overline{y})$ for the masked token prediction task, where
$\overline{X}$ is the one-hot encoding matrix of all the tokens in the
sentence, and $\overline{y} \in
\{0,1\}^{|V|}$ is the one-hot encoding of the token being masked. We
denote $m \in [S]$ and $b \in [|V|]$ as the masked position and masked
token, respectively. \Cref{prop:mlm-obj} below derives an explicit form
of the \gls{MLM} objective $L_{\gls{MLM}}(m,b)$.


% , and each word $i$ in the vocabulary is represented by a token
% embedding $x_i \in \mathbb{R}^p$. The \gls{MLM} framework involves
% randomly substituting some tokens in each sentence with the
% \texttt{[MASK]} token. This suggests a need for a separate embedding
% for that token, which we denote by $x_{|V|+1} \in \mathbb{R}^p$.  In
% an attempt to build a connection with \gls{CBOW}, we mask the word
% in each position of each sentence one at a time and aim to predict
% the masked word. The detail of the architecture is given in
% \Cref{sec:app-simple-attn}.


\begin{lemma}[The loss objective of bidirectional attention]
\label{prop:mlm-obj}
Upon reparametrization, the \gls{MLM} objective for predicting token
$b$ in the $m$th position is given by
\begin{align*}
L_{\gls{MLM}}&(m,b) = - \frac{\sum_{j=1}^S\theta(j,m)\chi(j,m,b) }{\sum_{j=1}^S \theta(j,m)}\\
&+\log\left( \sum_{k=1}^{|V|} \exp\left( \frac{\sum_{j=1}^S\theta(j,m)\chi(j,m,k) }{\sum_{j=1}^S \theta(j,m)} \right)\right),
\end{align*}
where
\begin{align*}
\theta(j,m) &\triangleq \exp\left(\frac{ e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right),\\
\chi(j,m,k) &\triangleq \left( W^{LOV} (\overline{X} C+P)^\top e_j + g + D e_m \right)_k,
\end{align*}
and $g \in \mathbb{R}^{|V|}$, $D \in \mathbb{R}^{|V| \times S}$,
$W^{LOV} \in \mathbb{R}^{|V| \times p}$, $W^{KQ} \in \mathbb{R}^{p
\times p}$; $e_j \in \{0,1\}^{S}$ denotes a zero vector with 1 on the
$j$-th entry. (The proof is in \Cref{sec:app-proof-mlm-objective}.)
\end{lemma}


\Cref{prop:mlm-obj} performs a reparametrization over the weight
matrices $W^V, W^Q, W^K, W^O$, arriving at an explicit form of the
\gls{MLM} objective with only two weight matrices $W^{KQ}, W^{LOV}$.
\Cref{prop:mlm-obj} also reveals two key components of the \gls{MLM}
objective: $\theta(j,m)$, the attention weight of token $m$ on token
$j$, and $\chi(j,m,\cdot)$, the similarity between token $m$ and token
$j$. These quantities will play a key role in facilitating the
statistical interpretation of bidirectional attention.

\subsection{Bidirectional attention as a mixture of continuous word
experts}
\label{sec:mlm-as-cbow}


\glsreset{CBOW}
\glsreset{MoE}

Building on the derivations in \Cref{prop:mlm-obj}, we next establish
the equivalence between the loss objective of bidirectional attention
and that of the \gls{CBOW} model with
\gls{MoE} weights. This equivalence will enable us to interpret
bidirectional attention as fitting a statistical model of
\gls{CBOW}+\gls{MoE}.

\parhead{The continuous bag of words model (\gls{CBOW}).} We begin with reviewing
the \gls{CBOW} formulation of word2vec~\citep{mikolov2013distributed}.
\gls{CBOW} aims to predict the center token based on the
surrounding tokens (a.k.a. context tokens). It has two parameter
matrices, representing the center and context embeddings respectively. 

In more detail, we consider an input-output pair $(\overline{X},
\overline{y})$ as in \Cref{sec:der-mlm-obj}, where $m \in [S]$ and $b
\in [|V|]$ represent the masked position and masked token. We note
that, while masking is never employed in \gls{CBOW}, introducing
masking into \gls{CBOW} does not change its objective. The reason is
that the context of a token in \gls{CBOW} does not include the token
itself. Thus,  with window size $w$, the loss objective for predicting
the token in the $m$th position of \gls{CBOW} (a.k.a. the negative
log-likelihood) is
\begin{align*}
L_{\gls{CBOW}}(m,b)=\log &\left( \sum_{k=1}^{|V|} \exp \left( \sum_{j=1}^S \frac{\omega_{j,m,w}\xi(j, k)}{\sum_{j'=1}^S \omega_{j',m,w}} \right)\right) \\
&-\sum_{j=1}^S \frac{\omega_{j,m,w}\xi(j, b)}{\sum_{j'=1}^S \omega_{j',m,w}},
\end{align*}
\begin{align*}
\text{where  }\qquad \omega_{j,m,w} &= \mathds{1}(1 \leq |j-m| \leq w),\\
\xi(j,k) &= \left( W^{LOV} (\overline{X} C)^\top e_j \right)_k,
\end{align*}
if we denote the center and context matrices by $W^{LOV}$ and~$C$ to
match the notations of bidirectional attention.

\parhead{Weight and similarity matrices in \gls{CBOW} and
bidirectional attention.} The \gls{CBOW} model appears related to
bidirectional attention: it admits natural notions of (attention)
\textit{weight} and (token) \textit{similarity} as in bidirectional
attention. Specifically, the \textit{weight} of the token in position
$j \in [S]$ is determined by the distance between $j$ and $m$ and the
number of integers between $m-w$ and $m+w$ (inclusive) that are within
the range $[1,S]$. The \textit{similarity} of token $\alpha \in [|V|]$
in the center and token $\beta \in [|V|]$ in the context is
$(W^{LOV}_\alpha)^\top c_\beta$, regardless of their positions in the
sentence.

To compare \gls{CBOW} and bidirectional attention, we next inspect the
weight matrices in the \gls{MLM} objective of bidirectional attention.
Specifically, the weight of the token in position $j$ in
$L_{\gls{MLM}}$ is given by\footnote{The weight and similarity
matrices can take other parametric forms; e.g.
\citet{sonkar2020attention} uses a different weight function that
depends on the center token $b$ in their \gls{AWE} model.}
\begin{align*}
    \frac{\exp\left(e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)/\sqrt{d_w}\right)}{\sum_{j'=1}^S \exp\left( e_{j'}^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)/\sqrt{d_w} \right)}.
\end{align*}
Unlike that of \gls{CBOW}, this weight matrix of bidirectional
attention depends on all tokens in the masked sentence and their
corresponding positions. Yet, it does not depend on the center
(masked) token $b$. Further, the term inside the $\exp(\cdot)$ can be
decomposed into four components: (1) $e_j^\top \overline{X} C W^{KQ}
c_{|V|+1}/\sqrt{d_w}$, which depends only on the token in position $j$;
(2) $e_j^\top
\overline{X} C W^{KQ} P^\top e_m/\sqrt{d_w}$, which depends on both
the token in position $j$ and the masked position $m$; (3) $e_j^\top P W^{KQ} c_{|V|+1}
/\sqrt{d_w}$, which depends only on position $j$; and (4) $e_j^\top P
W^{KQ} P^\top e_m/\sqrt{d_w}$, which depends on both position $j$ and
position $m$. 

The similarity matrix of bidirectional attention also appears related
to that of \gls{CBOW}. In bidirectional attention, the similarity of
token $\alpha$ in the center (in position $m$) and token $\beta$ in the
context (in position $j$) is given by $(W_\alpha^{LOV})^\top c_\beta +
(W_\alpha^{LOV})^\top P^\top e_j + g_\alpha + (D_\alpha)^\top e_m$,
which also contains four components as above. Moreover, the first
component coincides with the similarity matrix of \gls{CBOW}.

\parhead{Bidirectional attention as a mixture of continuous word
experts.} Following these observations that bidirectional attention
appears closely related to \gls{CBOW}, we conclude this section with
\Cref{prop:mlm-moe-equiv}: it proves that the \gls{MLM} objective of
bidirectional attention in \Cref{prop:mlm-obj} is equivalent to the
\gls{CBOW} objective with \gls{MoE} weights, where the token in each
position serves as an expert.

\glsreset{MoE}

\begin{thm}[Bidirectional attention as a mixture of continuous word experts]
\label{prop:mlm-moe-equiv}
The \gls{MLM} objective of bidirectional attention is equivalent to the cross-entropy loss between the token being masked $\overline{y}$ and the prediction probabilities $\textrm{softmax}(F(\overline{X}))$ from a \gls{MoE} predictor: 
\[F(\overline{X}) = \sum_{j \in [S]} \pi_j(\overline{X}) f_j(\overline{X}),\]
where the $j$th expert $f_j(\overline{X})$ relies on the embedding of
the token in position $j$,
\begin{align*}
    f_j(\overline{X}) = W^{LOV} (\overline{X} C+P)^\top e_j + g + D e_m,
\end{align*}
and its weight (namely the contribution of expert $j$ to the prediction)
is $\pi_j(\overline{X}) =
\left(\mathrm{softmax}(h(\overline{X}))\right)_j$ with
\begin{align*}
    h_j(\overline{X}) = e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)/\sqrt{d_w}.
\end{align*}
\end{thm}

\Cref{prop:mlm-moe-equiv} is an immediate consequence of
\Cref{prop:mlm-obj}. It formally establishes the equivalence between
bidirectional attention and \gls{CBOW}+\gls{MoE}, enabling a
statistical interpretation of bidirectional attention. In particular,
\Cref{prop:mlm-moe-equiv} reveals the distinct use of \gls{MoE}, a
machine learning technique that excels at handling heterogeneous data,
in bidirectional attention. It thus can partially explain
the empirical effectiveness of attention-based models in capturing
heterogeneous patterns in complex natural language
data~\citep{devlin2018bert,liu2019roberta}.

\parhead{Extensions to multi-head and multi-layer bidirectional
attention.} We finally extend \Cref{prop:mlm-moe-equiv} to multi-head
and multi-layer bidirectional attention. For bidirectional attention
with multiple attention heads, its \gls{MLM} objective can be shown to
be equivalent to a stacked \gls{MoE} of \gls{CBOW}. For example, for
bidirectional attention with two attention heads, its \gls{MLM}
objective is equivalent to the cross-entropy loss with the following
stacked \gls{MoE} predictor: 
\[F(\overline{X}) = \sum_{j \in [S]}
\pi^1_j(\overline{X}) f^1_j(\overline{X}) + \sum_{j \in [S]}
\pi^2_j(\overline{X}) f^2_j(\overline{X}),\]
where the $j$th expert of the $i$th head is
\begin{align*}
    f_j^i(\overline{X}) = W^{LOV_i} (\overline{X} C+P)^\top e_j + \frac{g}{2} + \frac{D e_m}{2},
\end{align*}
whose \gls{MoE} weight is $\pi^i_j(\overline{X}) =
\left(\textrm{softmax}(h^i(\overline{X}))\right)_j$ with
\begin{align*}
    h_j^i(\overline{X}) = e_j^\top (\overline{X} C+P) W^{KQ_i} (c_{|V|+1} + P^\top e_m)/\sqrt{d_w}.
\end{align*} 

Following similar derivations, one can show that bidirectional
attention with multiple attention layers is equivalent to a mixture of
\glspl{MoE}.

% To see this, it suffices to argue that a two-layer attention architecture is equivalent to following steps 1 to 6 of \Cref{sec:app-simple-attn}, succeeded by steps 2 to 8 with $X'$ replaced by the previously obtained $Z''$.



\section{Bidirectional attention for tabular data}

The equivalence between \gls{MLM} with self-attention and \gls{CBOW}
with \gls{MoE} weights (\Cref{prop:mlm-moe-equiv}) suggests an
immediate extension to categorical tabular data. We develop this
tabular extension in this section. Across empirical studies, we find
that this tabular extension of attention achieves significant
improvement in \gls{OOD} generalization over existing methods,
including existing algorithms for tabular data (e.g. random forest,
gradient boosting) and existing tabular generalizations of attention
modules (e.g. TabTransformer, FTTransformer).

% yields some methodological insights: (1) it enables us to integrate probabilistic models into attention-based models, thanks to the probabilistic formmulation of \gls{CBOW}---namely exponential family embeddings \citep{rudolph2016exponential}; and (2) it allows for immediate generalization of attention-based architectures to model tabular data. In this section, we explore both ideas in more detail.

% \subsection{A probabilistic view of attention-based models}
% \citet{rudolph2016exponential} proposed exponential family embeddings, a probabilistic framework of which \gls{CBOW} is a special case. A concise summary of this framework is given in \Cref{sec:app-exp-fam-and-cbow}. We demonstrate that the \gls{MLM} objective in \Cref{prop:mlm-obj} can also be framed in terms of exponential family embeddings. Following the notations in \Cref{sec:app-exp-fam-and-cbow}, let our masked sentence be $x = x_{1:I}$, where $x_i \in \mathbb{R}^{D+1}$ due to the introduction of the \texttt{[MASK]} token. Suppose we mask the $i$-th word in the sentence, i.e., $x_i = (0, \cdots, 0, 1)^\top$. Let $\bar{x}_i \in \mathbb{R}^D$ be the one-hot encoding of the masked word, $K$ be the embedding dimension, and $c_i = [I]$. We then have $p(\tilde{x}_i | x_{c_i}) \sim \textrm{Cat}(\eta_i)$, where
% \begin{equation*}
% \begin{split}
%     \eta_{i} =  \sum_{j=1}^S &\frac{\exp\left( \frac{(\alpha x_j + p_j)^\top W^{KQ} (\alpha x_i + p_i)}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{(\alpha x_j + p_j)^\top W^{KQ} (\alpha x_i + p_i)}{\sqrt{d_w}} \right)} \\ &\left( W^{LOV} (\alpha x_j + p_j) + c + W^\ell p_i \right),
% \end{split}
% \end{equation*}
% with $\alpha \in \mathbb{R}^{K \times (D+1)}$, $p \in \mathbb{R}^{K \times I}$, $W^{LOV}, W^\ell \in \mathbb{R}^{D \times K}$, $W^{KQ} \in \mathbb{R}^{K \times K}$ and $c \in \mathbb{R}^D$. In \Cref{sec:app-prob-appl}, we propose some application of attention-based models for non-text data with sequential information.

% \subsection{Attention-based model for tabular data}


\subsection{Tabular extension of bidirectional attention}
\label{sec:att-tab}

To extend bidirectional attention to tabular data, we consider a
classification problem with categorical features.  For simplicity, we
assume the response variable $Y_i$ is ordinal with $C$ classes.
Further assume each of the $K$ features in $X_i$ is also
ordinal with $C$ classes. The training data contains pairs of features
and responses $(X_i, Y_i)$. The goal is to predict the response for
some test $X$.

Extending bidirectional attention to this tabular setting requires
that we handle tabular features with bidirectional attention. To this
end, we leverage the observations in \Cref{prop:mlm-moe-equiv} that
bidirectional attention can be viewed as prediction with \gls{MoE},
where the token in each position of the sentence (endowed with
positional encodings) serves as an expert. This
\gls{MoE} perspective of bidirectional attention immediately suggests
that we consider each tabular feature as an expert in tabular data, since
each position in a sentence can be viewed as a tabular feature for
predicting masked tokens. One can thus consider using tabular feature
encodings in the place of positional encodings for analyzing tabular
data with bidirectional attention.

To operationalize this tabular extension of bidirectional attention,
we first introduce ``word'' embeddings $w_1, \cdots, w_C \in
\mathbb{R}^d$ for each class and $w_0$ for the \texttt{[MASK]} token.
We then introduce ``position'' encodings $p_1, \cdots, p_{K+1} \in
\mathbb{R}^d$, one for each feature. Finally, we consider the
concatenation of features and response $(X_i, Y_i)$ of each data
point as a sentence in bidirectional attention. These mappings enable
us to learn the embeddings and encodings using the \gls{MLM}
objective. At test time, given a test $X$, one can use the
bidirectional attention model to predict the most probable class for
the input $(X, \texttt{[MASK]})$.

We note that this use of \gls{MLM} objective for tabular data
implicitly models the joint distribution $p(X,Y)$, as opposed to the
conditional distribution $p(Y|X)$ that standard supervised algorithms
commonly model. As a consequence, tabular extensions of bidirectional
attention can potentially achieve better \gls{OOD} generalization, as we demonstrate
empirically next.

Finally, this tabular extension of bidirectional attention can be
applied beyond supervised classification. It readily extends to
unsupervised settings (if we ignore the $Y_i$'s) and semi-supervised
settings (if we consider both the labeled and unlabeled data and set
the $Y_i$'s for the unlabeled data to be \texttt{[MASK]}). This
approach is also applicable to handling multiple datasets with only
partially overlapping features: the learned feature encodings will
allow us to bring all features into the same embedding space. These
learned encodings can also reveal the relationships between different
tabular features across different data sets.



\subsection{Empirical studies of tabular bidirectional attention}
\label{sec:att-tab-emp}

In this section, we empirically study the tabular extension of
bidirectional attention using simulated and real datasets. Across
empirical studies, we find that this approach achieves better
\gls{OOD} generalization for tabular data than both
existing tabular data algorithms and existing tabular extensions of
attention modules.

\subsubsection{Simulated data}
\label{sec:sim-data}
We begin by evaluating tabular bidirectional attention on simulated data. We
focus on the common \gls{OOD} generalization setting of covariate
shift; it refers to prediction tasks where $p(X_{\textrm{train}}) \neq
p(X_{\textrm{test}})$ and $p(Y_{\textrm{train}} | X_{\textrm{train}})
= p(Y_{\textrm{test}} | X_{\textrm{test}})$.

\parhead{Data generation.} We describe the key components of the data
generation process; we refer the readers to
\Cref{sec:app-tab-data-exp} for full details. We set the number of
features $K$ to be 5, the number of classes $C$ to be 10, and the
training and test set size to be 2,000 each. Twenty data sets are
generated for each combination of hyperparameters.

\parhead{Competing methods and evaluation metrics.} We fit the proposed tabular
extension of bidirectional attention model to each training set,
together with a few competing methods, namely logistic regression
(LR), random forests (RF), gradient boosting (GB) and multilayer
perceptron (MLP). We evaluate all methods by their test accuracy and
mean squared error (MSE). See \Cref{app:hyp-tun} for implementation
details.


\parhead{Results.} \Cref{fig:sim-res-acc} summarizes the test accuracy
and mean squared error of all methods. We find that the proposed
tabular extension of bidirectional attention outperforms or is
competitive with all competing methods. Moreover, its
performance gain is more apparent when $\textrm{corr} = 0.9$ (very
correlated training features) as compared to when $\textrm{corr} =
0.1$; the former corresponds to a more challenging case of covariate
shift.

% \input{table/attention-ood-acc}
% \input{table/attention-ood-mse}




\begin{table*}[t]
\centering
\caption{The proposed tabular extension of bidirectional attention
(ATN) achieves accuracy and MSE that are better than or competitive with
those of competing methods across all parameter settings. The parameter tuples indicate
different choices of $(n_c, \textrm{noise}, \textrm{corr})$.}
\label{fig:sim-res-acc}
{\footnotesize
\begin{tabular}{cccccc}
\hline
Param. $\backslash$ \textbf{Acc.} & LR & RF & GB & MLP & ATN \\ \hline
$(1, 0, 0.1)$ & 0.388 & 0.409 & \textbf{0.413} & 0.323 & 0.404 \\ 
$(1, 0, 0.9)$ & 0.313 & 0.298 & 0.350 & 0.237 & \textbf{0.389} \\ \hline
$(1, 0.5, 0.1)$ & 0.345 & 0.361 & \textbf{0.366} & 0.292 & 0.359 \\ 
$(1, 0.5, 0.9)$ & 0.270 & 0.253 & 0.299  & 0.202 & \textbf{0.306} \\ \hline
$(1, 1.5, 0.1)$ & 0.250 & 0.243 & \textbf{0.253} & 0.204 & 0.252 \\ 
$(1, 1.5, 0.9)$ & 0.169 & 0.158 & \textbf{0.172} & 0.142 & 0.170 \\ \hline
$(5, 0, 0.1)$ & 0.250 & 0.207 & 0.244 & 0.306 & \textbf{0.419} \\ 
$(5, 0, 0.9)$ & 0.162 & 0.150 & 0.156 & 0.169 & \textbf{0.392} \\ \hline
$(5, 0.5, 0.1)$ & 0.227 & 0.173 & 0.214 & 0.252 & \textbf{0.318} \\ 
$(5, 0.5, 0.9)$ & 0.154 & 0.133 & 0.153 & 0.151 & \textbf{0.269} \\ \hline
$(5, 1.5, 0.1)$ & 0.167 & 0.099 & 0.157 & 0.165 & \textbf{0.171} \\ 
$(5, 1.5, 0.9)$ & 0.125 & 0.108 & 0.114 & 0.118 & \textbf{0.133} \\ \hline
\end{tabular}
\qquad
\begin{tabular}{cccccc}
\hline
Param. $\backslash$ \textbf{MSE} & LR & RF & GB & MLP & ATN \\ \hline
$(1, 0, 0.1)$ & 3.015 & \textbf{2.694} & 2.730 & 4.059 & 2.941 \\ 
$(1, 0, 0.9)$ & 5.163 & 9.331 & 4.855 & 7.911 & \textbf{3.078} \\ \hline
$(1, 0.5, 0.1)$ & 3.416 & 3.201 & \textbf{3.123} & 4.704 & 3.281 \\ 
$(1, 0.5, 0.9)$ & 5.955 & 10.106 & 6.070 & 8.123 & \textbf{4.465} \\ \hline
$(1, 1.5, 0.1)$ & 5.725 & 5.685 & \textbf{5.415} & 7.199 & 5.594 \\ 
$(1, 1.5, 0.9)$ & 8.942 & 12.340 & 9.837 & 9.874 & \textbf{7.339} \\ \hline
$(5, 0, 0.1)$ & 5.333 & 8.498 & 5.967 & 2.814 & \textbf{1.521} \\ 
$(5, 0, 0.9)$ & 5.674 & 10.101 & 8.858 & 7.842 & \textbf{1.633} \\ \hline
$(5, 0.5, 0.1)$ & 6.021 & 10.236 & 6.844 & 4.056 & \textbf{2.605} \\ 
$(5, 0.5, 0.9)$ & 6.118 & 10.427 & 8.283 & 7.884 & \textbf{2.355} \\ \hline
$(5, 1.5, 0.1)$ & 9.159 & 16.154 & 9.538 & \textbf{8.313} & 8.316 \\ 
$(5, 1.5, 0.9)$ & 8.410 & 10.409 & 10.110 & 9.966 & \textbf{6.501} \\ \hline
\end{tabular}}
\end{table*}


\subsubsection{UCI's auto-mpg data}

We next study the tabular extension of bidirectional attention on a
real dataset, namely the \texttt{auto-mpg} data from the UCI Machine Learning Repository.
This data set contains the following information from 398 different
car models: \textit{mpg}, \textit{cylinders}, \textit{displacement},
\textit{horsepower}, \textit{weight}, \textit{acceleration},
\textit{model year}, \textit{origin}, and \textit{car name}.

\parhead{Data processing.} To simulate covariate shift, we follow the
approach of \citet{storkey2006mixture}: we assign cars from origin 1
to the training set, and origins 2 and 3 to the test set. In addition,
we only consider cars with 4, 6 or 8 cylinders and remove data points
with missing values. Lastly, similar to the synthetic data
experiments, we convert each column into three quantile-based
categories. The final data set has 385 data points, where 245 belong
to the training set and 140 belong to the test set.

\parhead{Competing methods and evaluation metrics.} We use the same
competing methods and evaluation metrics as in \Cref{sec:sim-data}.
Additionally, we compare with other existing tabular extensions of
attention modules, including CategoryEmbedding (CE)
\citep{joseph2021pytorch}, FTTransformer (FT)
\citep{gorishniy2021revisiting}, TabTransformer (TT)
\citep{huang2020tab}, AutoInt (AI) \citep{song2019autoint}, and TabNet
(TN) \citep{arik2020tabnet}.\footnote{We use
\texttt{pytorch\_tabular}'s \citep{joseph2021pytorch} implementation
with the default parameters. The batch size and the number of epochs are set to be
128 and 200, respectively.}

\parhead{Results.} \Cref{fig:auto-mpg-res} summarizes the test
accuracy and mean squared error of all methods. We find that the
proposed tabular extension of bidirectional attention outperforms all
competing methods. This performance gain is likely due to its focus on
modeling the joint distribution of the covariates and response
variable; it is in contrast to the practice of modeling only the
conditional distribution of the response variable given the covariates
in supervised learning.


\begin{table*}[t]
\centering
\caption{The proposed tabular extension of attention (ATN) achieves
superior performance as compared to all baselines. (Lower MSE and
higher accuracy are better.)}
\label{fig:auto-mpg-res}
% \begin{tabular}{ccccccc}
% \hline
%  & LR & RF & GB & MLP & ATN \\ \hline
% Accuracy & 0.657 & 0.721 & 0.657 & 0.700 & \textbf{0.793} \\ \hline
% MSE & 0.343 & 0.279 & 0.343 & 0.300 & \textbf{0.207}\\ \hline
% \end{tabular}
% \qquad
% \begin{tabular}{ccccccc}
% \hline
%  & CE & FT & TT & AI & TN \\ \hline
% Accuracy & 0.764 & 0.707 & 0.707 & 0.364 & 0.600 \\ \hline
% MSE & 0.236 & 0.293 & 0.293 & 0.636 & 0.486 \\ \hline
% \end{tabular}

\begin{tabular}{ccccccccccc}
\hline
 & LR & RF & GB & MLP  & CE & FT & TT & AI & TN & ATN (ours)\\ \hline
Accuracy & 0.657 & 0.721 & 0.657 & 0.700  & 0.764 & 0.707 & 0.707 & 0.364 & 0.600 & \textbf{0.793}\\ \hline
MSE & 0.343 & 0.279 & 0.343 & 0.300  & 0.236 & 0.293 & 0.293 & 0.636 & 0.486 & \textbf{0.207}\\ \hline
\end{tabular}

\end{table*}


\section{Linear word analogies in attention-based embeddings}
\label{sec:lin-rel-anal}

In this section, we explore the presence of linear word analogies in
the embeddings of bidirectional attention and its non-attention
predecessors. En route, we leverage the close connections between
\gls{CBOW} and bidirectional attention in \Cref{prop:mlm-moe-equiv} to
facilitate the theoretical analysis. This exploration is motivated by
a curious empirical observation: While bidirectional attention (e.g.
BERT) often significantly outperforms its non-attention predecessors
in natural language processing benchmarks, it does not seem to
outperform its predecessors in word analogy tasks. In particular, it
can sometimes perform worse in word analogy tasks than classical word
embedding algorithms like word2vec~\citep{mikolov2013distributed} and
GloVe~\citep{pennington2014glove}.

Motivated by these empirical observations, we characterize the
conditions under which bidirectional attention and \gls{CBOW} can exhibit linear
word analogies in their embeddings. We find that bidirectional
attention requires much stronger conditions to exhibit linear word
analogies than its non-attention predecessors. These results partially
explain the limited empirical gain in using bidirectional attention
for word analogy tasks.



% A summary of notations used in this section can be found in \Cref{sec:app-sum-not}.

\subsection{A curious empirical study: Do attention-based token embeddings
exhibit linear word analogies?}
\label{sec:attn-lin-emp}

We begin with a curious empirical study about the presence of linear
word analogies in attention-based and non-attention-based token
embeddings. Linear structure in neural word embeddings such as
word2vec \citep{mikolov2013distributed} and GloVe
\citep{pennington2014glove} is a well-known empirical phenomenon.
However, most studies focused on embeddings trained via
\gls{SGNS}~\citep{ethayarajh2018towards,allen2019analogies}. This
phenomenon is less studied in more recent language modeling
approaches, e.g. \gls{CBOW} and bidirectional attention, with few
exceptions~\citep{ushio2021bert}.

To this end, we first perform an empirical study about whether linear
relationships are observed in embeddings from word2vec trained with
the \gls{CBOW} objective and BERT \citep{devlin2018bert}, a large language
model based on bidirectional attention. Following existing
studies, we use the analogy identification task as a proxy for
identifying the presence of linear relationships, using the analogy
data set first introduced in \citet{pennington2014glove}. We refer the
readers to \Cref{sec:analogy-details-expm} for dataset and
implementation details.


\parhead{Evaluation metrics.} For each model, we are interested in (1) the overall and per-category accuracies, where accuracy is defined as the proportion of correct answers; and (2) the overall and per-category average cosine similarity between $x_b + x_c - x_a$ and the correct answer. We note that (2) is a better metric than (1) due to the difference in vocabulary sizes across models.

\parhead{Results.} The accuracy and average cosine similarity for each
model are displayed in \Cref{table:acc}. We observe that all three
models generally result in word embeddings that exhibit certain linear
word analogies. However, the bidirectional attention model BERT can
often perform worse than its non-attention predecessor GloVe in this
task, despite it being a much more powerful language model in common
natural language benchmarks. 

What factors have limited BERT's (and bidirectional attention's)
ability to exhibit linear word analogies? What about \gls{CBOW} and
GloVe? Below we study these questions theoretically, leveraging the
close connection between \gls{CBOW} and bidirectional attention in
\Cref{prop:mlm-moe-equiv}. In particular, we characterize the
conditions under which \gls{CBOW} and bidirectional attention may
exhibit linear word analogies respectively. We find that the
conditions required by bidirectional attention are much stronger, which
partially explains the empirical observations above.




% \begin{table}[h]

% \centering
% \caption{BART achieves the highest accuracy overall and on each category, where accuracy is defined as the proportion of correct answers.}
% \label{table:acc}
% \begin{tabular}{lllll}
% \hline
%           & BERT  & GloVe  & CBOW & BART \\ \hline
% Semantic  & 0.641  & 0.759 & 0.234 & \textbf{0.846}     \\ \hline
% Syntactic & 0.754 & 0.692 & 0.667  & \textbf{0.825}   \\ \hline
% Overall   & 0.727 & 0.708 & 0.563 & \textbf{0.830}     \\ \hline
% \end{tabular}
% \end{table}

% \begin{table}[h]

% \centering
% \caption{BERT achieves the highest average cosine similarity on syntactic analogies, while GloVe achieves the highest average cosine similarity on semantic analogies}
% \label{table:cos}
% \begin{tabular}{lllll}
% \hline
%           & BERT  & GloVe  & CBOW & BART \\ \hline
% Semantic  & 0.500  & \textbf{0.600} & 0.504 & 0.525     \\ \hline
% Syntactic & \textbf{0.610} & 0.610 & 0.582  & 0.596   \\ \hline
% Overall   & 0.584 & \textbf{0.607} & 0.564 & 0.579    \\ \hline
% \end{tabular}
% \end{table}



\begin{table*}[t]
\centering
\caption{Classical word embedding methods can achieve similar or
higher performance than attention-based models in word analogy tasks:
GloVe achieves average cosine similarity higher than or equal to that of BERT
on both syntactic and semantic analogies; GloVe also outperforms BERT
in accuracy for semantic analogies. (Higher is better.)}
\label{table:acc}
\begin{tabular}{lllll}
\hline
\textbf{Accuracy}  & BERT  & GloVe & CBOW \\ \hline
Semantic  & 0.641 & 0.759 & 0.234 \\ \hline
Syntactic & 0.754 & 0.692 & 0.667 \\ \hline
Overall   & 0.727 & 0.708 & 0.563 \\ \hline
\end{tabular}
\qquad \qquad 
% \label{table:cos}
\begin{tabular}{lllll}
\hline
\textbf{Cosine similarity}  & BERT  & GloVe  & CBOW \\ \hline
Semantic  & 0.500 & 0.600 & 0.504    \\ \hline
Syntactic & 0.610 & 0.610 & 0.582     \\ \hline
Overall   & 0.584 & 0.607 & 0.564     \\ \hline
\end{tabular}
\end{table*}

% \begin{table}[t]

% \centering
% \caption{GloVe achieve higher or the same average cosine similarity
% than BERT on both syntactic and semantic analogies . (Higher is
% better.)}
% \label{table:cos}
% \begin{tabular}{lllll}
% \hline
%           & BERT  & GloVe  & CBOW \\ \hline
% Semantic  & 0.500 & 0.600 & 0.504    \\ \hline
% Syntactic & 0.610 & 0.610 & 0.582     \\ \hline
% Overall   & 0.584 & 0.607 & 0.564     \\ \hline
% \end{tabular}
% \end{table}

\subsection{Linear word analogies in CBOW and bidirectional attention
embeddings}
\label{sec:lin-rel-cbow}

We begin by theoretically characterizing the conditions under which
\gls{CBOW} embeddings can exhibit linear word analogies. Starting with
\citeauthor{allen2019analogies}'s [\citeyear{allen2019analogies}]
argument for \gls{SGNS}, we extend the argument to both \gls{CBOW} and
attention-based token embeddings, thanks to the equivalence we
established in \Cref{prop:mlm-moe-equiv}.


% CBOW embeddings have been empirically shown to also exhibit linear structures, albeit not as strong as compared to \gls{SGNS} or GloVe \citep{pennington2014glove}. This finding is also supported by the experiment results in \Cref{sec:attn-lin-emp}. 

To perform this theoretical analysis, we follow existing analyses about \gls{SGNS}: \citet{levy2014neural} showed that for a sufficiently large embedding dimension, embeddings from \gls{SGNS} satisfy
% \begin{equation*}
% \begin{split}
    $w_i^\top c_j = \log \left( \frac{p(w_i, c_j)}{p(w_i) p(c_j)}\right) - \log k = \textrm{PMI}(w_i, c_j) - \log k,$
% \end{split}
% \end{equation*} 
where $k$ is the number of negative samples for each positive sample;
$W^{LOV}, C \in \mathbb{R}^{|V| \times p}$ are the center and context
embedding matrices, respectively. For each $i \in [|V|]$, $w_i^\top$
($c_i^\top$) is the $i$-th row of $W^{LOV}$ ($C$), which represents
the center (context) embedding of word $i$.

Using this result, \citet{allen2019analogies}
considered embeddings which factorize the unshifted PMI matrix, namely
$w_i^\top c_j = \textrm{PMI}(w_i, c_j)$, compactly written as $W^\top
C = \textrm{PMI}$. Through the ideas of \textit{paraphrases} and
\textit{word transformations}, they explained why linear relationships
exist for analogies on \gls{SGNS} word embeddings.

Here we perform similar analyses for \gls{CBOW} and bidirectional
attention; the goal is to characterize the conditions under which
\gls{CBOW} and bidirectional attention can exhibit linear word
analogies respectively. Below we sketch the main results we obtain,
leaving full details to \Cref{sec:detailed-analogy}.


\parhead{Linear word analogies in \gls{CBOW} embeddings.} We first
characterize the inner product of center and contextual embeddings of
\gls{CBOW}.
\begin{prop}
\label{prop:cbow-sim}
Embeddings from fitting \gls{CBOW} without negative sampling must satisfy
% \begin{equation*}
    $w_i^\top c_j \approx \log \left( \frac{p(w_i, c_j)}{p(c_j)} \right) + \log |V|.$
% \end{equation*}
\end{prop}
This result suggests that \gls{CBOW} approximately factorizes $M$, a
$|V| \times |V|$ matrix such that $M_{i,j} = \log \left(\frac{p(w_i,
c_j)}{p(c_j)} \right) + \log |V|.$ Following this result, we next
argue that the \gls{CBOW} embeddings approximately form a linear
relationship, up to some error terms.
\begin{prop}
\label{prop:anal-w2v}
    Given any $w_a, w_{a^*}, w_b, w_{b^*} \in \mathcal{E}$, we have
    \begin{align*}
            w_{b^*}
            &= w_{a^*} - w_a + w_b + C^{\dagger}(\rho^{\mathcal{W}, \mathcal{W}_*} + \Delta^{\mathcal{W}, \mathcal{W}_*}+ \delta^{\mathcal{W}, \mathcal{W}_*}) \\
            &= w_{a^*} - w_a + w_b + C^{\dagger}(\xi^{\mathcal{W}, \mathcal{W}_*} + \Delta^{\mathcal{W}, \mathcal{W}_*}),
    \end{align*}
    where $\mathcal{E}$ is the set of all words in the vocabulary,
    $\mathcal{W} = \{ w_b, w_{a^*}\}$, $\mathcal{W}_* = \{ w_{b^*}, w_a \}$, and
    $\Delta^{\mathcal{W}, \mathcal{W}_*}=\sigma^{\mathcal{W}} - \sigma^{\mathcal{W}_*}$. The quantities
    $\rho^{\mathcal{W},
    \mathcal{W}_*}, \Delta^{\mathcal{W}, \mathcal{W}_*},
    \delta^{\mathcal{W}, \mathcal{W}_*}, \xi^{\mathcal{W},
    \mathcal{W}_*}$ are all statistics that characterize the
    relationships between the two word sets $\mathcal{W},
    \mathcal{W}_*$. We refer the reader to \Cref{sec:detailed-analogy}
    for their precise definitions and complete details of the results.
\end{prop}

\Cref{prop:anal-w2v} reveals that we have linear word analogies
$w_{b^*} \approx w_{a^*} - w_a + w_b$ when $\mathcal{W}$ paraphrases
$\mathcal{W}_*$ in the sense of \citet{allen2019analogies} (i.e.
$\rho^{\mathcal{W}, \mathcal{W}_*}\approx 0$), and
$\sigma^{\mathcal{W}}$, $\sigma^{\mathcal{W}_*}$ and
$\delta^{\mathcal{W}, \mathcal{W}_*}$ are small. The latter conditions
hold true only when all $w_i \in \mathcal{W}$ ($w_i \in
\mathcal{W}_*$) are approximately conditionally independent given
$c_j$, and $p(\mathcal{W}) \approx p(\mathcal{W}_*)$. If we consider
alternative definitions of paraphrase---which we detail in
\Cref{sec:detailed-analogy}---then the linear analogy error may only
depend on the approximate conditional independence of $w_i$'s
given~$c_j$.

Finally, we characterize the conditions under which, if token embeddings
of \gls{CBOW} exhibit linear word analogies, then its contextual
embedding will also exhibit this structure.
\begin{prop}
\label{prop:anal-ctx}
    Let $\mathcal{W} = \{r,s\}$ and $\mathcal{W}_* = \{t,u\}$. Assume
    $p(\mathcal{W}) \approx p(\mathcal{W}_*)$ and $w_i \in
    \mathcal{W}$ ($w_i \in \mathcal{W}_*$) are approximately
    marginally independent. Further, assume that $W$ has full row
    rank. If $w_r + w_s \approx w_t + w_u$, then $c_r + c_s \approx
    c_t + c_u$.
\end{prop}


\parhead{Linear word analogies for bidirectional attention.} We next
extend these \gls{CBOW} arguments to bidirectional attention,
leveraging the close connection established in
\Cref{prop:mlm-moe-equiv}. We will show that the same linear word
analogies may emerge in bidirectional attention, but under much
stronger assumptions.


\begin{prop}
\label{prop:sim-attn}
    Token embeddings from bidirectional attention must satisfy
    \begin{equation*}
    \label{eq:sim-attn}
    \begin{split}
        w_i^\top c_j 
         &\approx \frac{|V| \sum_{(i,j)} \gamma_{j}^i - \left(\sum_{(1,j)} \gamma_{j}^1 + \cdots + \sum_{(|V|,j)} \gamma_{j}^{|V|} \right)}{S \left(\sum_{(1,j)} (\gamma_{j}^1)^2 + \cdots + \sum_{(|V|,j)} (\gamma_{j}^{|V|})^2 \right)},
    \end{split}
    \end{equation*}
where for a center-context pair $(d,j)$ in the masked sentence $(a_1, \dots, a_S)$, we define $\gamma_j^d = \tau_j / \sum_{s=1}^S \tau_{a_s}$, and $\tau_j = \exp\left( c_j^\top W^{KQ} c_{|V|+1}/\sqrt{d_w} \right)$.
\end{prop}
\Cref{prop:sim-attn} shows that bidirectional attention approximately
factorizes a $|V| \times |V|$ matrix whose $(i,j)$-th entry is given
by the equation above. Unlike in \gls{CBOW}, the token embedding for
each word $i$ is $c_i$ (the \textit{context} embedding), and not $w_i$
(the \textit{center} embedding). In the case where $\tau_j$ is
approximately the same for every $j \in [|V|+1]$, the problem
approximately reduces to a vanilla \gls{CBOW}: we always have
$\gamma_j^d \approx 1/S$, whence \Cref{prop:sim-attn} yields $w_i^\top
c_j \approx \frac{p(w_i, c_j)}{p(c_j)} \cdot |V| - 1 \approx \log
\left( \frac{p(w_i, c_j)}{p(c_j)} \right) + \log |V|$.

Following an argument similar to that of \Cref{prop:anal-w2v}, we argue that the bidirectional attention embedding can also exhibit linear word analogies, up to some error.

\begin{prop}
\label{prop:anal-w2v-attn}
    Given any $w_a, w_{a^*}, w_b, w_{b^*} \in \mathcal{E}$, we have
    \begin{align*}
            w_{b^*} 
            &= w_{a^*} - w_a + w_b + \tilde{C}^{\dagger}(\bar{\rho}^{\mathcal{W}, \mathcal{W}_*} + \overline{\Delta}^{\mathcal{W},
    \mathcal{W}_*} + \bar{\delta}^{\mathcal{W}, \mathcal{W}_*}) \\
            &= w_{a^*} - w_a + w_b + \tilde{C}^{\dagger}(\bar{\xi}^{\mathcal{W}, \mathcal{W}_*} + \overline{\Delta}^{\mathcal{W},
    \mathcal{W}_*}),
    \end{align*}
    where $\overline{\Delta}^{\mathcal{W},
    \mathcal{W}_*}=\overline{\sigma}^{\mathcal{W}}- \overline{\sigma}^{\mathcal{W}_*}$, $\mathcal{W} = \{ w_b, w_{a^*}\}$, and $\mathcal{W}_* = \{ w_{b^*}, w_a \}$. The quantities $\bar{\rho}^{\mathcal{W}, \mathcal{W}_*}$, $\overline{\Delta}^{\mathcal{W},
    \mathcal{W}_*}$, $\bar{\delta}^{\mathcal{W}, \mathcal{W}_*}$, and $\bar{\xi}^{\mathcal{W}, \mathcal{W}_*}$ characterize the relationships between $\mathcal{W},
    \mathcal{W}_*$ based on $\bar{p}(w_i, c_j) \triangleq
    \sum_{(i,j)}\gamma_j^i/E$; see details in \Cref{sec:detailed-analogy}.
\end{prop}

Under additional conditions, similar linear word analogy relationships
may also emerge for the contextual embeddings of bidirectional
attention.

\begin{prop}
\label{prop:anal-ctx-attn}
    Let $\mathcal{W} = \{r,s\}$ and $\mathcal{W}_* = \{t,u\}$. Assume
    $\bar{p}(\mathcal{W}) \approx \bar{p}(\mathcal{W}_*)$ and $w_i \in
    \mathcal{W}$ ($w_i \in \mathcal{W}_*$) are approximately
    marginally independent. Further assume that $W$ has full row rank
    and $\bar{p}(w_i, c_j) \approx \bar{p}(w_j,c_i)$. If $w_r + w_s
    \approx w_t + w_u$, then $\tilde{c}_r + \tilde{c}_s \approx
    \tilde{c}_t + \tilde{c}_u$.
\end{prop}

While we leave the full details of these results to \Cref{sec:detailed-analogy}, \Cref{prop:anal-w2v-attn,prop:anal-ctx-attn} suggest that bidirectional attention requires much stronger conditions to exhibit linear relationships than \gls{CBOW}. Specifically, it requires the quantity $\bar{p}(w_i, c_j) = \sum_{(i,j)}\gamma_j^i/E$ to be approximately symmetric. Even when this condition holds, linear word analogy would only hold for some transformed embeddings $\tilde{c}_i$'s, as opposed to the token embeddings $c_i$'s. Only under an additional assumption that
% \begin{equation*}
   $\zeta_j := \frac{\sum_{(1,j)} (\gamma_{j}^1)^2 + \cdots + \sum_{(|V|,j)} (\gamma_{j}^{|V|})^2 }{\sum_{(1,j)} \gamma_{j}^1 + \cdots + \sum_{(|V|,j)} \gamma_{j}^{|V|} }$
% \end{equation*}
is approximately the same for each $j$ (e.g., when $\tau_j$ is
approximately the same for every $j$) will we approximately have
linear word analogies for the token embeddings $c_i$'s.

Finally, we note that all these results can be easily extended to
incorporate positional encodings by considering each (word, position)
pair as a unit. In these cases, analogies will be drawn between (word,
position) pairs.


\section{Discussion}
\glsresetall

In this paper, we prove that a single-head single-layer bidirectional
attention is equivalent to a \gls{CBOW} model with \gls{MoE} weights,
upon reparameterization. This statistical perspective reveals the
distinct use of \gls{MoE} in bidirectional attention, supporting the
empirical observations that bidirectional attention excels in
capturing heterogeneous patterns. This connection further suggests
immediate extensions of attention to tabular data, leading to improved
\gls{OOD} generalization when compared to existing approaches. It also
allows us to characterize the conditions under which embeddings from
bidirectional attention and \gls{CBOW} exhibit linear word analogies.
These analyses show that bidirectional attention requires much
stronger assumptions than its non-attention predecessors to exhibit
linear word analogies.

One limitation of this work is that the linear word analogy argument
in \Cref{sec:lin-rel-anal} ignores residual connections. In addition,
we only consider bidirectional attention architectures that use linear
layers, as opposed to feed-forward layers used
in~\citet{devlin2018bert}. Beyond addressing these limitations,
exploring the statistical properties of bidirectional attention is an
interesting avenue for future work. It will also be useful to provide
theoretical justifications for the observed robustness of
bidirectional attention to covariate shifts, and to understand the
fundamental differences between static and contextual word embeddings
in their abilities to form linear analogies.


\section*{Acknowledgements} This work
was supported in part by the Office of Naval Research under grant
number N00014-23-1-2590 and the National Science Foundation under
Grant Nos.~2231174 and 2310831. We thank Sasha Rush for suggesting
the name ``bidirectional attention.''
\clearpage
\putbib[attention-uai]
\end{bibunit}

% \appendix

% \onecolumn
% \begin{bibunit}[abbrvnat]
% \input{sec_supp.tex}
% \putbib[attention-uai]
% \end{bibunit}

% References
\bibliography{attention-uai.bib}

\end{document}


% KEEP THIS for LATEX-TOOLS in SublimeText, as it does not recognize bibunit
% IT WILL NOT APPEAR IN THE PDF
\bibliography{attention-uai.bib}