%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{dsfont}

% FONTS
\usepackage[T1]{fontenc}

\usepackage{tgtermes}
\usepackage{amsmath}
% \usepackage[subscriptcorrection,
%             amssymbols,
%             mtpbb,
%             mtpcal,
%             nofontinfo  % suppresses all warnings
%            ]{mtpro2}
\usepackage{scalefnt,letltxmacro}
\LetLtxMacro{\oldtextsc}{\textsc}
\renewcommand{\textsc}[1]{\oldtextsc{\scalefont{1.10}#1}}
\usepackage[scaled=0.92]{PTSans}
\usepackage{inconsolata}
\usepackage{mathbbol}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{thmtools,thm-restate}


% COLOR
% \usepackage[usenames,dvipsnames]{xcolor}
% \definecolor{shadecolor}{gray}{0.9}

% SPACING and TEXT
% \usepackage[final, expansion=alltext]{microtype}
% \usepackage[english]{babel}
% \usepackage[parfill]{parskip}
\usepackage{afterpage}
\usepackage{framed}
\usepackage{nicefrac}

% Redefine the leftbar environment (from the framed package) so that it
% accepts one optional argument: the width assigned to \hsize for the framed
% material (default: the current \hsize).
% The frame is a 3pt vertical rule followed by 10pt of horizontal space.
% NOTE(review): the rule uses the named color `Gray'; the xcolor line with
% the dvipsnames option is commented out in this file, so this assumes the
% class or another package defines that color -- TODO confirm.
\renewenvironment{leftbar}[1][\hsize] {%
  \def\FrameCommand {%
    {\color{Gray}\vrule width 3pt}%
    \hspace{10pt}%
    %\hspace{0pt}\fboxsep=\FrameSep\colorbox{black!10}%
  }%
  \MakeFramed{\hsize#1\advance\hsize-\width\FrameRestore}%
}%
{\endMakeFramed}

% Define a paragraph header function
\DeclareRobustCommand{\parhead}[1]{\textbf{#1}~}

% paragraph helper
\DeclareRobustCommand{\PP}{\textcolor{Plum}{\texttt{\P}}~}
\DeclareRobustCommand{\pp}{\textcolor{Plum}{\texttt{\P}}~}

% COUNTERS
% List labels are typeset in 67% black. \color is a declaration (it takes no
% text argument), so every label is wrapped in its own group to keep the
% color change local to the label.
\renewcommand{\labelenumi}{{\color{black!67}\arabic{enumi}.}}
\renewcommand{\labelenumii}{{\color{black!67}(\alph{enumii})}}
\renewcommand{\labelitemi}{{\color{black!67}\textbullet}}

% FIGURES
\usepackage{graphicx}
\usepackage[labelfont=bf]{caption}
\usepackage[format=hang]{subcaption}

% TABLES
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{longtable}
\usepackage{etoolbox,siunitx}
\robustify\bfseries
\sisetup{detect-weight=true, detect-shape=true, detect-mode=true,
table-format=5.1, table-number-alignment=center}

% BIBLIOGRAPHY
\usepackage{natbib}
\usepackage{bibunits}

% ALGORITHMS
% \usepackage[algoruled]{algorithm2e}
\usepackage{listings}
\usepackage{fancyvrb}
\fvset{fontsize=\normalsize}
\usepackage{algorithm}
\usepackage{algorithmic}

% HYPERREF
% \usepackage[colorlinks, linktoc=all, hidelinks]{hyperref}
% \usepackage[all]{hypcap}
% \hypersetup{citecolor=Violet}
% \hypersetup{linkcolor=black}
% \hypersetup{urlcolor=MidnightBlue}

% CLEVEREF must come after HYPERREF
\usepackage[nameinlink]{cleveref}

% ACRONYMS
\usepackage[acronym,smallcaps,nowarn]{glossaries}
\glsdisablehyper{}
% \makeglossaries

% COLOR DEFINITIONS
\newcommand{\red}[1]{\textcolor{BrickRed}{#1}}
\newcommand{\orange}[1]{\textcolor{BurntOrange}{#1}}
\newcommand{\green}[1]{\textcolor{OliveGreen}{#1}}
\newcommand{\blue}[1]{\textcolor{MidnightBlue}{#1}}
\newcommand{\gray}[1]{\textcolor{black!60}{#1}}

% LISTINGS DEFINITIONS
\usepackage{listings}
\lstdefinestyle{mystyle}{
    commentstyle=\color{OliveGreen},
    numberstyle=\tiny\color{black!60},
    stringstyle=\color{BrickRed},
    basicstyle=\ttfamily\scriptsize,
    breakatwhitespace=false,
    breaklines=true,
    captionpos=b,
    keepspaces=true,
    numbers=none,
    numbersep=5pt,
    showspaces=false,
    showstringspaces=false,
    showtabs=false,
    tabsize=2
}
\lstset{style=mystyle}




% Bold math symbol; \ensuremath makes it usable in both text and math mode.
\DeclareRobustCommand{\mb}[1]{\ensuremath{\mathbf{\boldsymbol{#1}}}}
% \DeclareRobustCommand{\mb}[1]{\mathbold{#1}}

% KL divergence between #1 and #2, typeset as KL(#1 || #2).
% \mathrm (not \textrm) keeps ``KL'' upright even when the surrounding text
% is italic, e.g. inside theorem statements.
\DeclareRobustCommand{\KL}[2]{\ensuremath{\mathrm{KL}\left(#1\;\|\;#2\right)}}

% Operators whose subscripts are set underneath in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Names used by cleveref (\cref / \Cref) for each theorem-like environment.
\crefname{lemma}{lemma}{lemmas}
\Crefname{lemma}{Lemma}{Lemmas}
\crefname{thm}{theorem}{theorems}
\Crefname{thm}{Theorem}{Theorems}
\crefname{prop}{proposition}{propositions}
\Crefname{prop}{Proposition}{Propositions}
\crefname{defn}{definition}{definitions}
\Crefname{defn}{Definition}{Definitions}
\crefname{exmp}{example}{examples}
\Crefname{exmp}{Example}{Examples}
\crefname{assumption}{assumption}{assumptions}
\Crefname{assumption}{Assumption}{Assumptions}
\crefname{cor}{corollary}{corollaries}
\Crefname{cor}{Corollary}{Corollaries}


% Theorem-like environments. thm owns the counter; it is numbered
% consecutively through the document (no reset per section).
\newtheorem{thm}{Theorem}
\newtheorem{defn}[thm]{Definition} % shares the thm counter
\newtheorem{prop}[thm]{Proposition} % shares the thm counter
\newtheorem{exmp}[thm]{Example} % shares the thm counter
\newtheorem{lemma}[thm]{Lemma} % shares the thm counter
\newtheorem{assumption}{Assumption} % own, independent counter
\newtheorem{cor}[thm]{Corollary} % shares the thm counter
% Independence symbol: a \perp overprinted with a second copy shifted right
% by 2mu, set as a relation (\mathrel).
% \mathpalette passes the current math style to the helper as #1, so the
% symbol is built in the style of the surrounding formula.
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
% Helper for \independent: #1 is the math style, #2 the symbol to double.
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}


\renewcommand{\mid}{~\vert~}
\newcommand{\prm}{\:;\:}

\newcommand{\mbw}{\mb{w}}
\newcommand{\mbW}{\mb{W}}

\newcommand{\mbx}{\mb{x}}
\newcommand{\mbX}{\mb{X}}

\newcommand{\mby}{\mb{y}}
\newcommand{\mbY}{\mb{Y}}

\newcommand{\mbz}{\mb{z}}
\newcommand{\mbZ}{\mb{Z}}

\newcommand{\mbI}{\mb{I}}
\newcommand{\mbone}{\mb{1}}

\newcommand{\mbL}{\mb{L}}

\newcommand{\mbtheta}{\mb{\theta}}
\newcommand{\mbTheta}{\mb{\Theta}}
\newcommand{\mbomega}{\mb{\omega}}
\newcommand{\mbOmega}{\mb{\Omega}}
\newcommand{\mbsigma}{\mb{\sigma}}
\newcommand{\mbSigma}{\mb{\Sigma}}
\newcommand{\mbphi}{\mb{\phi}}
\newcommand{\mbPhi}{\mb{\Phi}}

\newcommand{\mbalpha}{\mb{\alpha}}
\newcommand{\mbbeta}{\mb{\beta}}
\newcommand{\mbgamma}{\mb{\gamma}}
\newcommand{\mbeta}{\mb{\eta}}
\newcommand{\mbmu}{\mb{\mu}}
\newcommand{\mbrho}{\mb{\rho}}
\newcommand{\mblambda}{\mb{\lambda}}
\newcommand{\mbzeta}{\mb{\zeta}}

% Upright differential ``d'': the empty \mathop supplies operator spacing on
% the left, and \! removes the gap between it and the d.
\newcommand\dif{\mathop{}\!\mathrm{d}}
% Named operators. \DeclareMathOperator gives an upright name that does not
% inherit the surrounding text shape (as \textrm would inside italic theorem
% bodies) and adds operator spacing.
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\supp}{supp}

\newcommand{\E}{\mathbb{E}}
\newcommand{\V}{\mathbb{V}}
\newcommand{\bbH}{\mathbb{H}}

\newcommand{\bbN}{\mathbb{N}}
\newcommand{\bbZ}{\mathbb{Z}}
\newcommand{\bbR}{\mathbb{R}}
\newcommand{\bbS}{\mathbb{S}}

\newcommand{\cL}{\mathcal{L}}

\newcommand{\cN}{\mathcal{N}}
\newcommand{\cT}{\mathcal{T}}
% Distribution names (Gamma, inverse Gamma). \mathrm keeps them upright
% regardless of the surrounding text shape, unlike \textrm.
\newcommand{\Gam}{\mathrm{Gam}}
\newcommand{\InvGam}{\mathrm{InvGam}}

% \newcommand{\qedsymbol}{\rule{0.7em}{0.7em}}

\newcommand{\g}{\, | \,}
\newcommand{\s}{\, ; \,}




\newacronym{KL}{kl}{Kullback-Leibler}
\newacronym{ELBO}{elbo}{\emph{evidence lower bound}}
\newacronym{POPELBO}{pop-elbo}{\emph{population evidence lower bound}}
\newacronym{PROELBO}{pro-elbo}{\emph{profile evidence lower bound}}

\newacronym{SVI}{svi}{stochastic variational inference}
\newacronym{VI}{vi}{variational inference}

\newacronym{ADVI}{advi}{automatic differentiation variational inference}

\newacronym{GMM}{gmm}{Gaussian mixture model}
\newacronym{LDA}{lda}{latent Dirichlet allocation}

\newacronym{SMC}{smc}{Sequential Monte Carlo}
\newacronym{VB}{vb}{variational Bayes}

\newacronym{TDVI}{tdvi}{transdimensional variational inference}
\newacronym{STDVI}{stdvi}{sequential transdimensional variational inference}
\newacronym{MCMC}{mcmc}{Markov chain Monte Carlo}
\newacronym{RJMCMC}{rjmcmc}{reversible jump Markov chain Monte Carlo}
\newacronym{TDMCMC}{tdmcmc}{transdimensional Markov chain Monte Carlo}

\newacronym{SLDS}{slds}{switching linear dynamical system}
\newacronym{HDP-SLDS}{hdp-slds}{hierarchical Dirichlet process switching linear dynamical system}
\newacronym{MDP}{mdp}{Markov decision process}


\newacronym{MLM}{mlm}{masked language model}
\newacronym{CBOW}{cbow}{continuous bag of words}
\newacronym{MoE}{moe}{mixture-of-experts}
\newacronym{SGNS}{sgns}{skip-gram with negative sampling}
\newacronym{LLM}{llm}{large language model}
\newacronym{OOD}{ood}{out-of-distribution}
\newacronym{AWE}{awe}{attention word embedding}





\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usepackage{pgfplots}
\pgfplotsset{compat=newest}
\pgfplotsset{plot coordinates/math parser=false}
\usepgfplotslibrary{statistics}

\pgfdeclarelayer{edgelayer}
\pgfdeclarelayer{nodelayer}
\pgfsetlayers{edgelayer,nodelayer,main}

\definecolor{hexcolor0xbfbfbf}{rgb}{0.749,0.749,0.749}

% TikZ styles for graphical-model drawings.
% Defined with \tikzset{<name>/.style={...}} rather than the deprecated
% \tikzstyle. Style names and contents are unchanged, except that ardash now
% uses the two keys `thick, dotted' (`thick dotted' is not a TikZ key).
% NOTE(review): White and Black are dvipsnames-style color names; this
% assumes they are defined elsewhere (the xcolor line in this file is
% commented out) -- TODO confirm.
\tikzset{>=latex}
\tikzset{
  % edges
  none/.style={inner sep=0pt},
  line/.style={thick, -, shorten <=1pt, shorten >=1pt},
  arrow/.style={thick, ->, shorten <=1pt, shorten >=1pt},
  ardash/.style={thick, dotted, ->, shorten <=1pt, shorten >=1pt},
  % nodes
  empty/.style={circle,opacity=0.0,text opacity=1.0,minimum width=4pt,minimum height=4pt},
  box/.style={rectangle,fill=White,draw=Black},
  filled/.style={circle,fill=hexcolor0xbfbfbf,draw=Black},
  hollow/.style={circle,fill=White,draw=Black},
  param/.style={rectangle,fill=Black,draw=Black,inner sep=0pt,minimum width=4pt,minimum height=4pt},
  paramhollow/.style={rectangle,fill=White,draw=Black,inner sep=0pt,minimum width=4pt,minimum height=4pt},
}



%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\usepackage{xr}
\externaldocument{wibisono_672}

\setcounter{prop}{8}

\title{Bidirectional Attention as a Mixture of Continuous Word Experts}

% yw / i changed the title a little for grammatical reasons

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors

\author[1]{Kevin Christian Wibisono}
\author[1]{Yixin Wang}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    University of Michigan\\
    Ann Arbor, MI, USA~\thanks{Correspondence to: \{kwib,yixinw\}@umich.edu; Software that
  replicates the empirical studies is at
  \texttt{https://github.com/yixinw-lab/attention-uai}.}
}

% yw / please fix the capitalization in the bibs. (e.g. bert needs to
% be capitalized; first word after colon in the title needs to be
% capitalized; the journal name should be properly capitalized etc
% etc) and go through all the submission checklist in the github repo
% https://github.com/yixinw-lab/yixinw-lab/blob/main/checklist-before-submitting-papers.md
  
\begin{document}
% \maketitle



\appendix

\onecolumn
\begin{bibunit}[abbrvnat]

\begin{center}
\textbf{\Large{Supplementary Material: Bidirectional Attention as a Mixture of Continuous Word Experts}}
\end{center}

% % The standard author block has changed for UAI 2023 to provide
% % more space for long author lists and allow for complex affiliations
% %
% % All author information is authomatically removed by the class for the
% % anonymous submission version of your paper, so you can already add your
% % information below.
% %
% % Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
% \author[1]{Harry~Q.~Bovik}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
% \affil[1]{%
%     Computer Science Dept.\\
%     Cranberry University\\
%     Pittsburgh, Pennsylvania, USA
% }
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
%   \begin{document}
  
% \onecolumn %% Turn this off if single column is desired for the supplement
% \maketitle


% \appendix


% \section{A simple attention-based architecture}
% \begin{enumerate}
%     \item For each sentence, mask one word at a time (i.e., replace it with a \texttt{[MASK]} token).
%     \item Let $X \in \mathbb{R}^{S \times p}$ be a matrix consisting of the token embeddings of each word in the masked sentence, and $P \in \mathbb{R}^{S \times p}$ be a matrix of position encodings. There is one positional embedding for each position, regardless of the word in that position. These position encodings make this model not order-invariant. Let $X' := X + P \in \mathbb{R}^{S \times p}$. 
%     \item Introduce attention weight matrices $W^{V} \in \mathbb{R}^{d \times p}$, $W^{Q} \in \mathbb{R}^{d_w \times p}$ and $W^{K} \in \mathbb{R}^{d_w \times p}$.
%     \item Let $X^{\textrm{attn}} := \textrm{softmax}\left( \frac{X' (W^{Q})^\top W^{K} (X')^\top}{\sqrt{d_w}} \right) X' (W^{V})^\top \in \mathbb{R}^{S \times d}$, where the softmax is taken row-wise.
%     \item Let $W^O \in \mathbb{R}^{d \times p}$ and write $Z := X^{\textrm{attn}} W^O \in \mathbb{R}^{S \times p}$.
%     \item Introduce a residual connection $Z' := X' + Z \in \mathbb{R}^{S \times p}$.
%     \item For each position $i \in [S]$, apply a linear layer with zero bias $LIN_1(Z_i') := W^* Z_i' \in \mathbb{R}^{p}$, where $W^* \in \mathbb{R}^{p \times p}$.
%     \item Introduce another residual connection and let the output be $Z^{\textrm{last}} := Z' + LIN_1(Z') \in \mathbb{R}^{S \times p}$.
%     \item For each position $i \in [S]$, apply a linear layer with zero bias $LIN_2(Z^{\textrm{last}}_i) := W^{\textrm{last}} Z^{\textrm{last}}_i \in \mathbb{R}^{|V|}$, where $W^{\textrm{last}} \in \mathbb{R}^{|V| \times p}$
%     \item Perform the softmax operation. Calculate the sum of the cross-entropy loss for each of the masked words in step 1 to obtain the total loss for this particular sentence. 
% \end{enumerate}

\section{Summary of notations}
\label{sec:app-sum-not}
Below is a summary of commonly-used notations in \Cref{sec:lin-rel-anal}. 

\begin{table}[H]
\centering
\begin{tabular}{c|l}
Notation & Explanation \\[0.5mm] \hline
$|V|$ & Vocabulary size \\[0.5mm] 
$S$ & Sentence length \\[0.5mm] 
$p$ & Embedding dimension \\[0.5mm] 
$W^{LOV}$ & Center embedding matrix \\[0.5mm]
$C$ & Token (context) embedding matrix \\[0.5mm] 
$w_i^\top$ & $i$-th row of $W^{LOV}$ \\[0.5mm]
$c_i^\top$ & $i$-th row of $C$ \\[0.5mm]
$P$ & Position encoding matrix \\[0.5mm] 
$\overline{X}$ & One-hot encoding matrix of the masked sentence \\[0.5mm]
$\overline{y}$ & One-hot encoding of the target word \\[0.5mm]
$m$ & Position of the masked word \\[0.5mm] 
$b$ & The masked word \\[0.5mm] 
$e_j$ & A zero vector of length $S$ with 1 on the $j$-th entry \\[0.5mm] 
$f_j(\cdot)$ & The output generated by expert $j$ \\[0.5mm] 
$\pi_j(\cdot)$ & The contribution of expert $j$ \\[0.5mm] 
$a_s$ & The word on the $s$-th position of the masked sentence \\[0.5mm] 
\end{tabular}
\end{table}


\section{A sketch of the attention-based architecture}
\label{sec:app-simple-attn}
\begin{enumerate}
    \item Let $X = \overline{X} C \in \mathbb{R}^{S \times p}$ be a matrix consisting of the token embeddings of each word in the masked sentence, and $X' = X + P \in \mathbb{R}^{S \times p}$. 
    \item Introduce attention weight matrices $W^{V} \in \mathbb{R}^{d \times p}$, $W^{Q} \in \mathbb{R}^{d_w \times p}$ and $W^{K} \in \mathbb{R}^{d_w \times p}$. Let $X^{\textrm{attn}} = \textrm{softmax}\left( \frac{X' (W^{Q})^\top W^{K} (X')^\top}{\sqrt{d_w}} \right) X' (W^{V})^\top \in \mathbb{R}^{S \times d}$, where the softmax is taken row-wise.
    \item Let $W^O \in \mathbb{R}^{d \times p}$, and write $Z = X^{\textrm{attn}} W^O \in \mathbb{R}^{S \times p}$.
    \item Introduce a residual connection, and write $Z' := X' + Z \in \mathbb{R}^{S \times p}$.
    \item For each position $i \in [S]$, apply a linear layer $LIN_1(Z_i') = W' Z_i' \in \mathbb{R}^{p}$, where $W' \in \mathbb{R}^{p \times p}$.
    \item Introduce another residual connection, and write $Z'' = Z' + LIN_1(Z') \in \mathbb{R}^{S \times p}$.
    \item For each position $i \in [S]$, apply a linear layer $LIN_2(Z''_i) := W'' Z''_i \in \mathbb{R}^{|V|}$, where $W'' \in \mathbb{R}^{|V| \times p}$.
    \item Perform the softmax operation and calculate the cross-entropy loss corresponding to predicting the masked word in the sentence.
\end{enumerate}


\section{Proof of Lemma \ref{prop:mlm-obj}}

\label{sec:app-proof-mlm-objective}

% \Cref{prop:mlm-obj}.  \textit{Consider the attention architecture in \Cref{sec:app-simple-attn} and an input-output pair $(\overline{X}, \overline{y})$. Let $m \in [S]$ and $b \in [|V|]$ denote the masked position and masked word, respectively. The MLM objective for this instance is given by 
% \begin{equation*}
%         -\sum_{j=1}^S \frac{\theta(j,m)}{\sum_{j=1}^S \theta(j,m)} \chi(j,m,b) + \log\left( \sum_{k=1}^{|V|} \exp\left( \sum_{j=1}^S \frac{\theta(j,m)}{\sum_{j=1}^S \theta(j,m)} \chi(j,m,k) \right)\right),
% \end{equation*}
% where $g \in \mathbb{R}^{|V|}$, $D \in \mathbb{R}^{|V| \times S}$, $W^{LOV} \in \mathbb{R}^{|V| \times p}$, $W^{KQ} \in \mathbb{R}^{p \times p}$, $e_j \in \{0,1\}^{S}$ denotes a zero vector with 1 on the $j$-th entry, $\theta(a,d) = \exp\left( \frac{e_a^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_d)}{\sqrt{d_w}} \right)$,
% and
% $\chi(a,d,f) = \left( W^{LOV} (\overline{X} C+P)^\top e_a + g + D e_d \right)_f$.}

\textit{Proof.} Recall that $m \in [S]$ and $b \in [|V|]$ represent the masked position and masked word, respectively. It is easy to see that $X^\top e_m = c_{|V| + 1}$, where $e_m \in \{0,1\}^{S}$ is a zero vector with 1 on the $m$-th entry. Note that steps 1 to 4 of \Cref{sec:app-simple-attn} give us
\begin{equation*}
    Z' = X + P + \textrm{softmax} \left( \frac{(X+P)(W^Q)^\top W^K (X+P)^\top}{\sqrt{d_w}} \right) (X+P) (W^V)^\top W^O \in \mathbb{R}^{S \times p}.
\end{equation*}
This is followed by steps 5 and 6, which yield $Z'' = Z' + LIN_1(Z')$ where the $i$-th row of $Z''$ is given by $(Z_i'')^\top$, where $Z''_i = Z_i' + W' Z_i'$ for some $W' \in \mathbb{R}^{p \times p}$.  Lastly, steps 7 and 8 result in $\alpha_m = \textrm{softmax} (W'' Z_m'')$ for some $W'' \in \mathbb{R}^{|V| \times p}$, from which the loss is simply $-\log(e_b^\top \alpha_m)$, where $e_b \in \{0,1\}^{|V|}$ is a zero vector with 1 on the $b$-th entry. See that 
\begin{equation*}
    \begin{split}
        W'' Z_m'' &= (W'' + W'' W')(Z')^\top e_m \\
        &= W^\ell \left( (X + P)^\top + (W^O)^\top W^V  (X + P)^\top \textrm{softmax} \left( \frac{(X+P)(W^K)^\top W^Q (X+P)^\top}{\sqrt{d_w}} \right)  \right) e_m,
    \end{split}
\end{equation*}
where $W^\ell = W'' + W'' W' \in \mathbb{R}^{|V| \times p}$ and the softmax is taken column-wise. Writing $W^\ell c_{|V| + 1} = g \in \mathbb{R}^{|V|}$, $W^\ell P^\top = D \in \mathbb{R}^{|V| \times S}$, $W^\ell (W^O)^\top W^V = W^{LOV} \in \mathbb{R}^{|V| \times p}$ and $(W^K)^\top W^Q = W^{KQ} \in \mathbb{R}^{p \times p}$, we obtain
\begin{align*}
        W'' Z_m'' &=  g + D e_m + \sum_{j=1}^S \frac{\exp\left( \frac{e_j^\top (X+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{e_j^\top (X+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)} \left( W^{LOV} (X+P)^\top e_j\right) \\
        &= \sum_{j=1}^S \frac{\exp\left( \frac{e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)} \left( W^{LOV} (\overline{X} C+P)^\top e_j + g + D e_m \right),
\end{align*}
and the objective for this particular instance is
\begin{equation*}
    \begin{split}
        &-\sum_{j=1}^S \frac{\exp\left( \frac{e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)} \left( W^{LOV} (\overline{X} C+P)^\top e_j + g + D e_m \right)_b \\
        &+ \log\left( \sum_{k=1}^{|V|} \exp\left( 
        \sum_{j=1}^S \frac{\exp\left( \frac{e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{e_j^\top (\overline{X} C+P) W^{KQ} (c_{|V|+1} + P^\top e_m)}{\sqrt{d_w}} \right)} \left( W^{LOV} (\overline{X} C+P)^\top e_j + g + D e_m \right)_k \right)\right),
    \end{split}
\end{equation*}
completing the proof.

% \section{Empirical study: linear relationships in CBOW and attention-based embeddings}
% \label{app:lin-rel-attn}
% We empirically verify whether linear relationships are observed in embeddings from word2vec trained with the CBOW objective and BERT \citep{devlin2018bert}, a large language model which utilizes self-attention as its main building block. Similar to other studies, we use the analogy identification task as a proxy for identifying the presence of linear relationships. 

% \parhead{Data description.} We use the analogy data set first introduced in \citet{pennington2014glove}. This data set contains 19,544 questions of the form ``\textit{a} is to \textit{b} as \textit{c} is to ?", together with the correct answers. As an example, the first question in the data set is ``Athens is to Greece as Baghdad is to ?" (correct answer: Iraq). Overall, these questions can be categorized into two groups: semantic (about people and places) and syntactic (about word forms such as comparative, superlative and plural). For each question, we look for the word $d \neq a,b,c$ in the vocabulary such that the cosine similarity between $x_d$ and $x_b + x_c - x_a$ is maximized. Here, $x_i$ represents the embedding of word $i$. 

% To include a question, all four words involved must be present in the vocabulary list of every model. Out of 19,544 questions in the data set, 9,522 (49\%) of them satisfy the condition. Analyzing each category separately, we find that the condition is satisfied for 2,278 (26\%) out of 8,869 semantic questions, and 7,244 (68\%) out of 10,675 syntactic questions. 

% \parhead{Competing methods.} We consider four models: (1) BERT base uncased, which is used in the original BERT paper \citep{devlin2018bert}; (2) GloVe trained on Wikipedia \citep{pennington2014glove}; (3) word2vec trained with CBOW \citep{mikolov2013distributed}; and (4) BART base \citep{lewis2019bart}. The embedding dimensions of these models are 768, 300, 300 and 768, respectively, while the vocabulary size are around 30K, 400K, 3M and 30K, respectively. Since all questions in the data set consist of single words (e.g., not \textit{golden\_retriever}). In order to perform a fair comparison among these models, we only consider single words as possible answers to each question. With the same logic, we exclude non-words (e.g., \textit{[unused9]}, \textit{\#\# ?}) from the list of possible answers. 

% \parhead{Evaluation metrics.} For each model, we are interested in (1) the overall and per-category accuracies, where accuracy is defined as the proportion of correct answers; and (2) the overall and per-category average cosine similarity between $x_b + x_c - x_a$ and the correct answer.

% \parhead{Results.} The accuracy and average cosine similarity results are displayed in \Cref{table:acc,table:cos}. We observe that all four methods generally exhibit linear relationships. BART achieves the best performance in terms of accuracy. In terms of average cosine similarity, however, BERT achieves the best performance on syntactic questions while GloVe achieves the best performance on semantic questions.

% \input{table/attention-linear.tex}



\section{Tabular data generation process}
\label{sec:app-tab-data-exp}

We set the number of
features $K$ to be 5, the number of classes $C$ to be 10, and the
training and test set size to be 2,000 each. Twenty data sets are
generated for each combination of hyperparameters: (1) $n_c \in
\{1,5\}$, the number of features which generate $Y$; (2)
$\textrm{noise} \in \{0, 0.5, 1.5\}$, where a larger value indicates a
larger noise in the observed features; and (3) $\textrm{corr} \in
\{0.1, 0.9\}$, where a larger value indicates a larger between-feature
correlation in the training set as compared to the test set.

To simulate covariate shift, we introduce the parameter
$\textrm{corr}$: the correlation of any covariate pair is $\pm
\hspace{0.5mm} \textrm{corr}$ in the training set, and $1 -
\textrm{corr}$ in the test set. We generate the responses as a linear
combination of the covariates. Moreover, we add Gaussian noise to the
covariates, mimicking settings where covariates are measured with
error. Lastly, we bin each covariate and response into $C = 10$
categories based on their quantiles. This results in a $10$-class
classification problem with ordinal covariates and responses.

For a fixed $n_c \in \{1,5\}$, $\textrm{noise} \in \{0, 0.5, 1.5\}$ and $\textrm{corr} \in \{0.1, 0.9\}$, our data generation process can be described as follows.

\begin{enumerate}
    \item Let $\textrm{train\_cov} = \textrm{corr} \cdot J_5 + (1 - \textrm{corr}) \cdot I_5$ and $\textrm{test\_cov} = (1 - \textrm{corr}) \cdot J_5 + \textrm{corr} \cdot I_5$. Here, $J_5$ represents a $5 \times 5$ matrix whose entries are all 1, and $I_5$ represents a $5 \times 5$ identity matrix.
    \item Generate samples $\textrm{train\_x\_true}$ and $\textrm{test\_x\_true}$ from zero-mean multivariate normal distributions with covariance matrices $\textrm{train\_cov}$ and $\textrm{test\_cov}$, respectively. Each sample is of size 2,000.
    \item Introduce positively and negatively correlated covariates in the training samples by multiplying data in the first two features by $-1$. 
    \item Add Gaussian observation noises to the training and test samples. For the $n_c$ features which generate the response, add $0.4 \cdot \textrm{noise} \cdot \mathcal{N}(0,1)$; otherwise, add $0.3 \cdot \textrm{noise} \cdot \mathcal{N}(0,1)$. Let the resulting samples be $\textrm{train\_x}$ and $\textrm{test\_x}$.
    \item Generate the true coefficient for each of the $n_c$ features from $\mathcal{U}(0,10)$.
    \item Generate the training response $\textrm{train\_y}$, which is a linear combination of the $n_c$ features of $\textrm{train\_x\_true}$ with the true coefficients as weights, plus Gaussian noise from $\mathcal{N}(0,4)$. Generate the test response $\textrm{test\_y}$ in a similar manner.
    \item Bin each feature and response of $(\textrm{train\_x}, \textrm{train\_y})$ and $(\textrm{test\_x}, \textrm{test\_y})$ into 10 quantile-based categories. 
\end{enumerate}

\section{Implementation and hyperparameter tuning process for competing models}
\label{app:hyp-tun}

We fit the proposed tabular extension of bidirectional attention model
to each training set, together with a few competing methods, namely
logistic regression (LR), random forests (RF), gradient boosting (GB)
and multilayer perceptron (MLP). We then evaluate the prediction
accuracy (Acc) and mean squared error (MSE) on the corresponding test
set. For each set of hyperparameters, we take the average of both
metrics across the 20 generated data sets.

We implement the proposed extension of bidirectional attention (ATN)
in Keras using a single-layer BERT \citep{devlin2018bert} with 5
heads, an embedding size of 20, and a feed-forward layer of dimension
5. We use the Adam optimizer with the default parameters, a batch
size of 128, and 200 training epochs. For the competing
methods, we use sklearn's implementation with hyperparameters chosen
via 5-fold cross-validation in classification accuracy.


For each data set, the hyperparameters of the random forest (RF),
gradient boosting (GB) and multilayer perceptron (MLP) models are
chosen via 5-fold cross-validation based on the classification
accuracy.

\textbf{Random forest.} We consider every combination of the following hyperparameters: (a) \textit{criterion}: \texttt{gini} or \texttt{entropy}; (b) \textit{n\_estimators}: 50, 100 or 200; and (c) \textit{max\_depth}: 1, 3 or \texttt{None}.

\textbf{Gradient boosting.} We consider every combination of the following hyperparameters: (a) \textit{learning\_rate}: 0.01, 0.1 or 1; (b) \textit{n\_estimators}: 50, 100 or 200; and (c) \textit{max\_depth}: 1, 3 or 5.

\textbf{Multilayer perceptron.} We consider every combination of the following hyperparameters: (a) \textit{hidden\_layer\_sizes}: (50,), (100,) or (100,50); (b) \textit{alpha}: 0.0001, 0.001 or 0.01; and (c) \textit{learning\_rate}: \texttt{constant} or \texttt{adaptive}.


\section{Details of the word analogy experiment}
\label{sec:analogy-details-expm}

\parhead{Data description.} We use the analogy data set first
introduced in \citet{pennington2014glove}. This data set contains
19,544 questions of the form ``\textit{a} is to \textit{b} as
\textit{c} is to ?'', together with the correct answers. As an example,
the first question in the data set is ``Athens is to Greece as Baghdad
is to ?'' (correct answer: Iraq). Overall, these questions can be
categorized into two groups: semantic (about people and places) and
syntactic (about word forms such as comparative, superlative and
plural). For each question, we look for the word $d \neq a,b,c$ in the
vocabulary such that the cosine similarity between $x_d$ and $x_b +
x_c - x_a$ is maximized; $x_i$ represents the embedding of word $i$.

We only include a question when all four words involved are present in
the vocabulary list of each model. Out of 19,544 questions in the data
set, 9,522 (49\%) of them satisfy this condition. Analyzing each
category separately, we find that the condition is satisfied for 2,278
(26\%) out of 8,869 semantic questions, and 7,244 (68\%) out of 10,675
syntactic questions.

\parhead{Models.} We consider three models: (1) BERT base uncased,
which is used in the original BERT paper \citep{devlin2018bert}; (2)
GloVe trained on Wikipedia \citep{pennington2014glove}; and (3) word2vec
trained with CBOW \citep{mikolov2013distributed}. The embedding
dimensions of these models are 768, 300 and 300, respectively,
while the vocabulary sizes are around 30K, 400K and 3M,
respectively. Since all questions in the data set consist of single
words (e.g., not \textit{golden\_retriever}), we only consider single
words as possible answers to each question in order to perform a fair
comparison among these models; we also exclude non-words (e.g.,
\textit{[unused9]}, \textit{\#\# ?}) from the list of possible
answers.
    
\section{Detailed analysis of embeddings for \gls{CBOW} and bidirectional attention}

\label{sec:detailed-analogy}


We begin by theoretically characterizing the conditions under which
\gls{CBOW} embeddings can exhibit linear word analogies. Adopting
\citeauthor{allen2019analogies}'s [\citeyear{allen2019analogies}]
argument for \gls{SGNS}, we extend the argument to both \gls{CBOW} and
attention-based token embeddings, thanks to the equivalence we
established in \Cref{prop:mlm-moe-equiv}.

\subsection{Linear word analogies in \gls{CBOW} embeddings}

% CBOW embeddings have been empirically shown to also exhibit linear structures, albeit not as strong as compared to \gls{SGNS} or GloVe \citep{pennington2014glove}. This finding is also supported by the experiment results in \Cref{sec:attn-lin-emp}. 

To perform this theoretical analysis, we follow existing analyses about \gls{SGNS}: \citet{levy2014neural} showed that for a sufficiently large embedding dimension, embeddings from \gls{SGNS} satisfy
% \begin{equation*}
% \begin{split}
    $w_i^\top c_j = \log \left( \frac{p(w_i, c_j)}{p(w_i) p(c_j)}\right) - \log k = \textrm{PMI}(w_i, c_j) - \log k,$
% \end{split}
% \end{equation*} 
where $k$ is the number of negative samples for each positive sample;
$W^{LOV}, C \in \mathbb{R}^{|V| \times p}$ are the center and context
embedding matrices, respectively. For each $i \in [|V|]$, $w_i^\top$
($c_i^\top$) is the $i$-th row of $W^{LOV}$ ($C$), which represents
the center (context) embedding of word $i$.

Using this result, \citet{allen2019analogies}
considered embeddings which factorize the unshifted PMI matrix, namely
$w_i^\top c_j = \textrm{PMI}(w_i, c_j)$, compactly written as $W^\top
C = \textrm{PMI}$. Through the ideas of \textit{paraphrases} and
\textit{word transformations}, they explained why linear relationships
exist for analogies on \gls{SGNS} word embeddings.

We next perform similar analyses for \gls{CBOW} and bidirectional
attention to characterize the conditions they require for linear word
analogies.






\parhead{What matrix does CBOW (approximately) factorize?} \Cref{prop:supp-cbow-sim} is the CBOW version of \citeauthor{levy2014neural}'s [\citeyear{levy2014neural}] classical result on between-token similarities for \gls{SGNS}. The proof can be found in \Cref{sec:app-proof-simil-w2v}. 

\begin{prop}
\label{prop:supp-cbow-sim}
Consider CBOW without negative sampling. Using the same notation as before, we have
\begin{equation*}
    w_i^\top c_j \approx \log \left( \frac{p(w_i, c_j)}{p(c_j)} \right) + \log |V|.
\end{equation*}
\end{prop}

From \Cref{prop:supp-cbow-sim}, we know that CBOW approximately factorizes $M$, a $|V| \times |V|$ matrix such that 
$$M_{i,j} = \log \left(\frac{p(w_i, c_j)}{p(c_j)} \right) + \log |V|.$$

It is worth noting that this formula is similar to that for noise-contrastive estimation (NCE) as mentioned in \citet{levy2014neural}, with $\log |V|$ replaced by $- \log k$. Also, observe that $w_i^\top c_j > w_k^\top c_j$ if and only if $p(w_i, c_j) > p(w_k, c_j)$. 

We empirically verify \Cref{prop:supp-cbow-sim} using a toy corpus with a vocabulary size of 12. This corpus consists of 10,000 sentences, each of which has length 5. The corpus generation process is detailed in \Cref{sec:app-corp-gen-proc}. We then train a CBOW model with the whole sentence except the center word as the context. We choose the embedding dimension to be one of $\{30, 100, 300, 900\}$. For each dimension, we compute (1) the Spearman correlation between $w_i^\top c_j$ and $p(w_i, c_j) / p(c_j)$ for each $i,j$; and (2) the Pearson correlation between $w_i^\top c_j$ and $\log \left(p(w_i, c_j) / p(c_j) \right) + \log |V|$ for each $i,j$ such that the latter is well-defined. We obtain values of $(0.74, 0.77, 0.77, 0.77)$ for (1) and $(0.67, 0.71, 0.70, 0.71)$ for (2), which are reasonably high.

% Without invoking the Cauchy-Schwarz inequality as in the proof of \Cref{prop:supp-cbow-sim}, we do not have a nice closed form for $w_i^\top c_j$. Instead, we can derive a system of equations concerning the $w_i^\top c_j$'s as summarized in \Cref{prop:supp-cbow-sim-no-cs}.
% The proof can be found in \Cref{sec:app-proof-w2v-wo-cs}.

% \begin{prop}
% \label{prop:supp-cbow-sim-no-cs}
% Consider CBOW without negative sampling. Define $\Delta(j_1, j_2)$ as the number of times $j_1$ and $j_2$ occur together in the context on the whole corpus. For example, in an instance where the center word is $c$ and the context words are $a$, $a$, $b$, $b$ and $c$, we have $\Delta(a,a) = 1$, $\Delta(a,b) = 4$ and $\Delta(a,c) = 2$. Let $\Delta \in \mathbb{R}^{n \times n}$ be a matrix such that $(\Delta)_{k,l} :=  \Delta(k,l)$, and $\square := \mathrm{diag}(\#(c_1) + \Delta(1,1), \cdots, \#(c_n) +  \Delta(n, n))$. Assuming a sufficiently large embedding dimension, we have
% \begin{equation*}
%     \begin{pmatrix}
%     w_i^\top c_1\\
%     \cdots \\
%     w_i^\top c_{n}
%     \end{pmatrix} \approx 2m (\Delta + \square)^{-1} \begin{pmatrix}
%     n \#(w_i,c_1) + \#(c_1) \\
%     \cdots \\
%     n \#(w_i,c_n) + \#(c_n)
%     \end{pmatrix},
% \end{equation*}
% where $2m$ is the window size, $\#(w_k,c_l)$ represents the number of center-context pairs with $k$ as the center word and $l$ as the context word, and $\#(c_l) = \sum_k \#(w_k,c_l)$.
% \end{prop}

\parhead{The paraphrasing argument for CBOW.} We look at what it means for two word sets to paraphrase each other.

\begin{defn}[Definition D2 of \citet{allen2019analogies}]
\label{defn: paraph}
Let $\mathcal{E}$ be the set of all words in the vocabulary. Two word sets $\mathcal{W}, \mathcal{W}_* \subseteq \mathcal{E}$ are said to paraphrase each other if the paraphrase error $\rho^{\mathcal{W}, \mathcal{W}_*} \in \mathbb{R}^{|V|}$ is element-wise small, where
\begin{equation*}
    \rho^{\mathcal{W}, \mathcal{W}_*}_j = \log \left( \frac{p(c_j | \mathcal{W}_*)}{p(c_j | \mathcal{W})} \right)
\end{equation*}
for every $c_j \in \mathcal{E}$.
\end{defn}
Intuitively, ``word sets paraphrase one another if they induce equivalent distributions over context words''. When $\mathcal{W}$ and $\mathcal{W}_*$ paraphrase each other, we write $\mathcal{W} \approx_P \mathcal{W}_*$. From \Cref{defn: paraph}, we observe that $\mathcal{W} \approx_P \mathcal{W}_*$ if and only if $\mathcal{W}_* \approx_P \mathcal{W}$. Also, we implicitly require both $p(\mathcal{W}_*)$ and $p(\mathcal{W})$ to be positive. This is exactly Assumption A3 in the original paper. We now provide an equivalent version of their Lemma 2 for the matrix $M$. Here, $M_i^\top$ denotes the $i$-th row of $M$. The proof is provided in \Cref{sec:pf-lemma-5}.
\begin{lemma}
\label{lemma:supp-lem-2-for-m}
For any word sets $\mathcal{W}, \mathcal{W}_* \subseteq \mathcal{E}$ with the same cardinality, we have
\begin{equation*}
    \begin{split}
        \sum_{w_i \in \mathcal{W}_*} M_i &= \sum_{w_i \in \mathcal{W}} M_i + \rho^{\mathcal{W}, \mathcal{W}_*} + \sigma^{\mathcal{W}} - \sigma^{\mathcal{W}_*} + \delta^{\mathcal{W}, \mathcal{W}_*} \\
        &= \sum_{w_i \in \mathcal{W}} M_i + \xi^{\mathcal{W}, \mathcal{W}_*} + \sigma^{\mathcal{W}} - \sigma^{\mathcal{W}_*},
    \end{split}
\end{equation*}
where 
$$\sigma_j^{\mathcal{W}} = \log \left( \frac{p(\mathcal{W} | c_j)}{\prod_{w_i \in \mathcal{W}} p(w_i|c_j)} \right),$$
$$\sigma_j^{\mathcal{W}_*} = \log \left( \frac{p(\mathcal{W}_* | c_j)}{\prod_{w_i \in \mathcal{W}_*} p(w_i|c_j)} \right),$$

$\delta_j^{\mathcal{W}, \mathcal{W}_*}  = \log \left( \frac{p(\mathcal{W}_*)}{p(\mathcal{W})} \right)$, and $\xi_j^{\mathcal{W}, \mathcal{W}_*} = \log \left( \frac{p( \mathcal{W}_* | c_j)}{p( \mathcal{W} | c_j)} \right)$.

\end{lemma}
 
\Cref{prop:supp-anal-w2v}, which is equivalent to Corollary 2.3 of \citet{allen2019analogies}, follows from multiplying both sides of the equations in \Cref{lemma:supp-lem-2-for-m} by $C^{\dagger} = (C C^\top)^{-1} C$ (assuming $C$ has full row rank) and setting $\mathcal{W} =  \{ w_b, w_{a^*}\}$ and $\mathcal{W}_* = \{ w_{b^*}, w_a \}$.
\begin{prop}
\label{prop:supp-anal-w2v}
    Given any $w_a, w_{a^*}, w_b, w_{b^*} \in \mathcal{E}$, we have
    \begin{align*}
            w_{b^*} 
            &= w_{a^*} - w_a + w_b + C^{\dagger}(\rho^{\mathcal{W}, \mathcal{W}_*} + \sigma^{\mathcal{W}} - \sigma^{\mathcal{W}_*} + \delta^{\mathcal{W}, \mathcal{W}_*}) \\
            &= w_{a^*} - w_a + w_b + C^{\dagger}(\xi^{\mathcal{W}, \mathcal{W}_*} + \sigma^{\mathcal{W}} - \sigma^{\mathcal{W}_*}),
    \end{align*}
    where $\mathcal{W} = \{ w_b, w_{a^*}\}$ and $\mathcal{W}_* = \{ w_{b^*}, w_a \}$.
\end{prop}
From \Cref{prop:supp-anal-w2v}, we see that when $\mathcal{W} \approx_P \mathcal{W}_*$, and $\sigma^{\mathcal{W}}$, $\sigma^{\mathcal{W}_*}$ and $\delta^{\mathcal{W}, \mathcal{W}_*}$ are small, we have $w_{b^*} \approx w_{a^*} - w_a + w_b$. By definition, $\sigma^{\mathcal{W}}$ ($\sigma^{\mathcal{W}_*}$) is small when all $w_i \in \mathcal{W}$ ($w_i \in \mathcal{W}_*$) are approximately conditionally independent given $c_j$, and $\delta^{\mathcal{W}, \mathcal{W}_*}$ is small when $p(\mathcal{W}) \approx p(\mathcal{W}_*)$. Following the connection between analogies and word transformations described in Sections 6.3 and 6.4 of \citet{allen2019analogies}, we now have an approximately linear relationship for CBOW embeddings with some error terms mentioned above.

Alternatively, we can modify \Cref{defn: paraph} so that $\mathcal{W} \approx_P \mathcal{W_*}$ if and only if $\xi^{\mathcal{W}, \mathcal{W}_*}$ (instead of $\rho^{\mathcal{W}, \mathcal{W}_*}$) is element-wise small. Now, our error terms only depend on the approximate conditional independence of $w_i$'s given $c_j$.

\parhead{Does this linear relationship also hold for context embeddings?} In other words, if $w_r + w_s \approx w_t + w_u$, do we have $c_r + c_s \approx c_t + c_u$? \Cref{prop:supp-anal-ctx}, whose proof is provided in \Cref{sec:app-proof-w2v-anal-context}, answers the question.

\begin{prop}
\label{prop:supp-anal-ctx}
    Let $\mathcal{W} = \{r,s\}$ and $\mathcal{W}_* = \{t,u\}$. Assume $p(\mathcal{W}) \approx p(\mathcal{W_*})$ and $w_i \in \mathcal{W}$ ($w_i \in \mathcal{W_*}$) are approximately marginally independent. Also, assume that $W$ has full row rank. If $w_r + w_s \approx w_t + w_u$, then $c_r + c_s \approx c_t + c_u$.
\end{prop}

So far, we have argued that both the center and context embeddings of CBOW exhibit linear structures under some assumptions. We now extend this argument to MLM with self-attention, and show that the same conclusion holds under stronger assumptions.

\subsection{Linear word analogies in attention-based embeddings}

Similar to \Cref{sec:lin-rel-cbow}, we compute the matrix that MLM with self-attention factorizes and construct a paraphrasing argument to show linear structures in the learned embeddings.

\parhead{What matrix does MLM with self-attention (approximately) factorize?} To make calculations tractable, we exclude both residual connections and positional encodings. Let the masked sentence be $(a_1, \cdots, a_S)$. As before, let $m \in [S]$ and $b \in [|V|]$ denote the masked position and masked word, respectively. This means $a_i \in [|V|]$ for every $i \neq m$ and $a_m = |V| + 1$. From \Cref{prop:mlm-obj}, the loss for this instance is given by
\begin{equation}
\label{eq:sim-attn-loss}
\begin{split}
    -\sum_{j=1}^S \frac{\tau_{a_j}}{\sum_{j=1}^S \tau_{a_j}} w_b^\top c_{a_j} + \log\left( \sum_{k=1}^{|V|} \exp\left( \sum_{j=1}^S \frac{\tau_{a_j}}{\sum_{j=1}^S \tau_{a_j}} w_k^\top c_{a_j}  \right)\right),
\end{split}
\end{equation}
where $\tau_j = \exp\left( \frac{c_j ^\top W^{KQ} c_{|V|+1}}{\sqrt{d_w}} \right)$. \Cref{prop:supp-sim-attn} approximates the matrix factorized by the attention objective, given all $\tau_j$ values for each $j \in [|V| + 1]$. The proof is similar to that of \Cref{prop:supp-cbow-sim}, and therefore omitted.

\begin{prop}
\label{prop:supp-sim-attn}
    Consider the attention objective as in \Cref{eq:sim-attn-loss}. We have
    \begin{equation}
    \label{eq:sim-attn}
    \begin{split}
         w_i^\top c_j 
         &\approx \frac{|V| \sum_{(i,j)} \gamma_{j}^i - \left(\sum_{(1,j)} \gamma_{j}^1 + \cdots + \sum_{(|V|,j)} \gamma_{j}^{|V|} \right)}{S \left(\sum_{(1,j)} (\gamma_{j}^1)^2 + \cdots + \sum_{(|V|,j)} (\gamma_{j}^{|V|})^2 \right)},
    \end{split}
    \end{equation}
where for a center-context pair $(d,j)$ in the masked sentence $(a_1, \cdots, a_S)$, we define $\gamma_j^d = \tau_j / \sum_{s=1}^S \tau_{a_s}$.
\end{prop}
In other words, MLM with self-attention approximately factorizes a $|V| \times |V|$ matrix whose $(i,j)$-th entry is given by \Cref{eq:sim-attn}. It is important to note that unlike in CBOW, the token embedding for each word $i$ is $c_i$ (the \textit{context} embedding), and not $w_i$ (the \textit{center} embedding). In the case where $\tau_j$ is approximately the same for every $j \in [|V|+1]$, our problem approximately reduces to a vanilla CBOW. In particular, we always have $\gamma_j^d \approx 1/S$, whence \Cref{prop:supp-sim-attn} yields $w_i^\top c_j \approx \frac{p(w_i, c_j)}{p(c_j)} \cdot |V| - 1 \approx \log \left( \frac{p(w_i, c_j)}{p(c_j)} \right) + \log |V|$. Using \Cref{prop:supp-anal-w2v}, we argue that the resulting embeddings approximately form a linear relationship, up to some error terms. 

\parhead{The paraphrasing argument for MLM with self-attention.} We first define
\begin{equation*}
    \tilde{c}_j := \frac{S \left(\sum_{(1,j)} (\gamma_{j}^1)^2 + \cdots + \sum_{(|V|,j)} (\gamma_{j}^{|V|})^2 \right)}{\sum_{(1,j)} \gamma_{j}^1 + \cdots + \sum_{(|V|,j)} \gamma_{j}^{|V|} } c_j
\end{equation*}
for every $j \in [|V|+1]$. This means
\begin{equation*}
\begin{split}
     w_i^\top \tilde{c}_j 
     &\approx \frac{|V| \sum_{(i,j)} \gamma_{j}^i}{\sum_{(1,j)} \gamma_{j}^1 + \cdots + \sum_{(|V|,j)} \gamma_{j}^{|V|} } - 1  \\
     &\approx \log \left( \frac{ \sum_{(i,j)} \gamma_{j}^i}{\sum_{(1,j)} \gamma_{j}^1 + \cdots + \sum_{(|V|,j)} \gamma_{j}^{|V|} }\right) + \log |V|,
\end{split}
\end{equation*}
where we used the approximation $x \approx \log(1+x)$. Previously, $p(w_i, c_j)$ represents a population quantity which is estimated by $\#(w_i,c_j)/D$, where $D$ is a normalizing constant, and $p(c_j) = \sum_i p(w_i, c_j)$. We now define $\bar{p}(w_i, c_j)$, a population quantity which is estimated by $\sum_{(i,j)} \gamma_j^i/E$ for some normalizing constant $E$. We have
\begin{equation}
\label{eq:sim-attn-transf}
    w_i^\top \tilde{c}_j \approx \log \left( \frac{\bar{p}(w_i, c_j)}{\bar{p}(c_j)} \right) + \log |V|,
\end{equation}
where $\bar{p}(c_j) = \sum_i \bar{p}(w_i,c_j)$. Note that unlike $p$, $\bar{p}$ is not symmetric, i.e., $\bar{p}(w_i, c_j) \neq \bar{p}(w_j,c_i)$. Having defined $\bar{p}$, we are ready to state \Cref{lemma:supp-lem-5-for-n}, which is a version of \Cref{lemma:supp-lem-2-for-m} for the matrix $N$, where $$N_{i,j} = \log \left( \frac{\bar{p}(w_i, c_j)}{\bar{p}(c_j)} \right) + \log |V|.$$ 
Here, $N_i^\top$ denotes the $i$-th row of $N$. The proof is analogous to that of \Cref{lemma:supp-lem-2-for-m} and is thus omitted.

\begin{lemma}
\label{lemma:supp-lem-5-for-n}
For any word sets $\mathcal{W}, \mathcal{W}_* \subseteq \mathcal{E}$ with the same cardinality, we have
\begin{equation*}
\begin{split}
    \sum_{w_i \in \mathcal{W}_*} N_i &= \sum_{w_i \in \mathcal{W}} N_i + \bar{\rho}^{\mathcal{W}, \mathcal{W}_*} + \bar{\sigma}^{\mathcal{W}} - \bar{\sigma}^{\mathcal{W}_*} + \bar{\delta}^{\mathcal{W}, \mathcal{W}_*} \\
    &= \sum_{w_i \in \mathcal{W}} N_i + \bar{\xi}^{\mathcal{W}, \mathcal{W}_*} + \bar{\sigma}^{\mathcal{W}} - \bar{\sigma}^{\mathcal{W}_*},
\end{split}
\end{equation*}
where 
$$\bar{\sigma}_j^{\mathcal{W}} = \log \left( \frac{\bar{p}(\mathcal{W} | c_j)}{\prod_{w_i \in \mathcal{W}} \bar{p}(w_i|c_j)} \right),$$
$$\bar{\sigma}_j^{\mathcal{W}_*} = \log \left( \frac{\bar{p}(\mathcal{W}_* | c_j)}{\prod_{w_i \in \mathcal{W}_*} \bar{p}(w_i|c_j)} \right),$$ 
$\bar{\rho}^{\mathcal{W}, \mathcal{W}_*}_j = \log \left( \frac{\bar{p}(c_j | \mathcal{W}_*)}{\bar{p}(c_j | \mathcal{W})} \right)$ , $\bar{\delta}_j^{\mathcal{W}, \mathcal{W}_*}  = \log \left( \frac{\bar{p}(\mathcal{W}_*)}{\bar{p}(\mathcal{W})} \right)$, and $\bar{\xi}_j^{\mathcal{W}, \mathcal{W}_*} = \log \left( \frac{\bar{p}( \mathcal{W}_* | c_j)}{\bar{p}( \mathcal{W} | c_j)} \right)$.
\end{lemma}
\Cref{prop:supp-anal-w2v-attn,prop:supp-anal-ctx-attn} are the attention versions of \Cref{prop:supp-anal-w2v,prop:supp-anal-ctx}. The proof of \Cref{prop:supp-anal-w2v-attn} follows from multiplying both sides of the equations in \Cref{lemma:supp-lem-5-for-n} by $\tilde{C}^{\dagger} = (\tilde{C} \tilde{C}^\top)^{-1} \tilde{C}$ (assuming $\tilde{C}$ has full row rank) and setting $\mathcal{W} =  \{ w_b, w_{a^*}\}$ and $\mathcal{W}_* = \{ w_{b^*}, w_a \}$.
The proof of \Cref{prop:supp-anal-ctx-attn} can be found in \Cref{sec:app-proof-attn-anal-context}. 
\begin{prop}
\label{prop:supp-anal-w2v-attn}
    Given any $w_a, w_{a^*}, w_b, w_{b^*} \in \mathcal{E}$, we have
    \begin{align*}
            w_{b^*} 
            &= w_{a^*} - w_a + w_b + \tilde{C}^{\dagger}(\bar{\rho}^{\mathcal{W}, \mathcal{W}_*} + \bar{\sigma}^{\mathcal{W}} - \bar{\sigma}^{\mathcal{W}_*} + \bar{\delta}^{\mathcal{W}, \mathcal{W}_*}) \\
            &= w_{a^*} - w_a + w_b + \tilde{C}^{\dagger}(\bar{\xi}^{\mathcal{W}, \mathcal{W}_*} + \bar{\sigma}^{\mathcal{W}} - \bar{\sigma}^{\mathcal{W}_*}),
    \end{align*}
    where $\mathcal{W} = \{ w_b, w_{a^*}\}$ and $\mathcal{W}_* = \{ w_{b^*}, w_a \}$.
\end{prop}

\begin{prop}
\label{prop:supp-anal-ctx-attn}
    Let $\mathcal{W} = \{r,s\}$ and $\mathcal{W}_* = \{t,u\}$. Assume $\bar{p}(\mathcal{W}) \approx \bar{p}(\mathcal{W_*})$ and $w_i \in \mathcal{W}$ ($w_i \in \mathcal{W_*}$) are approximately marginally independent. Also, assume that $W$ has full row rank and $\bar{p}(w_i, c_j) \approx \bar{p}(w_j,c_i)$. If $w_r + w_s \approx w_t + w_u$, then $\tilde{c}_r + \tilde{c}_s \approx \tilde{c}_t + \tilde{c}_u$.
\end{prop}


\parhead{What do we learn from these results?} One important takeaway is that the sufficient conditions to obtain linear relationships are stronger in the case of MLM with self-attention as compared to CBOW. 
Concretely, we need $\bar{p}$ to be approximately symmetric. Even when this is satisfied, the linear relationships hold for the transformed embeddings $\tilde{c}_i$'s instead of the token embeddings $c_i$'s. Under an additional assumption that
\begin{equation*}
   \zeta_j := \frac{\sum_{(1,j)} (\gamma_{j}^1)^2 + \cdots + \sum_{(|V|,j)} (\gamma_{j}^{|V|})^2 }{\sum_{(1,j)} \gamma_{j}^1 + \cdots + \sum_{(|V|,j)} \gamma_{j}^{|V|} }
\end{equation*}
is approximately the same for each $j$ (e.g., when $\tau_j$ is approximately the same for every $j$), we approximately have linear relationships for the token embeddings $c_i$'s.

\textit{\textbf{Remarks.} It is easy to see that our result can technically be extended to incorporate positional encodings by considering each (word, position) pair as a unit. In particular, analogies are drawn between (word, position) units.}

%\bibliography{uai2023-template}


\section{Proof of Proposition \ref{prop:supp-cbow-sim}}
\label{sec:app-proof-simil-w2v}

\textbf{Proposition \ref{prop:supp-cbow-sim}}. \textit{Consider CBOW without negative sampling. Using the same notation as before, we have
\begin{equation*}
    w_i^\top c_j \approx \log \left( \frac{p(w_i, c_j)}{p(c_j)} \right) + \log |V|.
\end{equation*}}

\textit{Proof.} For simplicity, we assume that the window size is always $2m$. Consider an instance with $i$ as the center word and $j \in J$ as the context words. The loss for this instance can be approximated as
\begin{equation*}
    \begin{split}
        &-\frac{\sum_{j \in J} w_i^\top c_j}{2m} + \log \left( \sum_{k=1}^{|V|} \exp \left( \frac{\sum_{j \in J} w_k^\top c_j}{2m}\right) \right) \\
        &\approx -\frac{\sum_{j \in J} w_i^\top c_j}{2m} + \log \left( \sum_{k=1}^{|V|} \left( 1 + \frac{\sum_{j \in J} w_k^\top c_j}{2m} + \frac{(\sum_{j \in J} w_k^\top c_j)^2}{8m^2} \right) \right)  \\
        &= -\frac{\sum_{j \in J} w_i^\top c_j}{2m} + \log   |V| + \log \left( 1 + \frac{\sum_{k=1}^{|V|} \left( \sum_{j \in J} w_k^\top c_j\right)}{2m |V|} + \frac{\sum_{k=1}^{|V|} \left( \sum_{j \in J} w_k^\top c_j\right)^2}{8m^2 |V|} \right)  \\
        &\approx -\frac{\sum_{j \in J} w_i^\top c_j}{2m} + \log |V| + \frac{\sum_{k=1}^{|V|} \left( \sum_{j \in J} w_k^\top c_j\right)}{2m |V|} + \frac{\sum_{k=1}^{|V|} \left( \sum_{j \in J} w_k^\top c_j\right)^2}{8m^2 |V|} \\
        &\leq -\frac{\sum_{j \in J} w_i^\top c_j}{2m} + \log |V| + \frac{\sum_{k=1}^{|V|} \left( \sum_{j \in J} w_k^\top c_j\right)}{2m |V|} + \frac{\sum_{k=1}^{|V|} \left( \sum_{j \in J} (w_k^\top c_j)^2\right)}{4m |V|},
    \end{split}
\end{equation*}
where we used the Taylor expansions $\exp(x) \approx 1 + x + x^2/2$ and $\log(1 + x) \approx x$, as well as the Cauchy-Schwarz inequality. Ignoring the constant $\log |V|$ and multiplying by $2m|V|$, the approximate loss can be written as
\begin{equation*}
    - |V| \sum_{j \in J} w_i^\top c_j + \sum_{k=1}^{|V|} \left( \sum_{j \in J} w_k^\top c_j\right) + \frac{1}{2} \sum_{k=1}^{|V|} \left( \sum_{j \in J} (w_k^\top c_j)^2\right).
\end{equation*}
Summing this over all instances and only extracting terms which depend on $w_i^\top c_j$, we have the following loss which we want to minimize:
\begin{equation*}
    \ell(i, j) = - |V| \cdot \#(w_i, c_j) w_i^\top c_j + \#(c_j) w_i^\top c_j + \frac{1}{2} \#(c_j) (w_i^\top c_j)^2.
\end{equation*}
Taking derivative with respect to $w_i^\top c_j$ and setting it to 0 yields 
\begin{equation*}
    w_i^\top c_j = \left(\frac{\#(w_i,c_j)}{\#(c_j)}\right) \cdot |V| - 1 = \left(\frac{p(w_i, c_j)}{p(c_j)} \cdot |V| \right) - 1.
    \end{equation*}
The approximation $x \approx \log(1+x)$ completes the proof.



\section{Corpus generation process}
\label{sec:app-corp-gen-proc}
\begin{enumerate}
    \item Consider four subjects (mathematics, statistics, sociology and history) and four adjectives (fun, boring, easy and difficult). Assign to each subject scores which represent the level of each adjective:
    \begin{enumerate}
        \item mathematics: (4, 2, 4, 2).
        \item statistics: (6, 0, 5, 1).
        \item sociology: (1, 5, 2, 4).
        \item history: (0, 6, 0, 6).
    \end{enumerate}
    \item Consider three types of sentence:
        \begin{enumerate}
            \item Type 1: I like \texttt{subj1} and \texttt{subj2}, where \texttt{subj1} and \texttt{subj2} are independently chosen from the list of subjects with probability $(4/11, 5/11, 1/11, 1/11)$. 
            \item Type 2: \texttt{subj1} and \texttt{subj2} is \texttt{adj}, where \texttt{subj1} and \texttt{subj2} are independently chosen from the list of subjects with uniform probability, and  \texttt{adj} is chosen from the list of adjectives with probability proportional to the sum of the scores of \texttt{subj1} and \texttt{subj2}.
            \item Type 3: \texttt{subj} is \texttt{adj1} and \texttt{adj2}, where \texttt{subj} is chosen from the list of subjects with uniform probability, and \texttt{adj1} and \texttt{adj2} are independently chosen from the list of adjectives with probability proportional to the score of \texttt{subj}.
        \end{enumerate}
    \item To generate each sentence, we first randomly choose the sentence type with uniform probability. We then form the sentence following the process above.
\end{enumerate}

\section{Proof of Lemma \ref{lemma:supp-lem-2-for-m}}
\label{sec:pf-lemma-5}
\textbf{Lemma \ref{lemma:supp-lem-2-for-m}}. \textit{For any word sets $\mathcal{W}, \mathcal{W}_* \subseteq \mathcal{E}$ with the same cardinality, we have
\begin{equation*}
    \begin{split}
        \sum_{w_i \in \mathcal{W}_*} M_i &= \sum_{w_i \in \mathcal{W}} M_i + \rho^{\mathcal{W}, \mathcal{W}_*} + \sigma^{\mathcal{W}} - \sigma^{\mathcal{W}_*} + \delta^{\mathcal{W}, \mathcal{W}_*} \\
        &= \sum_{w_i \in \mathcal{W}} M_i + \xi^{\mathcal{W}, \mathcal{W}_*} + \sigma^{\mathcal{W}} - \sigma^{\mathcal{W}_*},
    \end{split}
\end{equation*}
where $\sigma_j^{\mathcal{W}} = \log \left( \frac{p(\mathcal{W} | c_j)}{\prod_{w_i \in \mathcal{W}} p(w_i|c_j)} \right)$, $\sigma_j^{\mathcal{W}_*} = \log \left( \frac{p(\mathcal{W}_* | c_j)}{\prod_{w_i \in \mathcal{W}_*} p(w_i|c_j)} \right)$, $\delta_j^{\mathcal{W}, \mathcal{W}_*}  = \log \left( \frac{p(\mathcal{W}_*)}{p(\mathcal{W})} \right)$, and $\xi_j^{\mathcal{W}, \mathcal{W}_*} = \log \left( \frac{p( \mathcal{W}_* | c_j)}{p( \mathcal{W} | c_j)} \right)$.}

\textit{Proof.} Observe that $p(c_j | \mathcal{W}_*) = \frac{p(\mathcal{W}_* | c_j) p(c_j)}{p(\mathcal{W}_*)}$ and $p(c_j | \mathcal{W}) = \frac{p(\mathcal{W} | c_j) p(c_j)}{p(\mathcal{W})}$, whence $\rho_j^{\mathcal{W}, \mathcal{W}_*} = \log \left( \frac{p(c_j | \mathcal{W}_*)}{p(c_j | \mathcal{W})} \right) = \log \left( \frac{p( \mathcal{W}_* | c_j) }{p( \mathcal{W} | c_j) } \right) + \log \left( \frac{p(\mathcal{W})}{p(\mathcal{W}_*)} \right)$. We have
\begin{align*}
    &\sum_{w_i \in \mathcal{W}_*} M_i - \sum_{w_i \in \mathcal{W}} M_i \\
    &=\sum_{w_i \in \mathcal{W}_*} \log \left(\frac{p(w_i, c_j)}{p(c_j)} \right) - \sum_{w_i \in \mathcal{W}} \log \left(\frac{p(w_i, c_j)}{p(c_j)} \right) \\
    &= \log \prod_{w_i \in \mathcal{W}_*} p(w_i | c_j) - \log \prod_{w_i \in \mathcal{W}} p(w_i | c_j) \\
    &= \log \left( \frac{ \prod_{w_i \in \mathcal{W}_*} p(w_i | c_j)}{ \prod_{w_i \in \mathcal{W}} p(w_i | c_j)} \right) + \log \left( \frac{p(\mathcal{W}_*)}{p(\mathcal{W}_*)} \right) + \log \left( \frac{p(\mathcal{W})}{p(\mathcal{W})} \right) + \log \left( \frac{p(\mathcal{W}_* | c_j)}{p(\mathcal{W}_* | c_j)} \right) + \log \left( \frac{p(\mathcal{W} | c_j)}{p(\mathcal{W} | c_j)} \right) \\
    &= \log \left( \frac{p( \mathcal{W}_* | c_j) }{p( \mathcal{W} | c_j) } \right) + \log \left( \frac{p(\mathcal{W})}{p(\mathcal{W}_*)} \right) + \log \left( \frac{p(\mathcal{W} | c_j)}{\prod_{w_i \in \mathcal{W}} p(w_i|c_j)} \right) - \log \left( \frac{p(\mathcal{W}_* | c_j)}{\prod_{w_i \in \mathcal{W}_*} p(w_i|c_j)} \right) + \log \left( \frac{p(\mathcal{W}_*)}{p(\mathcal{W})} \right) \\
    &= \rho_j^{\mathcal{W}, \mathcal{W}_*} + \sigma_j^{\mathcal{W}} - \sigma_j^{\mathcal{W}_*} + \delta_j^{\mathcal{W}, \mathcal{W}_*}.
\end{align*}
Also, 
\begin{align*}
    \rho_j^{\mathcal{W}, \mathcal{W}_*} + \delta_j^{\mathcal{W}, \mathcal{W}_*} &= \log \left( \frac{p( \mathcal{W}_* | c_j) }{p( \mathcal{W} | c_j) } \right) + \log \left( \frac{p(\mathcal{W})}{p(\mathcal{W}_*)} \right) + \log \left( \frac{p(\mathcal{W}_*)}{p(\mathcal{W})} \right) \\
    &= \xi_j^{\mathcal{W}, \mathcal{W}_*},
\end{align*}
which completes the proof.

\section{Proof of Proposition \ref{prop:supp-anal-ctx}}
\label{sec:app-proof-w2v-anal-context}
\textbf{Proposition \ref{prop:supp-anal-ctx}}. \textit{
    Let $\mathcal{W} = \{r,s\}$ and $\mathcal{W}_* = \{t,u\}$. Assume $p(\mathcal{W}) \approx p(\mathcal{W_*})$ and $w_i \in \mathcal{W}$ ($w_i \in \mathcal{W_*}$) are approximately marginally independent. Also, assume that $W$ has full row rank. If $w_r + w_s \approx w_t + w_u$, then $c_r + c_s \approx c_t + c_u$.
}

\textit{Proof.} For any $c_v \in \mathcal{E}$, we have $(w_r + w_s)^\top c_v \approx (w_t + w_u)^\top c_v$. From \Cref{prop:cbow-sim}, this expression can be simplified as $\log p(w_r, c_v) + \log p(w_s, c_v) \approx \log p(w_t, c_v) + \log p(w_u, c_v)$. This implies $\log p(w_v, c_r) + \log p(w_v, c_s) \approx \log p(w_v, c_t) + \log p(w_v, c_u)$. Observe that
    \begin{align*}
            &w_v^\top (c_r + c_s - c_t - c_u) \\ 
            &= (\log p(w_v, c_r) + \log p(w_v, c_s) - \log p(w_v, c_t) - \log p(w_v, c_u)) + \log \left( \frac{p(c_t) p(c_u)}{p(c_r) p(c_s)} \right) \\
            &\approx 0 + \log \left( \frac{p(\mathcal{W}_*)}{p(\mathcal{W})} \right) \\
            &\approx 0.
    \end{align*}
Since this holds for every $v$ and $W$ has full row rank, we conclude that $c_r + c_s \approx c_t + c_u$, completing the proof.

\section{Proof of Proposition \ref{prop:supp-anal-ctx-attn}}
\label{sec:app-proof-attn-anal-context}
\textbf{Proposition \ref{prop:supp-anal-ctx-attn}.} Let $\mathcal{W} = \{r,s\}$ and $\mathcal{W}_* = \{t,u\}$. Assume $\bar{p}(\mathcal{W}) \approx \bar{p}(\mathcal{W_*})$ and $w_i \in \mathcal{W}$ ($w_i \in \mathcal{W_*}$) are approximately marginally independent. Also, assume that $W$ has full row rank and $\bar{p}(w_i, c_j) \approx \bar{p}(w_j,c_i)$. If $w_r + w_s \approx w_t + w_u$, then $\tilde{c}_r + \tilde{c}_s \approx \tilde{c}_t + \tilde{c}_u$.

\textit{Proof.} For any $\tilde{c}_v \in \mathcal{E}$, we have $(w_r + w_s)^\top \tilde{c}_v \approx (w_t + w_u)^\top \tilde{c}_v$. From \Cref{eq:sim-attn-transf}, this expression can be simplified as $\log \bar{p}(w_r, c_v) + \log \bar{p}(w_s, c_v) \approx \log \bar{p}(w_t, c_v) + \log \bar{p}(w_u, c_v)$. By the assumption that $\bar{p}(w_i, c_j) \approx \bar{p}(w_j,c_i)$, this implies $\log \bar{p}(w_v, c_r) + \log \bar{p}(w_v, c_s) \approx \log \bar{p}(w_v, c_t) + \log \bar{p}(w_v, c_u)$. Observe that
    \begin{align*}
            &w_v^\top (\tilde{c}_r + \tilde{c}_s - \tilde{c}_t - \tilde{c}_u) \\
            &= (\log \bar{p}(w_v, c_r) + \log \bar{p}(w_v, c_s) - \log \bar{p}(w_v, c_t) - \log \bar{p}(w_v, c_u)) + \log \left( \frac{\bar{p}(c_t) \bar{p}(c_u)}{\bar{p}(c_r) \bar{p}(c_s)} \right) \\
            &\approx 0 + \log \left( \frac{\bar{p}(\mathcal{W}_*)}{\bar{p}(\mathcal{W})} \right) \\
            &\approx 0.
    \end{align*}
Since this holds for every $v$ and $W$ has full row rank, we conclude that $\tilde{c}_r + \tilde{c}_s \approx \tilde{c}_t + \tilde{c}_u$, completing the proof.


% % \section{Interpreting similarities between tokens in the masked language model (continued)}
% % As a starting point, we first derive a similar result to that of \citet{levy2014neural} for skip-gram without negative sampling.
% % \begin{prop} 
% % Consider skip-gram without negative sampling. Let $w$ and $c$ denote the center and context words, respectively. Define $\#(w,c)$ as the number of center-context pairs with $w$ as the center word and $c$ as the context word, and $\#(w) = \sum_{i=1}^{|V|} \#(w,i)$. For a sufficiently large embedding dimension, we have $u_c^\top v_w = \log \left(\frac{\#(w,c)}{\#(w)} \right) + k_w$ for some constant $k_w$.
% % \end{prop}
% % \begin{proof}
% % For a particular $(w,c)$ pairs, the cross-entropy loss is $-u_c^\top v_w + \log \left(\sum_{k=1}^{|V|} \exp(u_k^\top v_w) \right)$. Summing this over all $(w,c)$ pairs, we have 
% % \begin{equation*}
% %     \ell(w,c) = -\#(w,c)u_c^\top v_w + \#(w) \log \left(\sum_{k=1}^{|V|} \exp(u_k^\top v_w) \right),
% % \end{equation*}
% % which we want to minimize. Taking derivative with respect to $u_c^\top v_w$ and setting it to 0 yields $\exp(u_c^\top v_w) = (\#(w,c)/\#(w)) \cdot a_w$. The proof is complete after taking logarithm on both sides.
% % \end{proof}
% % Note that by plugging in the formula in Proposition 3 into the cross-entropy loss, we see that the loss does not depend on $k_w$. Letting $a_w = 1$ for all $w$, we have $u_c^\top v_w = \log \left(\frac{\#(w,c)}{\#(w)} \right)$ as a minimizer of the loss function. Corollary 1 trivially follows from Proposition 1.
% % \begin{cor}
% %     $u_a^\top v_w > u_b^\top v_w$ if and only if $\#(w,a) > \#(w,b)$.
% % \end{cor}
% % We now establish a similar result for CBOW. For simplicity, we assume that the window size is always $2m$. Given an instance of $c$ as the center word in a sentence, the loss can be approximated as follows:
% % \begin{equation*}
% % \resizebox{.90\hsize}{!}{
% %     \begin{split}
% %         &-\frac{\sum_{w \in W} u_c^\top v_w}{2m} + \log \left( \sum_{j=1}^{|V|} \exp \left( \frac{\sum_{w \in W} u_j^\top v_w}{2m}\right) \right) \\
% %         &\approx -\frac{\sum_{w \in W} u_c^\top v_w}{2m} + \log \left( \sum_{j=1}^{|V|} \left( 1 + \frac{\sum_{w \in W} u_j^\top v_w}{2m} + \frac{(\sum_{w \in W} u_j^\top v_w)^2}{8m^2} \right) \right)  \\
% %         &= -\frac{\sum_{w \in W} u_c^\top v_w}{2m} + \log   |V| + \log \left( 1 + \frac{\sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right)}{2m |V|} + \frac{\sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right)^2}{8m^2 |V|} \right)  \\
% %         &\approx -\frac{\sum_{w \in W} u_c^\top v_w}{2m} + \log |V| + \frac{\sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right)}{2m |V|} + \frac{\sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right)^2}{8m^2 |V|} \\
% %         &\leq -\frac{\sum_{w \in W} u_c^\top v_w}{2m} + \log |V| + \frac{\sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right)}{2m |V|} + \frac{\sum_{j=1}^{|V|} \left( \sum_{w \in W} (u_j^\top v_w)^2\right)}{4m |V|},
% %     \end{split}}
% % \end{equation*}
% % where we used the Taylor expansions $\exp(x) \approx 1 + x + x^2/2$, $\log(1 + x) \approx x$, and the Cauchy-Schwarz inequality. We have the following proposition.
% % \begin{prop}
% % Consider the approximate CBOW objective as above. Assuming a sufficiently large embedding dimension, we have $u_c^\top v_w = \frac{\#(w,c)}{\#(w)} |V| - 1$. Here, $c$ is the center word and $w$ is the context word.
% % \end{prop}
% % \begin{proof}
% % Ignoring the constant $\log |V|$ and multiplying by $2m |V|$, the approximated loss can be written as
% % \begin{equation*}
% %     - |V| \sum_{w \in W} u_c^\top v_w + \sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right) + \frac{1}{2} \sum_{j=1}^{|V|} \left( \sum_{w \in W} (u_j^\top v_w)^2\right).
% % \end{equation*}
% % Summing this over all instances and only extracting terms which depend on $u_c^\top v_w$, we have
% % \begin{equation*}
% %     \ell(w,c) = -\#(w,c) |V| u_c^\top v_w + \#(w) u_c^\top v_w + \frac{1}{2} \#(w) (u_c^\top v_w)^2.
% % \end{equation*}
% % Taking derivative with respect to $u_c^\top v_w$ and setting it to 0 yields $u_c^\top v_w = \frac{\#(w,c)}{\#(w)} |V| - 1$, as desired. Observe that the term $\frac{\#(w,c)}{\#(w)}$ is also present in the optimal similarity formula for skip-gram.
% % \end{proof}
% % We now present Corollary 2, which follows immediately from Proposition 4.
% % \begin{cor}
% %     $u_a^\top v_w > u_b^\top v_w$ if and only if $\#(w,a) > \#(w,b)$.
% % \end{cor}
% % Without invoking the Cauchy-Schwarz inequality (i.e., using the second-to-last line of the approximated loss above), we do not have a nice closed form for $u_c^\top v_w$. Instead, we can derive a system of equations concerning the quantities $u_c^\top v_w$, as summarized in the following proposition.
% % \begin{prop}
% % Consider the second-to-last line of the CBOW objective as above. Define $\Delta(w_1, w_2)$ as the number of times $w_1$ and $w_2$ occur together in the context on the whole corpus. For example, in an instance where the center word is $c$ and the context words are $a$, $a$, $b$, $b$ and $c$, we have $\Delta(a,a) = 1$, $\Delta(a,b) = 4$ and $\Delta(a,c) = 2$. Let $\Delta \in \mathbb{R}^{|V| \times |V|}$ be a matrix such that $(\Delta)_{i,j} :=  \Delta(i,j)$, and $\square := \mathrm{diag}(\#(1) + \Delta(1,1), \cdots, \#(|V|) +  \Delta(|V|, |V|))$. Assuming a sufficiently large embedding dimension, we have
% % \begin{equation*}
% %     \begin{pmatrix}
% %     u_c^\top v_1\\
% %     \vdots \\
% %     u_c^\top v_{|V|}
% %     \end{pmatrix} = 2m (\Delta + \square)^{-1} \begin{pmatrix}
% %     |V| \#(1,c) - \#(1) \\
% %     \vdots \\
% %     |V| \#(|V|,c) - \#(|V|)
% %     \end{pmatrix}.
% % \end{equation*}
% % \end{prop}
% % \begin{proof}
% % Ignoring the constant $\log |V|$ and multiplying by $2m |V|$, the loss can be written as
% % \begin{equation*}
% %     -|V| \sum_{w \in W} u_c^\top v_w + \sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right) + \frac{1}{4m} \sum_{j=1}^{|V|} \left( \sum_{w \in W} u_j^\top v_w\right)^2.
% % \end{equation*}
% % Summing this over all instances and only extracting terms which depend on $u_c^\top v_w$, we have
% % \begin{equation*}
% % \resizebox{.93\hsize}{!}{
% %    $\ell(w,c) = -\#(w,c) |V| u_c^\top v_w + \#(w) u_c^\top v_w + \frac{1}{4m} \left( \#(w)(u_c^\top v_w)^2 + 2 \sum_{i=1}^{|V|} \Delta(i,w) (u_c^\top v_i) (u_c^\top v_w) \right).$}
% % \end{equation*}
% % Taking the derivative with respect to $u_c^\top v_w$ and setting it to zero, we have
% % \begin{equation*}
% %     u_c^\top v_w = \frac{2m \left( \#(w,c)|V| - \#(w) \right) - \sum_{i \neq w} \Delta(i,w) u_c^\top v_i}{\#(w) + 2\Delta(w,w)},
% % \end{equation*}
% % from which some algebraic manipulation yields the result we want to show.
% % \end{proof}
% % We now analyze the case where we have the attention mechanism. For simplicity, we first consider the simplified loss when there are no residual connections or position encodings, which is
% % \begin{equation*}
% %     \begin{split}
% %         &-\sum_{j=1}^S \frac{\exp\left( \frac{x_{a_j} ^\top W^{KQ} x_{|V|+1}}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{x_{a_j}^\top W^{KQ} x_{|V|+1}}{\sqrt{d_w}} \right)} (W_b^{LOV})^\top (x_{a_j}) \\
% %         &+ \log\left( \sum_{k=1}^{|V|} \exp\left( \sum_{j=1}^S \frac{\exp\left( \frac{x_{a_j}^\top W^{KQ} x_{|V|+1}}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{x_{a_j} ^\top W^{KQ} x_{|V|+1}}{\sqrt{d_w}} \right)} (W_k^{LOV})^\top (x_{a_j})  \right)\right). 
% %     \end{split}
% % \end{equation*}
% % In order to follow the previously used notations, we rewrite $W_c^{LOV}$ as $u_c$ and $x_w$ as $v_w$ for every $c$ and $w$. Note that $w$ now can be equal to $|V| + 1$ due to the introduction of the \texttt{[MASK]} token. We have the following proposition.
% % \begin{prop}
% % Consider the simplified CBOW with attention. Using a similar approximation technique as above and assuming a sufficiently large embedding dimension, we have
% % \begin{equation*}
% %     u_c^\top v_w = \frac{|V| \sum_{(c,w)} \gamma_{w}^c - \left(\sum_{(1,w)} \gamma_{w}^1 + \cdots + \sum_{(|V|,w)} \gamma_{w}^{|V|} \right)}{S \left(\sum_{(1,w)} (\gamma_{w}^1)^2 + \cdots + \sum_{(|V|,w)} (\gamma_{w}^{|V|})^2 \right)},
% % \end{equation*}
% % where for a center-context pair $(d,w)$ in a modified sentence $(a_1, \cdots, a_S)$, we define 
% % \begin{equation*}
% %     \gamma_w^d = \frac{\exp\left( \frac{v_w ^\top W^{KQ} v_{|V|+1}}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{v_{a_j}^\top W^{KQ} v_{|V|+1}}{\sqrt{d_w}} \right)}.
% % \end{equation*}
% % \end{prop}
% % The proof is similar to that of Proposition 4, and is therefore omitted. Note that we can interpret $\gamma_w^d$ for that sentence as the relative importance of the context word $w$ as compared to the other context words. We have the following corollary, which is an immediate consequence of Proposition 6.
% % \begin{cor}
% %     $u_a^\top v_w > u_b^\top v_w$ if and only if $\sum_{(a,w)} \gamma_{w}^a > \sum_{(b,w)} \gamma_{w}^b$.
% % \end{cor}
% % Lastly, we incorporate position encodings. In this case, the loss can be written as
% % \begin{equation*}
% %     \begin{split}
% %         &-\sum_{j=1}^S \frac{\exp\left( \frac{(v_{a_j} + p_j)^\top W^{KQ} (v_{|V|+1} + p_m)}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{(v_{a_j} + p_j)^\top W^{KQ} (v_{|V|+1} + p_m)}{\sqrt{d_w}} \right)} (u_b)^\top (v_{a_j} + p_j)  \\
% %         &+ \log\left( \sum_{k=1}^{|V|} \exp\left( \sum_{j=1}^S\frac{\exp\left( \frac{(v_{a_j} + p_j)^\top W^{KQ} (v_{|V|+1} + p_m)}{\sqrt{d_w}} \right)}{\sum_{j=1}^S \exp\left( \frac{(v_{a_j} + p_j)^\top W^{KQ} (v_{|V|+1} + p_m)}{\sqrt{d_w}} \right)} (u_k)^\top (v_{a_j} + p_j) \right)\right).
% %     \end{split}
% % \end{equation*}
% % We now have the following proposition.
% % \begin{prop}
% % Consider the CBOW with attention and position encodings. Using a similar approximation technique as above without invoking the Cauchy-Schwarz inequality, and assuming a sufficiently large embedding dimension, we have
% % \begin{equation*}
% %     \begin{split}
% %         0 &= -|V| \sum_{\ell = 1}^L \sum_{j=1}^S (\alpha_j^\ell) \mathds{1}(b_\ell = c \land a_{\ell_j} = w) + \sum_{\ell = 1}^L \sum_{j=1}^S  (\alpha_j^\ell) \mathds{1}(a_{\ell_j} = w) \\
% %         &+ \sum_{\ell = 1}^L \sum_{j=1}^S (\alpha_j^\ell)^2 \mathds{1}(a_{\ell_j} = w) \left(u_c^\top v_w + u_c^\top p_j\right)\\
% %         &+ \sum_{\ell = 1}^L \sum_{j_1 < j_2}(\alpha_{j_1}^\ell) (\alpha_{j_2}^\ell) \mathds{1}(a_{\ell_{j_1}} = a_{\ell_{j_2}} = w) \left(2(u_c^\top v_w) + u_c^\top p_{j_1} + u_c^\top p_{j_2}\right)  \\
% %         &+ \sum_{\ell = 1}^L \sum_{j_1 < j_2}(\alpha_{j_1}^\ell) (\alpha_{j_2}^\ell) \mathds{1}(a_{\ell_{j_1}} = w \land a_{\ell_{j_2}} \neq w) \left(u_c^\top v_{a_{\ell {j_2}}} + u_c^\top p_{j_2}\right) \\
% %         &+ \sum_{\ell = 1}^L \sum_{j_1 < j_2}(\alpha_{j_1}^\ell) (\alpha_{j_2}^\ell) \mathds{1}(a_{\ell_{j_1}} \neq w \land a_{\ell_{j_2}} = w) \left(u_c^\top v_{a_{\ell {j_1}}} + u_c^\top p_{j_1}\right)
% %     \end{split}
% % \end{equation*}
% % and
% % \begin{equation*}
% %     0 =  -|V| \sum_{\ell=1}^L (\alpha_p^\ell) \mathds{1}(b_\ell = c) + \sum_{\ell = 1}^L \left(\alpha_p^\ell + \sum_{j=1}^S (\alpha_p^\ell) (\alpha_j^\ell) (u_c^\top v_{a_{\ell j}} + u_c^\top p_j)\right),
% % \end{equation*}
% % where the notations are defined below.
% % \end{prop}
% % \begin{proof}
% % Let $L$ denote the number of distinct input-output pairs. For example, a sentence $p, p, q, r$ in which the first word is masked corresponds to the pair $((\texttt{[MASK]}, p, q, r), p)$. For each $\ell \in \{1,2,\cdots,L\}$, let $a_{\ell k}$ denote the word in $k$-th position, and $b_\ell$ denote the output word. Also, define $\alpha_j^\ell$ as the weight of the word in $j$-th position in pair $\ell$. That is,
% % \begin{equation*}
% %     \alpha_j^\ell := \frac{\exp\left( \frac{(v_{a_{\ell j}} + p_j)^\top W^{KQ} (v_{|V|+1} + p_m)}{\sqrt{d_w}} \right)}{\sum_{j'=1}^S \exp\left( \frac{(v_{a_{\ell j'}} + p_{j'})^\top W^{KQ} (v_{|V|+1} + p_m)}{\sqrt{d_w}} \right)}.
% % \end{equation*}
% % Using the derivations as above, the loss for a particular pair $\ell$ can be written as
% % \begin{equation*}
% % \begin{split}
% %     &-\sum_{j=1}^S (\alpha_j^\ell) (u_{b_\ell})^\top (v_{a_{\ell j}} + p_j) + \frac{1}{|V|} \sum_{k=1}^{|V|} \left(\sum_{j=1}^S (\alpha_j^\ell) (u_k)^\top (v_{a_{\ell j}} + p_j)\right) \\
% %     &+ \frac{1}{2|V|} \sum_{k=1}^{|V|} \left(\sum_{j=1}^S (\alpha_j^\ell) (u_k)^\top (v_{a_{\ell j}} + p_j)\right)^2.
% % \end{split}
% % \end{equation*}
% % Summing this over all pairs and extracting only terms containing $u_c^\top v_w$, we have
% % \begin{equation*}
% % \resizebox{.90\hsize}{!}{
% %     \begin{split}
% %         \ell(w,c) &= -\sum_{\ell = 1}^L \sum_{j=1}^S (\alpha_j^\ell) \mathds{1}(b_\ell = c \land a_{\ell_j} = w) u_c^\top v_w + \frac{1}{|V|} \sum_{\ell = 1}^L \sum_{j=1}^S  (\alpha_j^\ell) \mathds{1}(a_{\ell_j} = w) u_c^\top v_w \\
% %         &+ \frac{1}{2|V|} \sum_{\ell = 1}^L \sum_{j=1}^S (\alpha_j^\ell)^2 \mathds{1}(a_{\ell_j} = w) \left((u_c^\top v_w)^2 + 2(u_c^\top v_w)(u_c^\top p_j)\right)\\
% %         &+ \frac{1}{|V|} \sum_{\ell = 1}^L \sum_{j_1 < j_2}(\alpha_{j_1}^\ell) (\alpha_{j_2}^\ell) \mathds{1}(a_{\ell_{j_1}} = a_{\ell_{j_2}} = w) \left((u_c^\top v_w)^2 + (u_c^\top v_w)(u_c^\top p_{j_1} + u_c^\top p_{j_2})\right)  \\
% %         &+ \frac{1}{|V|} \sum_{\ell = 1}^L \sum_{j_1 < j_2}(\alpha_{j_1}^\ell) (\alpha_{j_2}^\ell) \mathds{1}(a_{\ell_{j_1}} = w \land a_{\ell_{j_2}} \neq w) \left((u_c^\top v_w)(u_c^\top v_{a_{\ell {j_2}}} + u_c^\top p_{j_2})\right) \\
% %         &+ \frac{1}{|V|} \sum_{\ell = 1}^L \sum_{j_1 < j_2}(\alpha_{j_1}^\ell) (\alpha_{j_2}^\ell) \mathds{1}(a_{\ell_{j_1}} \neq w \land a_{\ell_{j_2}} = w) \left((u_c^\top v_w)(u_c^\top v_{a_{\ell {j_1}}} + u_c^\top p_{j_1})\right).
% %     \end{split}}
% % \end{equation*}
% % Similarly, summing the loss over all pairs and extracting only terms containing $u_c^\top p_p$, we have
% % \begin{equation*}
% %     \begin{split}
% %         \tilde{\ell}(c,p) &=  -\sum_{\ell=1}^L (\alpha_p^\ell) \mathds{1}(b_\ell = c) u_c^\top p_p + \frac{1}{|V|} \sum_{\ell=1}^L (\alpha_p^\ell) u_c^\top p_p \\ 
% %         &+ \frac{1}{2|V|} \sum_{\ell = 1}^L (\alpha_p^\ell)^2 \left((u_c^\top p_p)^2 + 2(u_c^\top p_p)(u_c^\top v_{a_{\ell p}})\right) \\
% %         &+ \frac{1}{|V|} \sum_{\ell=1}^L \sum_{j \neq p} (\alpha_p^\ell) (\alpha_j^\ell) (u_c^\top p_p) (u_c^\top v_{a_{\ell j}} + u_c^\top p_j).
% %     \end{split}
% % \end{equation*}
% % Taking the derivatives of $\ell$ and $\tilde{\ell}$ with respect to $u_c^\top v_w$ and $u_c^\top p_p$, respectively, and setting them to zero completes the proof.
% % \end{proof}

% \section{Exponential family embeddings and continuous bag of words}
% \label{sec:app-exp-fam-and-cbow}
% Let our data be $x = x_{1:I}$, where $x_i \in \mathbb{R}^D$. For each $i$, let $c_i \subseteq [I] - \{i\}$ denote its context. Each data point is modeled conditional on its context, i.e., $x_i | x_{c_i} \sim \textrm{ExpFam}(\eta_i(x_{c_i}), t(x_i))$. Consider the linear embedding $\eta_i(x_{c_i}) = f_i \left(\rho[i]^\top \sum_{j \in c_i} \alpha[j] x_j \right)$
% for some link function $f_i$, center embedding $\rho[i] \in \mathbb{R}^{K \times D}$ and context embedding $\alpha[i] \in \mathbb{R}^{K \times D}$, where $K$ is the embedding dimension. This general formulation is referred to as exponential family embeddings \citep{rudolph2016exponential}. 

% CBOW is a special case of the formulation above, where (1) $x_i \in \{0,1\}^D$ is the one-hot encoding of the word in position $i$; (2) $c_i = \{ j \neq i, i - w \leq j \leq i + w, 1 \leq j \leq I\}$ where $w$ is the window size; (3) we have the same center and context embeddings for each position, i.e., $\rho[i] = \rho$ and $\alpha[i] = \alpha$; (4) for each position $i$, $\eta_i := \eta_i(x_{c_i}) = \rho^\top \alpha \sum_{j \in c_i} x_j \in \mathbb{R}^D$; and (5) the distribution $p(x_i | x_{c_i})$ is given by $\textrm{Cat}(\eta_i)$. 

% \section{Applications of attention-based models for non-text data with sequential information}
% \label{sec:app-prob-appl}
% We propose two applications of attention-based models for non-text data in the presence of sequential information.

% \textbf{User-item rating data.} Consider a classical problem of modeling the rating a particular user gives to an item, given some (user, item) ratings. Without access to the order of items consumed by each user, one possible way to model this is to use Poisson factorization \citep{gopalan13scalable}. Concretely, we have $r_{ui} \sim \textrm{Poisson}(\theta_u^\top \beta_i)$, where $u \in [U]$ denotes a user, $i \in [I]$ denotes an item, and $r_{ui} \in [R]$ denotes the rating given by user $u$ to item $i$. 

% When such information is present, we can use MLM to capture it. Take any user $u$ and let their chronological item-rating history be $(i_1, r_1), \cdots, (i_t, r_t)$. We can decompose the probability of observing such a history from user $u$ into
% \begin{equation*}
%      p((i_1, r_1), \cdots, (i_t, r_t) | u) = p((i_1, \cdots, i_t)|u) p(r_1 | i_1, u) \cdots p(r_t | i_t, u),
% \end{equation*}
% where we used MLM to model the first term and Poisson factorization to model the remaining terms. Using a vanilla MLM, we implicitly have $p((i_1, \cdots, i_t)|u) = p((i_1, \cdots, i_t))$ since the embedding of a token is the same for each sentence that contains it. In this case, we can use the same item embeddings for both models to reduce the number of parameters. In general, we can have $p((i_1, \cdots, i_t)|u)$ depend on $u$, which corresponds to personalized item embeddings for each user.

% \textbf{Market basket data.} Suppose we have data on baskets of items purchased by customers in a grocery store, with no information on basket-level order of items. Let $N$ and $T$ be the number of items and baskets, respectively. For each $i = (n,t) \in [N] \times [T]$, let $X_i$ denote the quantity of items $n$ in basket $t$. \textit{p-emb} \citep{rudolph2016exponential} models $X_i$ conditional on its context $X_{c_i}$, i.e., other items from the same basket. Concretely, we have $X_{i=(n,t)} | X_{c_i} \sim \textrm{Poisson}\left( \exp \left( \sum_{m=1; m \neq n}^N \rho_n^\top \alpha_m X_{(m,t)} \right) \right)$, where $\rho_n$ and $\alpha_n$ are the center and context embeddings of item $n$, respectively.

% Now, suppose that the order of items in each basket is given. For each position $i \in [N]$ in a basket, we introduce a position encoding $p_i$. Items not in the basket do not have an order, and are assigned a position encoding $p_0$. We can follow the masking process in \Cref{sec:der-mlm-obj} and set the masked $X_i$ to be 0. In order to account for the item order information, we replace each $\alpha_m$ with $\overline{\alpha}_m$, obtained from $\alpha_m$ by adding the corresponding position encoding and applying self-attention. Our model now becomes
% \[X_{i=(n,t)} | X_{c_i} \sim \textrm{Poisson}\left( \exp \left( \sum_{m=1; m \neq n}^N \rho_n^\top \overline{\alpha}_m X_{(m,t)} \right) \right).\]



% \section{Modeling user-item rating data with per-user sequential item consumption information}
% \label{sec:app-user-item-model}
% Without such information, one possible way to model user-item rating data is to use Poisson factorization \citep{gopalan13scalable}. Concretely, we have $r_{ui} \sim \textrm{Poisson}(\theta_u^\top \beta_i)$, where $u \in [U]$ denotes a user, $i \in [I]$ denotes an item, and $r_{ui} \in [R]$ denotes the rating given by user $u$ to item $i$. When such information is present, we can use MLM to capture it. Take any user $u$ and let their chronological item-rating history be $(i_1, r_1), \cdots, (i_t, r_t)$. We can decompose the probability of observing such a history from user $u$ into
% \begin{equation*}
%      p((i_1, r_1), \cdots, (i_t, r_t) | u) = p((i_1, \cdots, i_t)|u) p(r_1 | i_1, u) \cdots p(r_t | i_t, u),
% \end{equation*}
% where we used MLM to model the first term and Poisson factorization to model the remaining terms. Using a vanilla MLM, we implicitly have $p((i_1, \cdots, i_t)|u) = p((i_1, \cdots, i_t))$ since the embedding of a token is the same for each sentence that contains it. In this case, we can use the same item embeddings for both models to reduce the number of parameters. In general, we can have $p((i_1, \cdots, i_t)|u)$ depend on $u$, which corresponds to personalized item embeddings for each user.

\putbib[attention-uai]
\end{bibunit}

% References
\bibliography{attention-uai}

\end{document}


% KEEP THIS for LATEX-TOOLS in SublimeText, as it does not recognize bibunit
% IT WILL NOT APPEAR IN THE PDF
\bibliography{attention-uai.bib}