
\documentclass{article} % For LaTeX2e
\usepackage{iclr2023_conference,times}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{hyperref}
\usepackage{url}
\usepackage{graphicx}
\usepackage{graphics}
\usepackage{footnote}
\makesavenoteenv{tabular}
\makesavenoteenv{table}
\usepackage{caption}
\usepackage{subcaption}

\usepackage{listings}
\lstdefinestyle{mystyle}{ 
    commentstyle=\color{codegreen},
    % keywordstyle=\color{magenta},
    numberstyle=\tiny\color{codegray},
    stringstyle=\color{codepurple},
    basicstyle=\ttfamily\footnotesize,
    breakatwhitespace=false,         
    breaklines=true,                 
    captionpos=b,                    
    keepspaces=true,                
    showspaces=false,                
    showstringspaces=false,
    showtabs=false,                  
    tabsize=2
}
\lstset{style=mystyle}


\title{Complete Likelihood Objective for Latent Variable Models}

% Authors must not appear in the submitted version. They should be hidden
% as long as the \iclrfinalcopy macro remains commented out below.
% Non-anonymous submissions will be rejected without review.

\author{Mikhail Arkhipov, Maria Vikhreva\\
Moscow, Russia\\
\texttt{\{arkhipovmu, mary.vikhreva\}@gmail.com}\\
% \AND
% Maria Vikhreva \\
% Moscow, Russia \\
% \texttt{mary.vikhreva@gmail.com}\\
}
% \author{Antiquus S.~Hippocampus, Natalia Cerebro \& Amelie P. Amygdale \thanks{ Use footnote for providing further information
% about author (webpage, alternative address)---\emph{not} for acknowledging
% funding agencies.  Funding acknowledgements go at the end of the paper.} \\
% Department of Computer Science\\
% Cranberry-Lemon University\\
% Pittsburgh, PA 15213, USA \\
% \texttt{\{hippo,brain,jen\}@cs.cranberry-lemon.edu} \\
% \And
% Ji Q. Ren \& Yevgeny LeNet \\
% Department of Computational Neuroscience \\
% University of the Witwatersrand \\
% Joburg, South Africa \\
% \texttt{\{robot,net\}@wits.ac.za} \\
% \AND
% Coauthor \\
% Affiliation \\
% Address \\
% \texttt{email}
% }

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to \LaTeX{} to determine where to break
% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
% puts 3 of 4 authors names on the first line, and the last on the second
% line, try using \AND instead of \And before the third author name.

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% \iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}


\maketitle

\begin{abstract} 
In this work, we propose an alternative to the Marginal Likelihood
(MaL) objective for training latent variable models, Complete Latent
Likelihood (CoLLike). We analyze the objectives from the perspective of
matching joint distributions. We show that MaL corresponds to a particular $KL$
divergence between some target \emph{joint} distribution and the model joint.
Furthermore, the properties of the target joint explain such major malfunctions
of MaL as uninformative latents (posterior collapse) and high deviation of
the aggregated posterior from the prior. In the CoLLike approach, we use a sample
from the prior to construct a family of target joint distributions, whose
properties prevent these drawbacks. We utilize the complete likelihood both to
choose the target from this family and to learn the model. We confirm our
analysis by experiments with low-dimensional latents,
which also indicate that it is possible to achieve high accuracy unsupervised
classification using CoLLike objective.

% In this work, we propose an alternative to the Marginal Likelihood (MaL)
% objective for training latent variable models. We treat a sample from the
% prior as a set of ground truth targets with an unknown pairing to the observed
% variables. The pairings span a set of possible target joint distributions
% which have desired marginals both observed and latent domains. 
% We propose Complete Latent Likelihood (CoLLike) 
% objective, which is the regular complete likelihood with extra dependence on 
% pairing.
% % The only difference with the regular complete likelihood is in optimization 
% % both with respect to parameters and possible assignments of the observed
% % variables to latent ones.
% % This assumption grants access to the complete likelihood, which is
% % regularly available only in supervised settings. We maximize the complete
% % likelihood both with respect to parameters and possible assignments of
% % visible variables to latent ones.
% We show that both MaL and CoLLike are aimed at minimization of $KL$ 
% divergence between some target \emph{joint} distribution and the model joint.
% Analysis of the target joints reveals motivation of such major malfunction
% of MaL as posterior collapse and high divergence of latent marginals.
%  also explains absence of those drawbacks for CoLLike.
% % Furthermore, while MaL picks one target joint, CoLLike optimizes over an
% % entire family of possible joints. All members of this family maintain
% % such desirable properties as high mutual information and prior matching.
% % In turn, the target joint form is responsible for such MaL malfunctions
% % as posterior collapse and high discrepancy between the aggregated posterior 
% % and the prior.
% % Using this perspective, we analyze why MaL is prone to posterior collapse
% % and mismatch between aggregated posterior and prior, while CoLLike is not.
% We confirm our analysis by experiments with low-dimensional discrete and
% continuous latents. Furthermore, we show that it is possible to achieve 
% high accuracy unsupervised classification using CoLLike objective.

% % We show that opposed to marginal likelihood, optimization of complete likelihood
% % allows to learn models with informative latent variables and low divergence
% % of aggregated posterior with prior. Along with empirical evidence, we explain
% % why complete likelihood exhibits superior performance from the probabilistic 
% % perspective.
% % In our work we show that optimization of the marginal likelihood is prone
% % to such failures as high mismatch between aggregated posterior and prior or
% % uninformative latents. In turn, training with complete likelihood objective 
% % promotes high mutual information between observed and latent variables 
% % and marginal distribution matching in the latent domain. We provide not 
% % only empirical evidence to support these statements, but also explain 
% % why this happens from probabilistic perspective. 
% % Furthermore, we show that 
% % complete likelihood can be applied seamlessly to discrete latent variables.
% %
% % (or even to priors that supports only sampling)
% % We provide both theoretical and empirical evidence that complete likelihood
% % tackles major challenges that arise during optimization of the marginal likelihood, 
% % namely, uninformative latents, discrepancy between the aggregated posterior and 
% % the prior, and gradient estimation with respect to sampling procedure. 
% % Empirical results
% % From the practical standpoint, we show how to use complete likelihood to achieve
% % high accuracy unsupervised classification. % on text modality
% % comparable with supervised techniques that relies on hundreds of labeled samples 
% % and top performing pre-trained architectures. 


% % ∇ Maximum likelihood approaches are dominant in supervised problems and widely
% % used for unsupervised problems. 
% % 
% % - appear without justification
% % - some handcrafted empirical risk (loss functions) functions used in the 
% %   can be explained and justified using maximum likelihood framework
% % ∇ Strengths: 
% % - no problems with discrete (and samplable) latents
% % - informative latents
% % - low aggregated posterior prior divergence
% % ∇ Link OT and Maximum Likelihood
% % || EPA simplifies gradient estimation etc.


\end{abstract}


\section{Introduction}
\label{introduction}

% TODO: representations is not the only task for LVMs
% Motivation of the marginal likelihood
% Another solution proposed by CoLOLate
% In the latent variable the common choice of the objective is marginal likelihood.
% We can get to this solution by the following steps: 

In the latent variable setting, the model defines a joint distribution over both 
observed variables $x$ and latent variables $z$, while the training data contains 
only observed variables. The problem can be viewed as one in which the target 
conditional distribution $z|x$ is unknown.
There are at least two possible solutions to this problem:
try to come up with a meaningful target $z|x$ distribution and train the model 
similar to a supervised setting, or give up and 
focus on matching only marginals in the $x$ domain. The latter is the choice of 
the MaL objective. In this work, we follow the former approach. 
However, instead of picking up a single target conditional we construct an 
entire family of possible distributions and use the model likelihood to decide
which conditional to use as a target.

To construct a family of possible conditionals we use a sample from the prior of 
the same size as the dataset in the observed domain.
% We use a sample from the prior of the same size as the dataset in the observed 
% domain. 
All possible assignments of observed samples to latent ones span a family of 
empirical joint distributions. This can be represented as permutations of the
latent samples. Despite the size of the permutations set being tremendous and 
growing as factorial of the dataset size, the search of the permutation 
with the best likelihood can be done efficiently using combinatorial optimization.
The resulting optimization procedure resembles expectation maximization algorithm 
\cite[]{dempster1977em}, where expectation is replaced with the combinatorial
assignment problem. Furthermore, since the proposed algorithm uses gradient-free
optimization for obtaining the target distribution, the objective can be seamlessly
applied to both continuous and discrete latent variables, while the discrete latents 
case is challenging for approaches based on the MaL~\cite[]{mnih-nvil-14, mnih-vimco-16, tucker-rebar-17}.
% In this approach, we entirely rely on the model inductive biases for learning the 
% joint distribution. 

% Joint perspective
% Show that marginal likelihood objective along with its typical variational 
% approximation correspond to a specific choice of the target joint distributions.
We analyze the objectives from the perspective of matching \emph{joint} 
distributions. We show that MaL corresponds to a specific choice 
of the target $z|x$ conditional, while our approach takes into consideration 
family of possible conditionals. In MaL case, the choice of the target
conditional is responsible for two major failures that arise during training
with the MaL objective: inability to learn informative latents
% TODO: consider adding Blei
also known as ``posterior collapse''~\cite[]{BowmanVVDJB16, RazaviOPV19, He-lagging-19} and divergence between the prior and the
aggregated posterior \cite[]{hoffman2016elbo, makhzani2015adversarial, zhao2019infovae, kim2018disentangling}. These characteristics are vital for latent variable models
because posterior collapse prevents learning meaningful representation and samples 
from the regions of high deviation of the latent marginals are subjected to severe 
quality degradation \cite[]{rosca2018distribution}.
The form of the target joint also motivates the success of the complete likelihood 
in these challenges. Namely, the target distribution for CoLLike has high mutual
information and matches the prior.

We verify our analysis with experiments. In this work, we focus on low-dimensional
latent variables to perform direct comparison with exact MaL. Models trained 
with CoLLike 
stably maintain high mutual information and low divergence with the prior. In turn,
MaL inevitably leads to either posterior collapse or highly divergent 
aggregated posterior. Previously, for simple linear models it has been shown that posterior collapse takes place during optimization of the exact 
likelihood \cite{lucas-dontblameelbo-19}. Our
experiments demonstrate that it can as well happen with expressive models trained with exact likelihood.
Along with informativeness and latent distribution matching, CoLLike indicates
no degradation of likelihood compared to MaL.
Furthermore, we show that CoLLike objective alone can achieve high accuracy 
in unsupervised classification. However, the resulting variance of the accuracy 
is high. We propose a latent ensembling algorithm to tackle this issue, 
which not only stabilizes but also significantly increases accuracy. 
% marginal likelihood objective leads to models with high divergence of aggregated
% posterior and prior along 
% Desired characteristics 
% Collapse for MaL
% Unsupervised classification
% Latent ensembling

We show that CoLLike unifies a range of existing 
approaches that lack probabilistic justification. Constrained
K-means \cite[]{bradley2000constrained}, Permutation Invariant Training \cite[]{Yu-pit-17, Luo-convtasnet-19}, and Noise as Target \cite{bojanowski2017unsupervised} are among these
approaches. 
This allows extending them to different factorizations of the joint and
performing analysis from the probabilistic perspective.
Furthermore, CoLLike bridges likelihood and optimal transport (OT) 
frameworks. From this perspective, the negative likelihood plays
the role of both mapping from latent to visible domain and distance function.

% TODO: may be rewrite last parts
In summary, we propose a new objective that, compared to MaL, makes it possible to
effectively train models with such desirable properties as informative
latents and matching in the latent domain while maintaining similar 
likelihood levels. We propose a joint perspective on the objectives
that explains MaL and CoLLike properties observed in experiments. We show 
that the proposed objective unifies and explains a range of existing
approaches from probabilistic perspective. Besides,
we show that it is possible to achieve high quality unsupervised 
classification using CoLLike and the proposed latent ensembling.

% it connects likelihood with optimal transport, by showing that
% the model plays role of both mapping from $z$ to $x$ domain and distance function.

% We are given a sample in $x$ domain and a model that defines a joint distribution
% over variables $x$ and $z$. As regular, we want our model to mimic the distribution
% of the sample. To do so, we define some measure of discrepancy and minimize it. 
% Likelihood-based approaches rely on minimization of $KL$ divergence between two
% distributions, however, in our case the distributions are defined on different 
% but intersecting domains.  

\section{Complete Likelihood Objective} 

In the supervised setting, we are given a model $p_\theta(x, y)$ with 
parameters $\theta$ and a dataset represented by a
collection of samples $\{ (x_1, y_1), ..., (x_N, y_N) \}$. To mimic the 
data distribution with the model, a reasonable
objective to learn is complete likelihood:
\begin{equation}
    \mathcal{L}(\theta) = \sum_{i = 1}^N \log p_\theta(x_i, y_i)
    \label{eq:cl}
\end{equation}
% Complete likelihood is motivated by the divergence $KL(p_\delta(x, y) || 
% p_\theta(y,z))$, where the data empirical joint $p_\delta(x, y) = p_\delta(x) 
% p_\delta(y|x)$ now
% includes the conditional $p_\delta(z|x)$, which assigns equal probability mass
% to every $x, y$ pair in the dataset.
The motivation behind complete likelihood comes from the equivalence of maximizing 
(\ref{eq:cl}) and minimizing the Kullback–Leibler divergence
$KL(p_\delta(x, y) || p_\theta(x, y))$ 
which measures the discrepancy between the target empirical data distribution
$p_\delta(x, y)$\footnote{We 
find the Greek letter $\delta$ especially suitable for data distribution because 
it is consonant with ``data'' and reflects the delta-function-like form of the
empirical distribution.} and the model distribution $p_\theta(x, y)$ \cite[4.2.2]{murphy2022probabilistic}.

In the regular latent variable setting, we are given a dataset 
$\{x_1, ..., x_N\}$ and the model $p_\theta(x, z) = p_\theta(x|z) p(z)$.
Missing labels $z$ can be treated as missing $p_\delta(z|x)$ part of
the target joint.
If we cannot come up with a reasonable $z|x$ target we can at least 
match the marginals in the observed domain with $KL(p_\delta(x) || p_\theta(x))$
in hope that the model will learn an informative relation
between $x$ and $z$. This is equivalent to the maximization of MaL:
% The task is to find
% $\theta$ that makes the distribution $p_\theta(x)$ similar to the distribution
% of the samples while maintaining an informative relationship between $x$ and $z$
% induced by $p_\theta(x,z)$.
% This task is commonly approached by maximization of the data marginal likelihood:
\begin{equation} \label{eq:mal}
    \mathcal{L}_{MaL}(\theta) = \sum_{i = 1}^N \log p_\theta(x_i) =  \sum_{i =
    1}^N \log  \int p_\theta(x_i, z) dz
\end{equation}
In general, we cannot compute $p_\theta(x)$ exactly due to the integration
operation. This fact leads to an abundance of approximation techniques,
which in the majority are aimed at getting better estimates of $p_\theta(x)$ 
\cite[]{hoffman-svi-13, kingma2014vae, mnih-nvil-14, salimans-mcmcmi-15, burda-iwae-15}
or $\nabla_\theta p_\theta(x)$ \cite[]{ruiz-unbiased-21}. 

% Complete likelihood is much easier to optimize since it requires no
% integration with respect to $y$. However, it relies on the ground truth
% targets $y_i$ for each $x_i$ which are not available in the previous scenario.

% TODO: INFOMAX
Despite the family of all possible target $p(z|x)$ distributions being tremendous 
we do not need to consider it entirely. 
% Despite we do not know exactly what
% $p(z|x)$ we are looking for, we know at least some properties it should 
% possess.
Firstly, the marginal of the target joint in the latent 
domain should match the prior $p(z)$. Secondly, as in any real-world 
dataset, it should confidently assign a label $z$ to each $x$ in the dataset. 
It is not hard to get a rich family of distributions with such properties.
We can obtain a collection $\{z_1, ..., z_N\}$ by sampling from the prior 
and pair this collection with the dataset $\{x_1, ..., x_N\}$. Sampling
from the prior ensures the first requirement, while the assignment of a single $z$
to each $x$ assures the second. We express each pairing as some 
permutation $\pi$, which produces a complete collection 
$\{(x_1, z_{\pi(1)}), ..., (x_N, z_{\pi(N)}) \}$ and empirical joint 
$p_{\delta\pi}(x, z) = p_\delta(x) p_\pi(z|x)$.
% of the sample from the prior that defines an 
% empirical distribution $p_{\delta\pi}(x, z) = p_\delta(x) p_\pi(z|x)$. 
Given a family of distributions we need to decide which member of the 
family is our target. We propose to pick the one with the highest complete 
likelihood relying on the model inductive biases. For this target we 
optimize once again the complete likelihood of the $(x_i, z_{\pi^*(i)})$ pairs 
with the optimal permutation $\pi^*$. These considerations lead us to the CoLLike
objective:
% We can try to analyze what kind of distributions we are looking for.
% To make the latent variable setting similar to the supervised one we extend
% it with the Empirical Prior Assumption (EPA): we sample $\{z_1, ..., z_N\}$
% from $p(z)$ and assume that they are ground truth labels for $\{x_1, ...,
% x_N\}$ with unknown pairing. Expressing the pairing as a permutation $\pi$ we
% propose to maximize the following objective with respect to both $\theta$
% and\,$\pi$:
\begin{equation}
\label{eq:colike}
    \mathcal{L}_{CL}(\theta, \pi) = \sum_{i = 1}^N \log p_\theta \left(x_i,
    z_{\pi(i)} \right)
\end{equation}
which we maximize both with respect to $\theta$ and $\pi$. 
An alternative view on the objective can be the following: we sample 
$z$ values from the prior and assume that they are ground truth targets for 
the training dataset with unknown pairing. 
Figure~\ref{fig:main} depicts the main difference between the objectives: 
CoLLike maximizes specific points of the joint distribution, while MaL
is aimed at maximization of whole lines along the joint.
\begin{figure}
    \centering
    \includegraphics[width=0.6\linewidth]{main_pic}
    \caption{Illustration of the CoLLike (left) and MaL (right) objectives. 
    Double circles and bold lines indicate areas of the joint to be maximized
    and circle filling represents likelihood.}
    \label{fig:main}
\end{figure}
% which is essentially a complete likelihood under the permutation $\pi$. We
% call this objective CoLLike, which stands for Complete Latent Likelihood.
% CoLLike objective can be written in terms of expectations, which will be
% useful for objective analysis:
% \begin{equation*}
%     \mathcal{L}_{CL} (\theta, \pi) = \mathbb{E}_{x \sim
%     p_\delta(x)}\mathbb{E}_{z \sim p_\pi(z|x)} \left [ \log p_\theta(x, z)
%     \right ]
% \end{equation*}
% where $p_\delta(x)$ is an empirical data distribution, and $p_\pi(z|x)$ is an empirical latent distribution
% produced by permutation $\pi$. Note that $p_\pi(z|x)$ is non-zero only when $x
% = x_i$ and $z = z_{\pi(i)}$.
% TODO: consider adding the expectation form of the objective (with p_\epsilon
% factorization also)

\section{Objective Analysis}
% \begin{table}[t]
% \caption{Joint $KL$ forms of the considered objectives.}
% \label{table:kl-forms}
% \begin{center}
% \begin{tabular}{lll}
% \multicolumn{1}{c}{\bf CoLLike} &\multicolumn{1}{c}{\bf Marginal Likelihood}
% &\multicolumn{1}{c}{\bf ELBO}
% \\ \hline \\ $KL(p_\delta(x) p_\pi(z|x) || p_\theta(x, z))$ & $KL(p_\delta(x)
% p_\theta(z|x) || p_\theta(x, z))$ & $KL(p_\delta(x) q_\phi(z|x) || p_\theta(x,
% z))$  \\
% \end{tabular}
% \end{center}
% \end{table}
% TODO: check elbo
% \textcolor{orange}{ $\odot $ We build our analysis on top of these forms}
% \textcolor{orange}{ $\odot $ We express }
We start our analysis by proving that MaL corresponds to matching of a specific 
joint distribution and the model joint:
\begin{align*} &KL(p_{\delta}(x) p_\theta(z |x) || p_\theta(x, z)) =
    \mathbb{E}_{x, z \sim p_{\delta}(x) p_\theta(z|x)} \left [\log
    \frac{p_{\delta}(x) p_\theta(z|x)}{p_\theta(x) p_\theta(z|x)} \right] 
    = \mathbb{E}_{x \sim p_{\delta}(x)}  \left [  \log
    \frac{p_{\delta}(x)}{p_\theta(x)} \right ] \\ &= \mathbb{E}_{x \sim
    p_{\delta}(x)} \left[ \log p_\delta(x) \right] - \mathbb{E}_{x \sim
    p_{\delta}(x)} \left[ \log p_\theta(x) \right] = C - \frac{1}{N} \sum_i
    \log p_\theta(x_i) = C - \frac{1}{N}
    \mathcal{L}_{MaL}(\theta)
\end{align*}
where $C$ is a constant. The joint $KL$ form of the MaL brings new perspectives
on the objective. It might be tempting to think about MaL as a workaround 
for unknown latents that allows you not to specify the target $z|x$ conditional.
However, the joint form reveals that the target conditional is actually 
specified and equals $p_\theta(z|x)$ if we ask what distribution we want to 
mimic. This implies that we are aimed at keeping the model posterior unchanged.
In addition, the form also highlights the intimate connection between MaL and posterior.

CoLLike and a common variational \cite[]{jordan-variational-99} approximation of MaL, 
Evidence Lower Bound (ELBO), can also be expressed as $KL$ divergences 
between joint distributions (see Table~\ref{table:kl-forms}). 
We refer to Appendix \ref{appendix:a} for derivation of the equivalence. 
Note the elegant
similarity between objectives which becomes obvious in the joint $KL$ form. 
All divergences share the model $p_\theta(x, z)$ as the second argument, which
implies that the first argument is the target \emph{joint} distribution. 
For all objectives the target joint contains the data distribution
$p_\delta(x)$ as a marginal in $x$ domain, thus the only difference is
in the target $z|x$ conditional. Therefore, all considered objectives belong
to the same family of the form:
\begin{align*}
     \mathcal{L}(\theta) &= KL(p_\delta(x) p(z|x) || p_\theta(x, z)) =
    %  \addtocounter{equation}{1}\tag{\theequation}\label{eq:kl-family} =
     KL(p_\delta(x) p(z|x) || p_\theta(x) p_\theta(z|x)) \\ &= KL(p_\delta(x)
     || p_\theta(x)) + \mathbb{E}_{p_\delta(x)} \left[  KL(p(z|x) ||
     p_\theta(z|x)) \right]
     \addtocounter{equation}{1}\tag{\theequation}\label{eq:kl-x-marg-plus-expected-cond}
\end{align*}
Since the second term in (\ref{eq:kl-x-marg-plus-expected-cond}) is
non-negative, all objectives in the family are lower bounds on the likelihood
up to an additive constant. Note that the $z|x$ target conditional is used 
to minimize the overall divergence. This affects the second term of
(\ref{eq:kl-x-marg-plus-expected-cond}) to make the lower bound tighter.
% While CoLLike and ELBO use optimization
% to obtain the target $z|x$ conditional, the MaL objective use
% the current model posterior as target and essentially zeroing the expected
% $KL$ between conditionals. Note that for CoLLike and ELBO, optimization of 
% the target conditional makes the lower bound tighter.

% Besides, the overall $KL$ can be minimized
% to zero only when the model marginal distribution equal to the data
% distribution. These arguments can be used to justify the family of objectives
% for learning distributions in $x$ domain. 
% We start the analysis with showing that the proposed objective shares common
% traits with both MaL and its typical variational approximation
% represented by Evidence Lower Bound (ELBO). Firstly, we are able to express
% all these objectives as Kullback-Leibler ($KL$) divergences between joint 
% distributions as done in Table \ref{table:kl-forms}. 
% % Note the elegant
% % similarity between objectives which is not obvious in the original formulation
% % but become demonstrative in the joint $KL$ form. 
% Note the elegant
% similarity between objectives which becomes obvious in the joint $KL$ form. 
% Furthermore, this form is
% especially useful for further analysis and comparison of the objectives

% Add note about MaL (Closer look at the joints): despite the illusion that we
% abandon the joint and focus on the single marginal we still
% Continuing with similarities, all divergences in the table share the model
% \emph{joint} distribution as the second argument. Therefore, the first 
% argument of the $KL$ divergences can be treated as 
% \emph{target joint distribution}, which we want to mimic using our model.

% Problem: decrease in KL can raise divergence in the x domain while
% significantly lowering the expected posterior KL This is a lower bound of
% the likelihood 
% Justified objective to learn joint $p_\delta(x) p(z|x)$

\begin{table}[t]
\caption{Considered objectives and their joint $KL$ forms.}
\label{table:kl-forms}
\begin{center}
\begin{tabular}{ccc}
 &\multicolumn{1}{c}{\bf Original Objective}
&\multicolumn{1}{c}{\bf Joint KL form}
% & Original Objective & Joint $KL$ 
\\[2pt] \hline 
\\[-5pt] {\bf CoLLike} & 
    $\sum_{i = 1}^N \log p_\theta \left (x_i, z_{\pi(i)} \right )$ &
    $KL(p_\delta(x) p_\pi(z|x) || p_\theta(x, z))$  \\[5pt]
{\bf MaL} & $\sum_{i = 1}^N \log p_\theta(x_i)$ &
    $KL(p_\delta(x) p_\theta(z|x) || p_\theta(x, z))$ \\[5pt]
{\bf ELBO\footnote{$q_\phi(z|x)$ is an approximate posterior distribution parametrized by $\phi$.}} & $\sum_{i = 1}^N \mathbb{E}_{z \sim q_\phi(z | x_i)} \left [ \log \frac{p_\theta(x_i, z)}{q_\phi(z|x_i)}   \right]$ &
$KL(p_\delta(x) q_\phi(z|x) || p_\theta(x, z))$  \\
\end{tabular}
\end{center}
\end{table}

% TODO: rewrite x given z as x|z everywhere
% The joint $KL$ form reveals another notable similarity
% between the objectives: the $z|x$ target conditional is used to minimize the
% overall divergence. This affects the second term of
% (\ref{eq:kl-x-marg-plus-expected-cond}). While CoLLike and ELBO use optimization
% to obtain the target $z|x$ conditional, the MaL objective use
% the current model posterior as target and essentially zeroing the expected
% $KL$ between conditionals. Note that for CoLLike and ELBO, optimization of 
% the target conditional makes the lower bound tighter.

% TODO: only model inductive biases for learning representations

% TODO: check equals before and after
Despite the common traits, the objectives are different. We will highlight a few
differences and go deeper in the following sections. Firstly, the target
conditional for CoLLike $p_\pi(z | x)$ is empirical, while its counterparts
$p_\theta(z|x)$ and $q_\phi(z|x)$ are not. Secondly, in MaL approach, we
construct a particular joint distribution $p_\delta(x) p_\theta(z|x)$ and use
it as a target joint, while, in CoLLike, we construct an entire family of joint
distributions with desired properties. Thirdly, the target posterior
is readily available in CoLLike and ELBO cases, while for MaL it could be 
intractable. Lastly, the CoLLike objective allows learning models with 
reverse factorization $p_\theta(x) p_\theta(z|x)$, while MaL and ELBO do not.

\subsection{Mutual Information of the Target Distribution}

% TODO: MI at initialization
% TODO: add lower bound perspective
% TODO: add MI formula, possibly with entropic factorization
Mutual Information (MI) is the key property of the joint distribution in a latent
variable setting. It characterizes how dependent the observed and latent
variables are. We would like to know what MI value our model is targeted at
for each objective. Since each objective can be expressed as a $KL$ divergence
between model and target joint distributions (Table \ref{table:kl-forms}), we
can investigate MI values for each target joint. We define
MI between $x$ and $z$ under $p(x, z)$ distribution as:
\begin{equation}
    MI(p(x, z)) = \mathbb{E}_{x, z \sim p(x, z)} \left[ \log \frac{p(x, z)}{p(x) p(z)}  \right ]
    \end{equation}
For MaL, the MI of the target $p_\delta(x) p_\theta(z|x)$ is determined by the
model's current posterior $p_\theta(z|x)$.
Most models have no class preferences at the initialization, which results in
low MI of $p_\delta(x) p_\theta(z|x)$.
Moreover, we are aimed at keeping it unchanged,
since we are using the current posterior as our target posterior. So, low MI
at the initialization might induce learning non-meaningful factorized joint
throughout the training procedure. Since for ELBO the approximate posterior
approximates the true model posterior, this argument is applicable to ELBO too. 
Furthermore, uninformative posterior is a common problem when learning a latent variable
model \cite[]{BowmanVVDJB16, alemi-fixingbrokenelbo-18, lucas-dontblameelbo-19, RazaviOPV19, He-lagging-19} known as
``posterior collapse''. 

CoLLike target is an empirical joint distribution. It represents a deterministic
mapping and has constantly high MI by construction, as
shown in Appendix \ref{appendix:b}. Therefore, we are aimed at mimicking a
high MI distribution with our model distribution. Furthermore, CoLLike can
be interpreted as a realization of the InfoMax principle \cite{huzar-nat-17}, where 
the prior limits the entropy and the deterministic mapping maximizes MI.

% Furthermore, the CoLLike objective
% is a lower bound on the mutual information between $x$ and $z$ under 
% $p_{\delta \pi}(x, z)$ distribution:
% \begin{align*}
%     MI_{x,z}(p_{\delta \pi}(x, z)) &= \mathbb{E}_{x, z \sim p_{\delta \pi}(x, z)}\left [ \frac{p_{\delta \pi}(x, z)}{p_{\delta}(x) p_{\delta \pi}(z)} \right] \\
%     &= \mathbb{E}_{x, z \sim p_{\delta \pi}(x, z)}\left [ \frac{p_{\delta \pi}(x, z)}{p_\theta(x, z)} \frac{p_\theta(x, z)}{p_{\delta}(x) p_{\delta \pi}(z)}\right] \\ 
%     &= KL(p_{\delta \pi}(x, z) || p_\theta(x, z)) + \mathbb{E}_{x, z \sim p_{\delta \pi}(x, z)}\left [  \frac{p_\theta(x, z)}{p_{\delta}(x) p_{\delta \pi}(z)}\right] \\
%     & \geq \mathbb{E}_{x, z \sim p_{\delta \pi}(x, z)}\left [  \frac{p_\theta(x, z)}{p_{\delta}(x) p_{\delta \pi}(z)}\right] 
%      \addtocounter{equation}{1}\tag{\theequation}\label{eq:mi-lower}
% \end{align*}
% where the last step rely on non-negativity of $KL$ divergence. Maximization
% of (\ref{eq:mi-lower}) is equivalent to maximization of CoLLike because of 
% the constancy of $p_\delta(x) p_{\delta \pi}(z)$.


% If $p_\theta(z|x)$ changes only slightly when $x$ changes, the resulting MI
% of the distribution will be low. Moreover, since the current posterior is
% our target posterior, we enforce the model to keep the posterior
% unchanged\footnote{ Essentially, according to
% \ref{eq:kl-x-marg-plus-expected-cond} we have two learning signal: one that
% promotes marginals in $x$ domain to match and another that enforces to match
% expected $KL$ between conditionals. Both learning signals can modify the
% model joint. When we use the model posterior as the target posterior we drop
% the second learning signal, while the first can modify the entire joint
% since the factorization is $p_\theta(x|z) p(z)$. }. This might result in a
% situation when we initialized the model such that MI of the target is low
% and encourage the model to keep it low.



% This can be interpreted as desire to keep the posterior unchanged
% Variability of the p_\theta(z|x) and the value of MI
%  - is the variability bounded for cat vars
%  - what is the maximum and minimum values then
%  - expected entropy of the conditional (or conditional entropy)

% Moreover, at every step


% VI can be (or can not?) thought as equivalent of MaL when \phi is optimal

% - mutual information of the target is important \newline
% - mutual information for MaL, VI, and CoLLike \newline
% - what is MI for MaL \newline
% - high mutual information can be achieved only if $p_\theta(x|z)$ vary
%   sufficiently while $z$ changes since $MI(p_{\delta\theta}(x, z)) =
%   \mathbb{E}_{x, z \sim p_{\delta\theta}(x, z)} \left[ \log
%   \frac{p_{\delta\theta}(x|z)}{p_\delta(x)} \right]$ \newline
% - Info-Max principle \newline
% - To what extent we mimic $p_\theta$ by picking the target posterior?
%   \newline
% - CoLLike at least has fixed high mutual information while MaL target can be
%   either high MI or low MI.

\subsection{Matching in the Latent Domain}

% ML minimizes the divergence between p(z|x) while collike minimize marginal?
The joint form of the objectives from Table \ref{table:kl-forms} is convenient
for obtaining a perspective on distribution matching in the latent space. After treating
$p_\delta(x) p_\theta(z|x)$ as a joint $p_{\delta \theta}(x, z)$ and rewriting
the original MaL objective as:
% TODO: refer to derivation of the joint form
\begin{gather}
         KL(p_\delta(x) || p_\theta(x)) = KL(p_{\delta \theta}(x, z) ||
 p_\theta(x, z)) =
\mathbb{E}_{x, z \sim p_{\delta \theta}(x|z) p_{\delta \theta}(z)} \left [\log
\frac{p_{\delta \theta}(x|z) p_{\delta \theta}(z)}{p_\theta(x| z) p_\theta(z)}
\right] \nonumber \\
= \mathbb{E}_{z \sim p_{\delta \theta}(z)} \left[ KL \left (p_{\delta \theta}(x|z) ||
p_\theta(x|z) \right) \right ]+ KL(p_{\delta \theta}(z) || p_\theta(z))% \end{split}
\label{eq:latent-mismatch}
\end{gather}
we see that matching in $x$ space requires matching in $z$ space, namely
$KL(p_{\delta \theta}(z) || p_\theta(z)) = 0$, where $p_{\delta\theta}(z)$ is
called the aggregated posterior. This signifies that even though MaL is constructed such that the conditional ($z$ given $x$) part
of the $KL$ between joints is zero, we end up in a situation where none of the
model marginals match the target marginals. Moreover, the learning signal from
the first term of (\ref{eq:latent-mismatch}) might be significantly larger
compared to the second term signal if the dimensions of $x$ and $z$ differ a lot.
This might lead to a sacrifice of the second divergence in favor of the first
one.

% TODO: say about struggle in optimization
% TODO: this motivates researchers to tackle this divergence by adding a 
% a separate loss to the objective.

% This divergence is not zero even if we have access to the true underlying
% data distribution $p(x)$ instead of $p_\delta(x)$ or when we use approximate
% posterior $q_\phi(z|x)$ as a target. There is a question whether the
% divergence in the latent domain play important role in both modelling the
% distribution $p_\theta(x)$ and properties of the joint $p_\theta(x, z)$

Matching in the latent domain is a known challenge of latent
variable modelling \cite[]{hoffman2016elbo}. A mismatch with the prior results in
unnatural samples from areas with high deviation of the aggregated posterior
from the prior \cite[]{rosca2018distribution}. A number of works focus
on this problem. They either utilize additional losses that penalize
discrepancy between marginals
\cite[]{makhzani2015adversarial,zhao2019infovae,kim2018disentangling} or
introduce a learnable prior \cite[]{bauer2019resampled,tomczak2018vampprior}.

% Despite this problem can be corresponded to the finite nature of $p_\delta(x)$
% a range of works [] report unnatural $x$ samples from regions where $p_{\delta
% \theta} (z)$ is lower than model prior $p(z)$. Moreover, attempts to directly
% tackle the divergence of marginals in the latent space are present in [].

% KL between empirical distribution and the distribution it is obtained from
% Cross-entropy? In the limit we approach 0 as the empirical cumulative
% distribution function converge to the distribution it was sampled from
% almost surely.
In turn, CoLLike addresses this problem by constructing a conditional whose
marginal matches the prior in the latent domain.
% While MaL target distribution is constructed to minimize
% divergence between conditional distributions, the CoLLike construction is
% designed to minimize divergence between marginals in the latent domain.
Obviously, the target marginal in the $x$ domain for CoLLike is always
$p_\delta(x)$. In turn, the target aggregated posterior is always the empirical distribution of a sample from
the prior, since $p_{\delta\pi} (z) = \int_x p_\delta(x) p_\pi(z|x) dx =
p_\epsilon(z)$ for all $\pi$ values, where $p_\epsilon(z)$ is the empirical distribution
of the sample drawn from the prior.
While it is intuitively obvious that the
empirical distribution converges to the underlying distribution, one can show
that $KL$ between the empirical sample and the prior converges in probability to
$0$ \cite[Theorem~11.2.1]{thomas2006elements}.

\subsection{Gradient Estimation with Respect to Sampling}

% As neither $p_\theta(x)$ nor $p_\theta(z|x)$ are available in general,
% particularly, we cannot compute their true values or differentiate them, a
% common practice is to apply approximate inference techniques such as
% variational inference.

% Closed form expressions for $p_\theta(x)$ or its gradient (in case of
% application of the gradient based optimization) are available only for limited
% range of relatively simple distributions. Inability to access the marginal
% distribution in the $x$ domain motivates a range of approximation techniques.
% One of such technique is Variation Inference.
% Variational Inference relies on the approximation $q_\phi(z|x)$ of the true
% posterior $p_\theta(z|x)$, which in turn grants access to the
% $p_\theta(x)=p_\theta(x, z) / p_theta(z|x)$. The difference between between
% true and approximate posteriors is minimized through the optimization
% procedure.
The main challenge in optimization of ELBO is to estimate the
gradient with respect to the approximate posterior parameters $\phi$.
Approximate posterior $q_\phi(z|x)$ appears both in the probability ratio and
in the expectation:
\begin{equation} KL(p_\delta(x) q_\phi(z |x) || p_\theta(x, z)) =
\mathbb{E}_{x, z \sim p_\delta(x) q_\phi(z |x)} \left [ \log \frac{p_\delta(x)
q_\phi(z|x)}{p_\theta(x, z)}  \right]
\end{equation}
% Differentiation - assumes that we use gradient based optimization
Expectation is usually estimated by sampling. Differentiation with respect to
the sampling procedure is hard. For a limited range of continuous
distributions, the problem can be solved by reparametrization trick
\cite[]{kingma2014vae,salimans2013fixed}. For other cases, including discrete
$z$, a general purpose score-function estimator can be applied \cite[]{williams-reinforce-92}.
Nevertheless, it requires significant efforts to obtain reliable gradients \cite[]{mnih-nvil-14, mnih-vimco-16, tucker-rebar-17}.

For CoLLike, the target distribution $p_\delta(x) p_\pi(z | x)$ is
parameterized by a permutation $\pi$. This parameterization does not rely on an
encoder, so the approach is encoderless. The permutation cannot be tuned by
gradient-based techniques; however, since the set of all possible $\pi$ values
is countable and finite, optimization can be performed by an exhaustive search.
% Moreover, as we show in Section \ref{sec:alg}, this task is essentially a
% combinatorial optimization problem which can be solved efficiently.


\section{Algorithm} \label{sec:alg}
% The algorithm of Complete Likelihood optimization repeatedly perform two steps:
% 
% 1. Find a pairing with maximum likelihood, e.g. $\{(x_1, z_5), ..., (x_N, z_2)\}$ such that the mean $\log p_\theta(x, z)$ over all pairs is maximal
% 
% 2. Likelihood maximization step over $\theta$ for this pairing, go to the previous step 

% TODO: add independence with respect to prior (rewrite explanations) 
% TODO: add mention that we use minibatch version in experiments
The complete likelihood objective (\ref{eq:colike}) is a function of the
permutation $\pi$ and the model parameters $\theta$. We approach the objective
by alternating between finding the optimal pairing $\pi^*$ and gradient-based
optimization of $\theta$. The procedure resembles the EM algorithm, where the
expectation is replaced with the assignment of $x$ samples to $z$ samples.
To tackle the $N!$ search space of possible pairings, we evaluate the likelihood of
all $(x, z)$ pairs and treat the task
as a combinatorial linear sum assignment problem with $\mathcal{O}(N^3)$
complexity, as shown in Appendix \ref{ap:combinator}. The
following Python-like pseudocode describes the proposed algorithm:
\\ \\

\begin{lstlisting}[language=Python]
x_collection = load_dataset()
n_samples = len(x_collection)
z_collection = model.p_z.sample(n_samples)
optim = optimizer()

for epoch in range(n_epochs):
  log_p_xz_mat = empty(n_samples, n_samples)
  for nx, x in enumerate(x_collection):
    for nz, z in enumerate(z_collection):
      log_p_xz_mat[nx, nz] = model.log_p_xz(x, z)

  opt_rows, opt_cols = maximize_linear_assignment(log_p_xz_mat)
  loss = 0
  for opt_nx, opt_nz in zip(opt_rows, opt_cols):
    loss += -log_p_xz_mat[opt_nx, opt_nz]
  optim.apply_grads(compute_grads(loss, model.parameters))
\end{lstlisting}

% So, we start with sampling $z_i$ for each $x_i$ in the dataset. According to
% EPA, this collection of samples is assumed to be shuffled labels for our
% collection $\{x_1, ..., x_N\}$, which means that this set $\{z_1, ..., z_N\}$
% is fixed along the learning procedure.

% Now we need to estimate pairing between $\{x_1, ..., x_N\}$ and $\{z_1, ...,
% z_N\}$ determined by permutation $\pi$. To get the optimal pairing $\pi^*$ we
% maximize \ref{eq:colike} with respect to $\pi$ under the current model
% parameters. This step requires knowledge of $\log p_\theta(x_i, z_j)$ for
% every pair of $x_i$ and $z_j$, leading us to necessity of evaluation of the
% model $\mathcal{O}(N^2)$ times.

% After optimal permutation is found we optimize complete likelihood of the
% permuted pairs $(x, z)$ with respect to $\theta$ by gradient descent step.
% Then we continue to alternate between finding optimal permutation and complete
% likelihood optimization until convergence.

The main challenges of the proposed algorithm are the $\mathcal{O}(N^2)$ calls of the
model and the $\mathcal{O}(N^3)$ complexity of combinatorial optimization.
While the former might be more challenging due to an extensive amount of computations
inside the model, the latter exhibits faster growth. We propose a range of
techniques to reduce the complexity below.

% \subsection{Complexity Reduction}
% \label{complexity-reduction}

\textbf{Minibatch Assignment.} The proposed algorithm is aimed at full batch
optimization. Aside from squared complexity in dataset size $N$, there are two
reasons to develop a minibatch version of it. Firstly, combinatorial
optimization can be prohibitive for large $N$. Secondly, large batches can
potentially harm generalization \cite{you2017large,you2020large}.
% Besides fast convergence, SGD has sometimes been observed to yield significantly better generalization errors than batch methods (Bottou & Bousquet, 2011)
We apply a
minibatch technique similar to \cite{bojanowski2017unsupervised}. We optimize
over the permutation set and perform the gradient step only for the minibatch. However,
we store an array of $z$ values and permute minibatches inside it according to
the optimal permutation. The number of model forward
passes is now quadratic in terms of minibatch size instead of dataset size.
However, as gracefully shown by \cite{huzar-nat-17},
%Ferenc
% Huszár\footnote{\href{https://www.inference.vc/unsupervised-learning-by-predicting-noise-an-information-maximization-view-2/}{https://www.inference.vc/unsupervised-learning-by-predicting-noise-an-information-maximization-view-2/}}
this kind of minibatch combinatorial optimization provides only locally
optimal solutions. 
% Nevertheless, the size of the gap between local global optimum
% is still to be determined.

\textbf{Low-dimensional Discrete Latents.} Discrete latent variables with
a number of categories $K$ lower than $N$ can provide additional speed-ups. In
this case, the number of possible values of $z$ for each $x$ is $K < N$, which results in cost
$\mathcal{O}(NK)$ instead of $\mathcal{O}(N^2)$. For instance, if there is a single
binary random variable, the computational cost is proportional to $2 N$.

\textbf{Factorized Conditional.} If $p_\theta(x|z)$ part of the model
factorizes (each $x$ dimension is predicted independently given $z$ like in
the original VAE \cite{kingma2014vae}) only one forward pass is required.
Concretely, the distribution for all dimensions of $x$ is obtained by passing
$z$ to $p_\theta(x|z)$. In contrast to, for instance, autoregressive
$p_\theta(x|z)$, the distribution does not depend on previous dimensions of
$x$. Having a distribution for all dimensions of $x$, it is relatively cheap to
evaluate the likelihood of a particular $x$.
% For example, if $p_\theta(x|z)$ is a
% factorized Gaussian distribution for a given $z$ we run $p_\theta(x|z)$ to get
% mean and variance vectors. Then we use these this vectors in Gaussian PDF for
% each $x$, which is cheap compared to the model forward pass. In short, for
% factorized models the number of model forward passes is $N$, while the number
% of evaluations of likelihood is still $N^2$.


% TODO: replace states with categories (e.g. for discrete random variable x 
% with K categories ...)

\section{Connections} 

% Introduction
% Optimal transport
% Constrained K-means
% Permutation invariant training
% NAT and Sinkhorn Autoencoders
%  - is WAE limited to continuous latents? seems so
%  - use not factorized distribution, and not deterministic decoder
%  - WAE struggle to construct a joint family with given joints, CoLLike shows
%    that it is possible to make such family

Connections with existing techniques not only give alternative perspectives on
the CoLLike objective, but also provide probabilistic grounding for some
existing algorithms. Many well-known objectives actually use CoLLike while being
motivated as ad-hoc empirical risk minimization. We show that these objectives
not only seem reasonable but are also probabilistically motivated.

% TODO: add derivation to the appendix
While traditional K-means algorithm \cite[]
{macqueen1967classification,lloyd1982least} has a probabilistic
interpretation \cite[21.4.1.1]{murphy2022probabilistic}, its
constrained counterpart \cite[]{bradley2000constrained} lacks probabilistic
grounds. Constrained K-means is equivalent to CoLLike under factorized Gaussian $p_\theta(x|z)$ and
uniform categorical $p(z)$, which has a number of states equal to the number of
clusters. This connection allows extending the constrained K-means approach to
different generative distributions and priors.

Permutation Invariant Training (PIT) \cite[]{Yu-pit-17, Luo-convtasnet-19}
used in source separation solutions can also be expressed as a CoLLike objective.
For instance, in the cocktail party problem, we want to separate a mixture of $K$ sources. During training, we have $K$ isolated mixture components and a network that
produces $K$ estimates of the components based on the single mixture. We don't
know which network output corresponds to which source, so we pick the permutation
that produces the minimal total mismatch between outputs and sources. This procedure
corresponds to training a latent variable model with the CoLLike objective, where
a categorical latent variable of dimension $K$ determines the source identity.
In this setting, we treat mixture components as samples in the dataset.

% TODO: add sinkhorn and other work
The closest predecessor of the CoLLike is Noise As Target (NAT)
\cite[]{bojanowski2017unsupervised}. This is an unsupervised approach to learn 
an image encoder. In this approach, the representations produced by a network
are assigned to a fixed collection of vectors sampled from uniform distribution
on a sphere. After this, the network parameters are adjusted to make encodings
closer to assigned vectors. This approach is equivalent to CoLLike with reverse 
model factorization $p_\theta(x, z) = p_\theta(z|x) p(x)$ and factorized Gaussian 
$p_\theta(z|x)$. Other approaches that obtain a clear probabilistic interpretation
using CoLLike include: Sinkhorn Autoencoders \cite[]{patrini-sinkhorn-auto-19}, simultaneous clustering and representation learning \cite[]{asano-selflab-20}, and \cite[]{jeong-alternating-19}.


\cite{bojanowski2017unsupervised} noticed that NAT objective 
has Optimal Transport (OT) roots.
% We think that connection between CoLLike and 
% OT is the most beautiful part of the current work, so we dedicate the rest of 
% the section to it.
% % TODO: citation needed
OT framework can be used to measure discrepancy between
distributions. Particularly, for a given non-negative cost function $c$ the
optimal transport distance between distributions $p_\delta$ and $p_\epsilon$
is defined as
$$ OT(p_\delta, p_\epsilon) = \min_{\gamma \in \Gamma
(p_\delta, p_\epsilon)} \mathbb{E}_{x, z \sim \gamma(x, z)}\left[ c
(x, z)\right] $$
where $\Gamma(p_\delta, p_\epsilon)$ is the set of all joint distributions on
$x$ and $z$ with marginals $p_\delta(x)$ and $p_\epsilon(z)$ respectively.
Furthermore, if we use a parametric model $p_\theta$ in place of $p_\epsilon$ we
can fit it by minimizing the distance. Note that in this
case we minimize the function that already has a $\min$ function
inside.

When both $p_\delta$ and $p_\epsilon$ are empirical, the search space $\Gamma$
becomes countable and finite. Now it contains only pairings between points in
$p_\delta$ and points in $p_\epsilon$. Given an arbitrary initial pairing, we
can express all other pairings through permutation applied to either $x$ or
$z$. In this case, the cost becomes 
$$ OT(p_\delta, p_\epsilon) = \min_{\pi \in \Pi} \sum_i c \left(x_i, z_{\pi
(i)} \right) $$
where $\Pi$ is the set of all permutation functions. This expression is almost
the CoLLike objective (\ref{eq:colike}). Choosing the cost function $c$ to be
$-\log p_\theta(x, z)$ and switching to maximization make them equivalent\footnote{A cautious reader
might note that for continuous variables non-negativity of the cost $-\log p_\theta$ can be
violated; however, all model densities used in practice are finite and the
corresponding cost can be made positive just by an additive constant which does
not change the optimization problem.}. Thus, CoLLike bridges maximum likelihood
methods with OT. This connection allows bringing the latest developments in OT to
improve likelihood-based methods. Furthermore, in Appendix \ref{ap:wasserstein},
we provide an example of the equivalence between CoLLike and Wasserstein distance. In this
case, the model's complete likelihood plays the roles of both a mapping from $z$ to $x$
domain and a distance metric.

\section{Experiments}

In this work, we focus on low-dimensional discrete latents. This type of latent
variable allows direct comparison with the exact likelihood. Furthermore, we
emphasize our focus on learning useful $z|x$ instead of simplifying the model
with factorized $x|z$ conditional.

% Some motivation: representation learning, unsupervised learning,
% not restricted model class

\subsection{Tractable Likelihood}

\begin{table}[]
    \caption{Results for tractable categorical latents. MNIST, CIFAR -- BPD; AG News -- NLL.}
    \label{tab:res-lowdim}
    \centering
    \begin{tabular}{cccccc}
        \hline
        Dataset   & Objective    & Accuracy $\uparrow$ & NLL/BPD $\downarrow$ & Agg. KL $\downarrow$ & MI $\uparrow$ \\
        \hline
        \hline
        CIFAR     & CoLLike       & 14.5              & 3.45                 & 0.01                  & 2.20  \\
        CIFAR     & MaL          & 14.0               & 3.46                 & 0.74                  & 1.50  \\
        MNIST     & CoLLike       & 14.1              & 1.27                 & 0.01                  & 1.95  \\
        MNIST     & MaL          & 12.5               & 1.29                 & 1.61                  & 0.58  \\
        AG News   & CoLLike       & 82.1              & 250.79               & 0.00                 & 1.32  \\
        AG News   & MaL          & 31.6               & 249.73               & 0.00                 & 0.00  \\
        \hline
        \hline
    \end{tabular}
\end{table}

Models with tractable likelihood are perfect for comparison of likelihood-based
algorithms because they remove the problem of the likelihood estimation precision. For
this type of models all quantities of interest can be computed exactly. Moreover, 
tractable likelihood allows comparing CoLLike directly with MaL instead of its 
approximations like ELBO.

% TODO: add specification of the models in the appendix
We use MNIST \cite[]{lecun-mnist-98} and CIFAR \cite[]{krizhevsky2009cifar10} datasets
for image modality and AG News \cite[]{zhang2015agnews} for text domain.
All these datasets are equipped with class labels. For images, we train a Glow-like 
normalizing flow conditioned on a discrete latent variable with 10 categories
through all coupling layers. For text we use a Transformer Language Model
conditioned on a discrete latent variable with 4 categories using additive
embedding for all tokens. The size of the discrete variables is equal to the 
number of classes in the underlying dataset. A small number of categories allows
computing the exact marginal likelihood value and speeds up CoLLike to
$\mathcal{O}(NK)$.

Table \ref{tab:res-lowdim} presents the results of training the latent variable 
models for CoLLike and MaL objectives averaged across 4 runs. Both objectives 
exhibit similar performance in terms of likelihood across datasets. However, other
characteristics vary. 

MI is high for CoLLike objective on every dataset. Furthermore, it attains 
approximately maximal value for AG News and CIFAR. MI for MaL objective 
ranges from zero to values significantly lower than those of CoLLike. 
Zero MI indicates posterior collapse cases, which are mainly observed in
ELBO optimization and were recently discovered by \cite{lucas-dontblameelbo-19}
for MaL applied to simple linear models. This experiment
yields an important observation: posterior collapse can also happen
in deep latent variable models during optimization of exact MaL, despite
usually being attributed to the structure of ELBO. Importantly, for the
MNIST dataset, half of the experiments exhibited
posterior collapse.

CoLLike exhibits near zero aggregated KL for all experiments. It implies that
the model joint marginal in latent domain perfectly matches prior. For MaL,
aggregated KL is zero only for AG News dataset which also has uninformative
factorized joint. For other datasets, aggregated posterior significantly deviates
from the prior. We also note that for MNIST dataset, MaL puts all probability
mass to a single category in half of the runs.

To estimate unsupervised classification quality we perform the optimal assignment
of the latent categories to classes. For all cases except CoLLike objective on
AG News dataset, the quality of the unsupervised classification is similar and is low.
On AG News the unsupervised accuracy is exceptionally good. However, the 
variance of the proposed solution is relatively high. Standard deviation of the
accuracy across 4 runs is $5.4$ with the highest value of $87.1$ and the lowest of $73.3$.
We show in the following section that it is possible to achieve significantly
higher unsupervised accuracy and lower variance by latent variable 
ensembling.

Overall, CoLLike clearly outperforms MaL in the tractable likelihood setting. Moreover,
it shows high unsupervised classification accuracy for text modality.
For MaL, experiments depict a variety of possible failures from posterior collapse
to degenerate aggregated posterior, which extends findings of \cite[]{lucas-dontblameelbo-19} to expressive models and exact likelihood.
However, despite CoLLike producing informative latents in terms of MI, 
unsupervised classification might be challenging even in these cases. We believe
that the key to high-performance unsupervised classification should be in the right
inductive biases in conditioning and probabilistic model type.

\subsection{Latent Ensembling}

\begin{figure}
    \centering
    \subcaptionbox{CoLLike ensemble vs. unsupervised and semi-supervised approaches. \label{fig:comparison}}{
        % \def\svgwidth{0.45\columnwidth}\input{comparison.pdf_tex}
        \includegraphics[width=0.48\linewidth]{comparison}
    }
    \hfill
    \subcaptionbox{Supervised DeBERTa v3 base vs. CoLLike ensemble.\label{fig:ensemble}}{
        \includegraphics[width=0.48\linewidth]{ensemble}
    }
    \caption{Comparison of ensembled CoLLike with unsupervised/few-shot methods (a) and a supervised method (b).}
    \label{fig:figures}
\end{figure}

% - Variance of the unsupervised classification is a problem, we propose solution
% - 
% TODO: more info on ensembling in appendix
To reduce the high variance of CoLLike unsupervised classification and increase its
accuracy we propose to perform ensembling of multiple models trained on the same
data but using different seeds at initialization. Although there is no
correspondence between labels for latent variable models, we can try to find the 
labels assignment based on the agreement between them. 
This approach is motivated by direct cluster ensembling
\cite[]{boongoen-clustensembl-18}. The agreement between two labels of different
ensemble members is the number of intersecting samples with those labels.
To align the latents we iteratively find the
assignment with the highest intersection between labels. Finally, we find the
assignment between aligned latents and ground truth labels.

% $\sum_q \sum_k \sum_i z_i$

% We define the agreement score between
% two clusters $$ and $C_\beta$ as:

% \begin{equation}
%     A(C_\alpha, C_\beta) = \sum_{c_\al} 
% \end{equation}

% \begin{figure}
%     \centering
%     \def\svgwidth{\columnwidth}
%     \scalebox{0.7}{\input{ensemble.pdf}}
%     \caption{Illustration of the CoLLike (left) and MaL (right) objecoives. 
%     Double circles and bold lines indicate areas of the joint to be maximized. 
%      }
%     \label{fig:main}
% \end{figure}

In our experiments, we use $8$ models per ensemble and train $4$ independent
ensembles.
The simplest ensembling method is averaging of the predictions. It increases
the mean unsupervised accuracy from $82.1$ to $84.5$ and reduces the standard
deviation from $5.4$ to $1.7$. We further significantly improve these results by 
utilizing the agreement score, which is also used for alignment of the labels. We 
pick top-k models with highest maximum coherence across other models in the ensemble.
Averaging predictions of those top-k models further increases accuracy to $\mathbf{86.6}$
and lowers the standard deviation to $\mathbf{0.2}$.

We compare CoLLike results with the following unsupervised and supervised approaches: PET 
and iPET  \cite[]{schick-petipet-21}, EFL \cite[]{wang-efl-21}, LM-BFF \cite[]{gao-lmbff-21}, DocSCAN \cite[]{stammbach2021docscan}. DocSCAN is purely unsupervised, while other approaches
rely on engineering multiple textual descriptions of classes (prompts) 
or labeled data. All methods use heavy pre-trained Transformers
\cite[]{vaswani-transformer-17} as an initialization, while in CoLLike we use small $2$-layer
Transformer with random initialization. Figure \ref{fig:comparison} 
presents the comparison of the methods. CoLLike clearly outperforms both unsupervised and
most of the supervised methods.
To determine how much training data we need without, possibly laborious, prompt engineering
we use DeBERTa v3 \cite[]{deberta2021}. We vary
training set sizes from $32$ to $2048$ and apply additional ensembling of $8$ models with different
initializations and train-validation splits. Figure \ref{fig:ensemble} reveals that CoLLike
can be a better alternative to labeling more than a hundred samples, which, in turn, 
requires an extensive data analysis. Besides, note the high difference between
the ensemble and the single model for small dataset sizes in a supervised setting, which
is an interesting result by itself.

\section{Discussion and Future Work}
In this work, we propose to switch from the MaL paradigm of matching only marginals in
the observed domain to CoLLike paradigm of finding an exact target joint by selection
from a family of joints with desirable properties. Furthermore, we show that matching
of marginals utilized by MaL corresponds to a specific choice of target joint, 
which motivates such failures as posterior collapse and divergence between target 
and model marginals in the latent domain. We experimentally show the ability of CoLLike
to learn useful representations. The connection of CoLLike with OT allows us to borrow
techniques from the latter. For instance, Sinkhorn Relaxation \cite[]{cuturi-sinkhor-13}
can be used to speed up the assignment problem. 
Investigation of alternatives to complete likelihood for target selection is of special
interest. The right inductive biases for inducing useful properties using CoLLike
are still to be discovered, at
least as long as we want to obtain the desired properties without explicitly specifying them.
We believe that the further extension of CoLLike to high-dimensional latents would be exciting
and challenging.
Other lines of research can be devoted to the application of other divergences to the
constructed family of joint target candidates and extension of CoLLike to learnable 
priors. 

% Discussion (and Future Work?)
% - cold posterior 
% - any form of samplable prior
% - Other divergences with EPA
% - Both MaL and CoLLike are more like a constraint not the objective we want 
%   by itself.
% - Sinkhorn approximation
% - Only model's inductive biases, we can come up with something else, because it is
%   more like of a restriction

% -  Despite the proposed approach is encoderless, we believe that learning an
% approximate posterior could be useful for sampling better minibatches in this
% setting.
    
% \section{Reproducibility}

% To promote reproducibility we open-source our code \emph{\href{http://example.com/}{link is hidden for double blind review}}

\bibliography{iclr2023_conference}
\bibliographystyle{iclr2023_conference}

\appendix
\section{Derivation of KL Forms of the Considered Objectives}\label{appendix:a}

Equivalence between the CoLLike objective (\ref{eq:colike}) and its $KL$ divergence
form from Table \ref{table:kl-forms} can be derived as follows:

% TODO: CL or CLL
\begin{align*} KL(p_\delta(x) p_\pi(z | x) || p_\theta(x, z))  &=
    \mathbb{E}_{x, z \sim p_\delta(x) p_\pi(z|x)} \left [ \log
    \frac{p_\delta(x) p_\pi(z|x)}{p_\theta(x,z)} \right] \\ &= \mathbb{E}_{x,
    z \sim p_\delta(x) p_\pi(z|x)} \left [ \log p_\delta(x) p_\pi(z|x) - \log
    p_\theta(x,z) \right]
    \addtocounter{equation}{1}\tag{\theequation}\label{eq:cll-derivation-constant}\\
    &= C - \mathbb{E}_{x, z \sim p_\delta(x) p_\pi(z|x)} \left [ \log
    p_\theta(x,z) \right] \\ &= C - \frac{1}{N}\sum_{i = 1}^N \log
    p_\theta(x_i, z_{\pi(i)}) \\ &= C - \frac{1}{N} \mathcal{L}_{CLL}(\theta,
    \pi)\addtocounter{equation}{1}\tag{\theequation}\label{eq:cll-derivation-result}
\end{align*} where the first term in (\ref{eq:cll-derivation-constant}) is
treated as constant with the assumption that all samples from $p_\delta(x)$
take distinct values, which is reasonable for such high-dimensional objects as
images, texts, and sounds. Thus, the $KL$ form of the objective is equivalent
to the CoLLike objective up to a multiplicative factor and an additive term.
For the proof of the constancy see Appendix \ref{appendix:b}.

The derivation of equivalence between (\ref{eq:mal}) and its $KL$ form from Table
\ref{table:kl-forms} is as follows:

\begin{align*} KL(p_{\delta}(x) p_\theta(z |x) || p_\theta(x, z)) &=
    \mathbb{E}_{x, z \sim p_{\delta}(x) p_\theta(z|x)} \left [\log
    \frac{p_{\delta}(x) p_\theta(z|x)}{p_\theta(x) p_\theta(z|x)} \right] \\
    &= \mathbb{E}_{x \sim p_{\delta}(x)}  \left [  \log
    \frac{p_{\delta}(x)}{p_\theta(x)} \right ] \\ &= \mathbb{E}_{x \sim
    p_{\delta}(x)} \left[ \log p_\delta(x) \right] - \mathbb{E}_{x \sim
    p_{\delta}(x)} \left[ \log p_\theta(x) \right]\\ &= C - \frac{1}{N} \sum_i
    \log p_\theta(x_i)\\ &= C - \frac{1}{N}
    \mathcal{L}_{MaL}(\theta)
\end{align*}

% TODO: check VI abbreviation
The $KL$ form of ELBO objective from Table \ref{table:kl-forms} can be found in
a number of works \cite{zhao2019infovae,kingma2019introduction}, however, we
provide a derivation here to make the paper self-contained.

\begin{align*} KL ( p_{\delta}(x) q_\phi(z | x) || p_\theta(x, z)) &=
    \mathbb{E}_{x, z \sim p_{\delta}(x) q_\phi(z | x)} \left [ \log
    \frac{p_{\delta}(x) q_\phi(z | x)}{p_\theta(x, z)} \right ] \\ &=
    \mathbb{E}_{x \sim p_\delta(x)} \left[ \log p_\delta(x) \right] +
    \mathbb{E}_{x, z \sim p_{\delta}(x) q_\phi(z | x)} \left [ \log
    \frac{q_\phi(z | x)}{p_\theta(x, z)} \right ] \\ &= C - \mathbb{E}_{x \sim
    p_{\delta}(x)} \mathbb{E}_{z \sim q_\phi(z | x)} \left [ \log
    \frac{p_\theta(x, z)}{q_\phi(z | x)} \right ] \\ &= C - \frac{1}{N} \sum_i
    \mathcal{L}_{ELBO}(x_i, \phi, \theta)
\end{align*}


\section{Entropy and Mutual Information of Empirical Joint}\label{appendix:b}

In this appendix we derive some useful properties of the empirical joint
distributions produced by sampling from the prior.
The joint distribution $p_\delta(x) p_\pi(z|x)$
depends on $\pi$. We focus on how $\pi$ influences such distribution
characteristics as entropy and mutual information.

\begin{figure}[ht]
\begin{center}% \framebox[4.0in]{$\;$}
    % \fbox{\rule[-.5cm]{0cm}{4cm} \rule[-.5cm]{4cm}{0cm}}
    \includegraphics[width=0.8\linewidth]{joints_example}
\end{center}
\caption{Example of three joint distributions with discrete $x$ and $z$. Lines
on the left and at the bottom depict the number of samples from empirical
marginals with corresponding value of the random variable. Squares reflect the
joint probability value. Each line crossing the square corresponds to $1/N$
probability added to the corresponding $(x, z)$ random variable pair.}
\label{fig:two-discrete-joints}
\end{figure}

Consider a joint distribution over discrete $x$ and $z$. This kind of
distribution can be visualized as a table, such as depicted in Figure
\ref{fig:two-discrete-joints}. If there are multiple samples taking the same
value both in $x$ and $z$ domain, the permutation can change the entropy of
the joint. For instance, for the distribution on the left, the entropy
$H(p_\delta(x)p_\pi(z|x)) = -(\frac{1}{4} \log \frac{1}4 + \frac{1}{4} \log
\frac{1}4 + \frac{1}{2} \log \frac{1}{2}) \approx 1.04$ nats. For the
distribution in the center, the entropy  $H(p_\delta(x)p_\pi(z|x)) = -(4 \cdot
\frac{1}{4} \log \frac{1}4) \approx 1.39$ nats. So, depending on $\pi$ we
might end up with more and less entropic distributions.

However, when we restrict any empirical marginal to take only distinct values,
like in the right part of Figure \ref{fig:two-discrete-joints}, the situation
changes. Namely, each distinct pair $(x, z)$ can be chosen at most once,
because to choose it twice we need a duplicate sample in both domains. This can
be verified using the right part of Figure \ref{fig:two-discrete-joints}. Just
try to construct a joint with some square having more than one line assuming
that each value of the $x$ marginal has only one line. Moreover, for $x$, this is a
reasonable assumption since usually the domain of $x$ is high-dimensional. For
such case, the joint will contain $N$ non-zero points each with probability
$1/N$. Thus, the entropy of the empirical distribution is equal to $-\sum_{n =
1}^N \frac{1}{N} \log \frac{1}{N} = \log N$.

The observation above allows us to easily derive the mutual information of the
empirical joint. Mutual information is defined as

$$ MI(p(x, z)) = \mathbb{E}_{x, z \sim p(x, z)}  \log \frac{p(x, z)}{p(x)p(z)}
= \mathbb{E}_{x, z \sim p(x, z)}  \log \frac{p(z | x)}{p(z)}$$

Under the assumption that $p_\delta(x)$ contains only distinct elements the
conditional $p_{\delta\pi}(z|x) \equiv 1$ for all values $x$ from the
$p_\delta(x)$. So, choosing $x$ uniquely determines the value of $z$, as can
be seen from the right part of Figure \ref{fig:two-discrete-joints}. Then the
mutual information is given by

\begin{equation}\label{eq:mi-empirical} MI (p_\delta(x) p_\pi(z|x)) =
    \mathbb{E}_{x, z \sim p_\delta(x) p_\pi(z|x)} \log
    \frac{p_{\delta\pi}(z|x)}{p_{\delta\pi}(z)} = \mathbb{E}_{z \sim
    p_{\delta\pi}(z)} \log \frac{1}{p_{\delta\pi}(z)} = H(p_{\delta\pi}(z))
\end{equation}

So, the mutual information is always equal to the entropy of the empirical
prior. It is possible to show that this value is the maximum possible one.
This becomes obvious from the entropic factorization of the mutual information

\begin{equation} \label{eq:mi-entropic-factorization} MI(p(x, z)) = H(p(z)) -
    \mathbb{E}_{x \sim p(x)} \left [ H(p(z|x)) \right ]
\end{equation}

Since the entropy is non-negative, the mutual information can be decreased
only through the second term of (\ref{eq:mi-entropic-factorization}), which
equals $0$ because $z$ value is completely determined by $x$.

When we try to extend the observations above to continuous cases we face the
following challenge: empirical distribution has infinite values at the sample
points. This drives the differential entropy as well as mutual information to
infinity. However, adding noise to the empirical distribution solves this
problem. Adding uniform noise with the interval smaller than the precision of
floating point makes the entropy finite and constant with respect to $\pi$.
One can show that the resulting mutual information of the empirical joint also
equals $\log N$.

% TODO: write a proof in the comment
% TODO: check mention of the value of the MI (log N)

% Refine notation of the entropy Update figure pic Entropy of empirical
% distribution (discrete and continuous cases) Independence of entropy of the
% empirical joint from permutation parameter. High MI joint

\section{Wasserstein Distance and CoLLike}\label{ap:wasserstein}

Optimal Transport cost becomes Wasserstein distance when $c$ is a metric. A
very illustrative example from this family is equality of Wasserstein-2
($c$ is the Euclidean distance) and CoLLike for some setups. Specifically, the
following objective can be produced both by Wasserstein distance and CoLLike
$$
\mathcal{L}_W(\theta) = \min_{\pi \in \Pi} \sum_i \left(x_i - f_\theta(z_{\pi
 (i)}) \right)^2 $$
To get this objective from OT perspective we define the model distribution to
be produced by passing a fixed sample from prior through a deterministic
decoder $f_\theta(z)$. The result is an empiric distribution in $x$ domain.
Wasserstein distance between two empiric distributions is determined by
optimal pairing between points from data distribution $p_\delta(x)$ and model
distribution spanned by empiric latents. The same objective is produced by
factorized Gaussian $p_\theta(x|z)$ and uniform prior $p(z)$. 

This connection reveals that the model $p_\theta(x|z)$ defines both mapping
from $z$ to $x$ domain and ``topology'' of the $x$ space (how we measure distance
between objects). However, approaches based on the Wasserstein distance are
limited to continuous variables, while CoLLike is applicable both for discrete
and continuous domains. Moreover, CoLLike provides a 
probabilistic basis for the choice of the cost function.

\section{Optimal Pairing by Combinatorial Optimization} \label{ap:combinator}

Having at hand log-likelihood values for all possible $(x_i, z_j)$ pairs,
we are ready to find the
optimal permutation. A naive way to do so is to evaluate the sum
(\ref{eq:colike}) for every possible permutation $\pi$. Although we only need to
sum different pre-computed values, the search space for $\pi$ is tremendous:
$N!$. However, we can cast this problem as a combinatorial optimization one.
Following \cite{papadimitriou1998combinatorial}, the assignment problem is
stated as follows
\begin{align*} \text{minimize} \quad & \sum_{i, j} c_{i, j} a_{i, j} \\
    \text{subject to} \quad & \sum_i a_{i, j} = 1 \quad j = 1, \dots, N \\ &
    \sum_j a_{i, j} = 1 \quad i = 1, \dots, N  \\ & a_{i, j} \in \{0, 1 \}
\end{align*}
where $c_{i, j}$ is the cost of picking the element $i, j$ and $a_{i, j}$ is
the indicator variable. The constraints of this problem define a set of
permutation matrices. Choosing the cost to be negative log-likelihood and
replacing indicator variable with permutation we end up with CoLLike
objective. This combinatorial optimization problem can be solved efficiently
with the Hungarian algorithm \cite{kuhn1955hungarian} with complexity
$\mathcal{O}(N^3)$.

\section{Models Description}\label{ap:model-desc}
\subsection{Normalizing Flows}

We use a Glow-like normalizing flow for all image experiments. We choose the learning
rate by starting from 1e-2 and gradually decreasing it until there are no instabilities
during training. No extensive learning rate search was done. Below we provide details
on the model parameters.

CIFAR model:
\begin{itemize}
    \item architecture: Glow
    \item number of flows per scale: 7
    \item flow coupling: affine
    \item coupling net: ResNet, 3 blocks, hidden size 96
    \item permutation flow: invertible 1x1 convolution with lower-upper factorization
    \item normalization flow: ActNorm
    \item number of scales: 3
    \item scale factor: 2 height, 2 width
\end{itemize}

MNIST model (same as CIFAR except the number of scales):
\begin{itemize}
    \item architecture: Glow
    \item number of flows per scale: 7
    \item flow coupling: affine
    \item coupling net: ResNet, 3 blocks, hidden size 96
    \item permutation flow: invertible 1x1 convolution with lower-upper factorization
    \item normalization flow: ActNorm
    \item number of scales: 2
    \item scale factor: 2 height, 2 width
\end{itemize}

During training we use marginal likelihood to validate both MaL and CoLLike. 
We did not search over possible optimizers, and use Adam \cite[]{kingma-adam-14}
with default parameters. The validation split is chosen to be $0.05$ because no
significant variations of the likelihood were observed during training.
To summarize, we use the following parameters both for CoLLike and MaL:
\begin{itemize}
    \item epochs: 256
    \item learning rate: 5e-5 - MNIST; 2e-4 - CIFAR
    \item batch size: 64
    \item validation part of the training set: 0.05
    \item validation criterion: marginal likelihood
    \item optimizer: Adam, $\beta = (0.9, 0.999); \epsilon = 1e-8$
\end{itemize}

As a data pre-processing step, we used only dequantization with uniform noise, with the
range equal to the quantization step.


\subsection{Transformer}

We used a simple two-layer Transformer across our experiments. The
model description:

\begin{itemize}
    \item number of layers: 2
    \item hidden size: 128
    \item feedforward dimension: 128
    \item embedding dimension: 128
    \item number of attention heads: 4
    \item number of embeddings: 4000
\end{itemize}

The training details are similar to CIFAR configuration:
\begin{itemize}
    \item epochs: 256
    \item learning rate: 2e-4 (same as CIFAR)
    \item batch size: 64
    \item validation part of the training set: 0.05
    \item validation criterion: marginal likelihood
    \item optimizer: Adam, $\beta = (0.9, 0.999); \epsilon = 1e-8$
\end{itemize}

In the pre-processing step, we truncated the sequences longer than 192 tokens.
Truncation affected less than $0.3\%$ of the samples.
Nevertheless, the tokenizer was trained on the full-length sequences. Data 
pre-processing can be summarized as follows:
\begin{itemize}
    \item maximum length truncation: 192
    \item BPE tokenization 
    \item vocabulary size: 4000 
\end{itemize}
\end{document}

