\documentclass[accepted]{uai2022} % camera-ready (accepted, non-anonymous) version

% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{booktabs} 
\usepackage{multirow}
\usepackage[table]{xcolor}
\usepackage{hyperref}

% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usetikzlibrary{arrows}
\usepackage{color}
\usepackage{soul}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage[inline]{enumitem}
\usepackage{hhline}
\usepackage{colortbl}
\usetikzlibrary{backgrounds}

\usepackage{math}
\def\etal{\emph{et al.}}
\def\ie{\emph{i.e.}}
\def\eg{\emph{e.g.}}
\newcommand\todo[1]{\textcolor{red}{\textbf{[#1]}}} %TODO
\newcommand\fede[1]{\textcolor{blue}{\textbf{[#1]}}} %TODO
\newcommand{\mounia}[1]{\textcolor{green}{#1}}
\usepackage{cleveref}

\newcommand{\us}{MIST}
\newcommand{\dif}{\text{d}}
\newcommand{\tV}{{\bm{x}}}%\bm{t}}
\newcommand{\uM}{{U}}
\newcommand{\uV}{\bm{u}}
\newcommand{\zV}{\bm{z}}
\newcommand{\mM}{{M}}
\newcommand{\sM}{{S}}
\newcommand{\fV}{\bm{f}}
\newcommand{\fM}{{F}}
\newcommand{\lM}{{L}}
\newcommand{\kl}{\text{KL}}

% Add authors
\title{Efficient Inference for Dynamic Topic Modeling with Large Vocabularies}
\author[1]{\href{mailto:<federicot@spotify.com>?Subject=Your UAI 2022 paper}{Federico Tomasi}{}}
\author[1]{Mounia Lalmas}
\author[1]{Zhenwen Dai}
\affil[1]{%
    Spotify Research
}

\begin{document}

\maketitle

\begin{abstract}
Dynamic topic modeling is a well established tool for capturing the temporal dynamics of the topics of a corpus. 
% Currently, dynamic topic models can only consider a small set of frequent words because of their computational complexity and insufficient data for less frequent words.
In this work, we develop a scalable dynamic topic model by utilizing the correlation among the words in the vocabulary. By correlating previously independent temporal processes for words, our new model allows us to reliably estimate the topic representations containing less frequent words. We develop an amortised variational inference method with self-normalised importance sampling approximation to the word distribution that dramatically reduces the computational complexity and the number of variational parameters in order to handle large vocabularies. With extensive experiments on text datasets, we show that our method significantly outperforms the previous works by modeling word correlations, and it is able to handle real world data with a large vocabulary
% (80K words) 
which could not be processed by previous continuous dynamic topic models. 
% With qualitative analyses, we show that our method can perform inference on infrequent but representative keywords much more reliably than previous methods.
\end{abstract}

\section{Introduction}\label{sec:intro}
% Topic models are generative probabilistic models that use a restricted number of distributions over a vocabulary to describe a document collection \citep{blei2003latent}. 
Topic modeling has been widely used to extract the main topics from a large collection of content such as text documents, images and other types of data that can be represented as bag-of-words \citep{balikas2016topic,kho2017novel}.
In topic modeling, a topic is represented as a probability distribution over the words in the vocabulary, and the words in a document are assumed to be independently drawn from a mixture of topics \citep{blei2003latent}.
This approach allows us to efficiently infer topic compositions of documents in a large corpus without modeling sentence and paragraph structures.

Topic modeling has been extended to analyse the evolution of topics in a corpus over time \citep{bhadury2016scaling,jahnichen2018scalable,tomasi2020stochastic}. 
The prior distribution of topic composition and the representations of individual topics are augmented into temporal processes such as Gaussian processes (GPs).
With these methods, one can understand the rise and fall of a topic at an aggregated level. 
For example, when applied to the machine learning research literature, we can easily observe the changes of the popularity of different research topics over time. 
Such dynamic topic modeling requires a large amount of data because we need lots of documents at each time point to reliably estimate the topic representations and topic compositions.
It is particularly challenging for modeling less frequent words because each word needs to be observed multiple times for each relevant topic at every time point, which is less likely for rare words.
It is also computationally very challenging for the current dynamic topic models to handle a large vocabulary due to the increased computational and memory requirement.
On the other hand, those less frequent words are often very specific, and hence may be strong cues to inform the topic of a document.

To leverage the information from less frequent words, we propose to incorporate word correlation in dynamic topic modeling. 
By doing so, a topic model can obtain sufficient signals about less frequent words by observing the existence of similar words.
We incorporate word correlation by augmenting the generative model of topic representations.
Previously, a topic representation is assumed to be drawn from a temporal process represented by a set of GPs, one for each word.
The word correlation is introduced by correlating these previously independent GPs, resulting in a multi-output Gaussian process \citep[MOGP;][]{alvarez2010efficient}.
Ideally, MOGP can explicitly capture word correlation in the form of a covariance matrix of all the words. 
This is infeasible due to both the high computational complexity and the large amount of data required for a reliable estimate. 
Instead of an explicit covariance matrix, we represent the correlation of words by embedding them into a latent space and generate the covariance matrix through a covariance function.
With a Bayesian treatment to word representations (vectors) in the latent space, we can obtain a reliable estimate of the correlation with a small amount of data.

We develop an efficient stochastic variational inference method for the topic model.
By extending the sparse GP \citep{titsias2009variational} to our MOGP formulation, the derived variational lower bound has the same computational complexity as in previous works with word independence and has significantly fewer variational parameters.
Furthermore, compared to \citet{tomasi2020stochastic}, we improve the amortised inference formulation by adopting a meta-encoder for the variational posterior of topic mixing proportions.
% In \citep{tomasi2020stochastic} the encoder is not aware of the evolution of topic representations, which cannot accommodate dramatic changes in topic representations.
The meta-encoder takes as inputs not only a document representation but also a summary of all the topic representations at a given time point. 
This allows the meta-encoder to easily handle the changes to topic representations.
%With a meta-encoder, the model infers more accurately the posterior of topic mixing proportions of a document, leading to a better modeling of temporal dynamics. 
%
As the word distributions are modelled as unnormalised log probabilities, the log-likelihood is calculated by drawing a sample of all the words in the vocabulary, which is computationally expensive for large vocabularies.
For efficient inference, we derive an asymptotically unbiased estimator for the gradient of the lower bound that samples a subset of words from the vocabulary. This greatly increases the scalability of the inference method on large vocabularies.
%
In summary, the main contributions of this paper are:
\begin{itemize}
    \item We develop a word-correlated dynamic topic model, where the word correlation is jointly inferred together with the topic model from the data.
    \item We derive an efficient amortised variational inference method, which has the same computational complexity as the word independent model and fewer variational parameters.
    \item We derive an asymptotically unbiased estimator for the gradient of the lower bound, in which the computational complexity is constant with respect to the vocabulary size.
    \item On synthetic and real world datasets, we show that our method significantly outperforms the previous dynamic topic models in terms of both quality and scalability.
\end{itemize}

% We propose the multiple output dynamic correlated topic model (\us), to jointly model the word-topic probability through the use of latent variable multiple output Gaussian processes.
% % 
% Our formulation assumes topics to be independent multi-output Gaussian processes, which considers both words and other index points (such as timestamps of documents) as observations of a latent representation which we learn together with the topic model itself.
% The formulation benefit from a sparse GP approach, allowing the model to have constant complexity in terms of both number of words in the vocabulary and timestamps. We emphasise that this does not hold in existing topic models, as they considers words to be independent (thus having at least linear complexity in the number of words).

% We augment the model with auxiliary variables as in the stochastic variational GP~\citep{hensman2013gaussian} to derive a scalable variational lower bound.
% To overcome the intractability problem of the lower bound, we marginalise the discrete latent variables before applying a Monte Carlo sampling approximation using the reparameterisation trick. This allows for a low-variance estimate for the gradients. 
% To enable efficient inference in the multiple output GPs, we also propose a variant of our model which considers the FITC approximation \citep{naish2008generalized}. 
% This allows us to speed up the inference of the model, avoiding drawing correlated samples from the distribution.

\paragraph{Outline.}
The rest of this paper is organised as follows. 
\Cref{sec:related-work} discusses related work.
\Cref{sec:modctm} presents our novel contribution, a word-correlated dynamic topic model.
\Cref{sec:vi} describes an efficient variational inference procedure, using sparse Gaussian processes.
\Cref{sec:experiments} includes our experiments. 
\Cref{sec:conclusion} concludes with a discussion of the contributions of this work.

\section{Related Work}\label{sec:related-work}
\paragraph{Topic Models.}
Topic models were proposed as a way to infer a mixture of topics from a collection of documents 
% originally being static over time and with no correlation among topics 
\citep{blei2003latent}.
The correlated topic model \citep[CTM;][]{blei2006correlated} allows topics to be correlated using a logistic normal distribution. 
%
Dynamic topic models have been proposed to enable consistent topics over a series of documents indexed by a temporal index \citep{blei2006dynamic,wang2008continuous,dieng2019dynamic}. 
Recent models have been extending the idea by using the inherent structure between documents through continuous processes \citep{bhadury2016scaling,jahnichen2018scalable,tomasi2020stochastic}. 
% \mounia{relate back to our paper}
However, the assumption of word independence given the topic does not allow information sharing across words, which limits in practice the applicability of topic models to corpora with large vocabularies and short documents.
% A major drawback of topic models is that they consider words to be independent from each other. This is one of the main reasons why topic models have been performing poorly on short text data, because of the lacking of word co-occurrence information \citep{qiang2020short}.
% A solution to this has been proposed through the use of external information, such as pre-trained word embeddings \citep{yi2020topic}. However, such models needs to completely rely on external information, and additional words cannot be included after training. 
% In addition, the models complexity increases linearly with the number of words in the dictionary, which can become an issue in presence of a large vocabulary size.


% \paragraph{Word embeddings for topic models.}
% The limitation of topic models considering independent words has already been studied in the literature, and models aiming at relaxing this assumption have been proposed. 
% Pre-trained word embeddings have been shown to be beneficial for topic models, to learn coherent topics through word similarities \citep{xie2015incorporating}. In particular, their benefits are mostly evident in modeling short texts, to overcome the issue of a sparse word co-occurrence \citep{yi2020topic}.
% Recent approaches have been proposed to incorporate external additional information. % to overcome the topic modeling limitation in short texts.
% \citet{xie2015incorporating} exploit a prior similarity between the words to learn coherent topics, using a regularised version of LDA that encourage similar words to be assigned to the same topic. 
% % The word correlation that drives the model needs to be given as prior knowledge. 
% % However, the correlation between the words is only used to understand correlation between the topics, and word embeddings are not learnt.

\paragraph{Word \& Topic Embedding.}
The idea of word or topic embedding has been explored in the topic modeling literature. In particular they have been used to learn coherent topics through word similarities \citep{xie2015incorporating}
% \citet{he2017efficient} learnt latent embeddings of topics, which captures the correlations among topics.
% \citet{xun2017correlated} used Word2Vec to train word embeddings and represents a topic as a multi-variant Gaussian distribution in the embedded space. 
or to represent a topic as a Gaussian distribution in the embedded space \citep{xun2017correlated}. 
Recently, topic modeling has been formulated as factor analysis, where words are embedded into a latent space \citep{yi2020topic}.
Compared to our approach, none of these methods consider temporal dynamics and word embeddings are either learned outside the topic model such as using Word2Vec or lead to dramatic changes to the topic model formulation.
% 
Other recent work shows how a dynamic LDA and word embeddings can be effectively combined \citep{dieng2019dynamic}. Similar to our framework, the word embeddings are learned within the model and separated from the topic representations. However, only discrete time stamps are considered, which does not allow the model to generalise to new time points, and topics are independent from each other.
% 
Another attempt to consider temporal dynamics has been proposed through Gaussian process latent variable models (GPLVMs), to infer a latent correlation between topics in discrete time stamps~\citep{song2008non}. Additional approaches consider embedding words through GPLVMs, and regard the resulting latent representation as topics. Topic correlation is then encoded through embedding similarity~\citep{agovic2010gaussian,hennig2012kernel}. 
% \mounia{relate back to our paper}
However, such topic models still use word embeddings to drive topic correlation. 
As we consider the correlation between the topics as an additional parameter, we can independently use (and learn from scratch) word embeddings to reliably model short texts.
% 
% The flexibility of topic models allows to incorporate additional information about the input documents into the model to learn consistent topics. 
% Our contribution enables the topic inference in documents with limited size while reducing the temporal complexity, which is constant in terms of the number of words of the vocabulary.
% The approach is still limited as its complexity is linear with the number of words (words are considered independent), so impractical with large-scale datasets.
% We aim at bridging this gap.

\paragraph{Multi-output Gaussian Process.}
% Our approach to deal with word correlation is based on the multiple output Gaussian processes (MOGPs).
% MOGPs have been extensively studied in the literature to consider Gaussian processes (GPs) for multiple related outputs \citep{alvarez2011kernels}.
MOGPs \citep{alvarez2010efficient,williams2007multi,stegle2011efficient} extend GPs by explicitly modeling the correlation among multiple output dimensions. The correlation is encoded as a covariance matrix among output dimensions, which is also known as a coregionalization matrix. The resulting model is still a GP but with a much larger covariance matrix (a Kronecker product between the coregionalization matrix and the covariance matrix based on inputs), which poses a significant challenge on computation. \cite{dai2017efficient} addressed this problem by proposing an efficient variational inference method, in which the coregionalization matrix is represented as a latent space embedding similar to Gaussian process latent variable model \citep[GPLVM;][]{lawrence2004gaussian}. Our approach extends the latent space embedding formulation of MOGP into topic modeling, which allows us to correlate the temporal processes of individual words in a topic representation.

\paragraph{Adaptive Softmax.}
The main issue in topic models is that the normalisation constant of the topic distribution depends on all of the vocabulary. The same issue can be found in language modeling or classification problems, where the number of classes to predict may be high, and computing probabilities for negative classes is too expensive \citep{bengio2008adaptive,blanc2018adaptive}. Solutions to this problem have been proposed by approximating the softmax transformation \citep{zoph2016simple,panos2021large,jean2014using,Bamler2020Extreme}. In particular, a self-normalised importance sampling procedure has been shown to effectively increase the training performance \citep{bengio2008adaptive,jean2014using}.
While not directly applicable in topic models, we show how it is possible to implement a similar procedure when estimating the topic distribution for dynamic topic models, allowing us to effectively overcome vocabulary restrictions.


\section{Dynamic Topic Modeling with Word Correlation}\label{sec:modctm}
We propose a dynamic topic model with word correlation, which we refer to as \us{} (Multi-output with Importance Sampling Topic model).
% We propose the use of a multiple output Gaussian process (MOGP) to model the topic-word association in dynamic topic models. We refer to our proposed model as \us{}.
% Our model is an extension of the dynamic correlated topic model \citep[DCTM;][]{tomasi2020stochastic}.
\us{} is a probabilistic generative model that assumes that each document $d$, associated with a specific time point $\tV_d$, is generated by sampling a set of words according to $K$ topics. 
Each document has an unnormalised topic mixing proportion $\bm\eta_d$ sampled from a prior distribution, $\bm\eta_d \sim \N(\bm\mu_{\tV_d}, \cov_{\tV_d})$, 
where $\bm\mu_{\tV_d}$ is the mean of the distribution of topic mixing proportions associated with the time point $\tV_d$, and $\cov_{\tV_d}$ is the covariance matrix of topics at $\tV_d$. 
When $\cov_{\tV_d}$ is diagonal, the mixing proportions of the topics are independent of each other.
Then, each word $w_n$ in this document is assigned with a topic $z_n$, which is sampled from the distribution $\sigma(\bm\eta_d)$, where $\sigma(x)_i = {\exp(x_i)}/{\sum_j \exp(x_j)}$ is the softmax function. 
Finally, the word $w_n$ is sampled by picking a word from the vocabulary following the unnormalised word distribution of the assigned topic $z_n$ at the time $\tV_d$, $\bm\beta_{z_n}^{(\tV_d)}$.
%

\begin{figure}
\centering
\resizebox{0.9\columnwidth}{!}{%
      \tikz{
 \node[obs] (w) {$w_{dn}$};%
 \node[latent,left=of w] (z) {$z_{dn}$}; %
 \node[latent,left=of z] (eta) {$\eta_d$}; %
 \node[latent,left=of eta,yshift=0.5cm] (mu) {$\mu$}; %
 \node[latent,left=of eta,yshift=-0.5cm] (sigma) {$\cov$}; %
\edge {mu,sigma} {eta} ; %
 
 \node[latent,left=of sigma,yshift=-0.5cm] (L) {$L$}; %
 \node[latent,left=of sigma,yshift=0.5cm] (f) {$f$}; %
 \edge {L,f} {sigma} ; %
 \plate {plate1} {(w)(z)} {$N_d$}; %
 \plate {plate3} {(plate1)(eta)} {$D$}; %
 
 \node[latent,right=of w] (beta) {$\beta_{k}$}; %
 \node[latent,right=of beta] (h) {$H$}; %
 \plate {plate2} {(beta)} {$K$}; %
 \edge {z,beta} {w};%
 \edge {h} {beta};%
 \edge {eta} {z};% 
 }}
 \caption{The graphical model for \us{}. 
 %The latent topic proportions of a document are modelled by a logistic normal distribution, so to encode global correlations between the topics.
 }\label{fig:graphical-model}
\end{figure}
\Cref{fig:graphical-model} shows an overview of the graphical model of \us{}.
The generative process of a $N_d$-word document $d$ is summarised as follows. First, draw a mixture of topics $\bm\eta_d \sim \N(\bm\mu_{\tV_d}, \cov_{\tV_d})$. Then, for each word $n = 1, \dots, N_d$:
\begin{enumerate}
    \item Draw a topic assignment $z_n | \bm\eta_d$ from a categorical distribution with parameter $\sigma(\bm\eta_d)$;
    \item Draw a word $w_n | z_n, \bm\beta$ from a categorical distribution with parameter $\sigma(\bm\beta_{z_n}^{(\tV_d)})$.
\end{enumerate}
% \begin{enumerate}
%     \item Draw a mixture of topics $\bm\eta_d \sim \N(\bm\mu_{\tV_d}, \cov_{\tV_d})$;
%     \item For each word $n = 1, \dots, N_d$:
%     \begin{enumerate}
%         \item Draw a topic assignment $z_n | \bm\eta_d$ from a categorical distribution with parameter $\sigma(\bm\eta_d)$;
%         \item Draw a word $w_n | z_n, \bm\beta$ from a categorical distribution with parameter $\sigma(\bm\beta_{z_n}^{(\tV_d)})$.
%     \end{enumerate}
% \end{enumerate}
% 
The individual documents are assumed to be i.i.d.~given the document-topic proportion and topic-word distribution. 
% \begin{equation}
% p(W|\mu, \cov, \beta) = \prod_{d=1}^D \int  p(W_d | \eta_{d}, \beta_{\tV_d}) p(\eta_d| \mu_{\tV_d}, \cov_{\tV_d}) \dif \eta_d,
% \end{equation}
% where $\tV_d$ is the time stamp of the document $d$.
Under this generative process, the marginal likelihood for a given corpus $W$ that contains $D$ documents becomes:
\begin{align}\label{eq:z-prob-model}
\begin{split}
p(&W|\bm\mu, \cov, \bm\beta) = \\
&\prod_{d=1}^D \int \prod_{n=1}^{N_d} 
\left(\sum_{z_n = 1}^K p(w_{dn} | z_n, \bm\beta_{z_n}^{(\tV_d)}) p(z_n | \bm\eta_d) \right) \\
&p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d}) \dif \bm\eta_d. 
\end{split}
\end{align}
% where $\bm\beta_{z_n}^{(\tV_d)}$ denotes the word distribution for topic $z_n$ at time point $\tV_d$.
% 
% \paragraph{Dynamics of $\mu$ and $\Sigma$.}
To model the temporal dynamics of topic mixing proportions $\bm\eta_d$, we consider temporal processes as the prior distributions for $\bm\mu$ and $\cov$.
In particular, we consider a zero-mean Gaussian process to model the topic probability $\left(\bm\mu_{\tV_d}\right)_{d=1}^D$, \ie, $p(\bm\mu) = \GP(\bm{0}, \kappa_\mu)$.
%
Similarly, we model the covariance matrices $\left(\cov_{\tV_d}\right)_{d=1}^D$ as a generalised Wishart process (GWP), indicated as $\cov_{\tV_d} \sim \GWP(V, \nu, \kappa_\theta)$ \citep{wilson2011generalised,heaukulani2019scalable,tomasi2020stochastic}.
% Wishart processes are constructed from a collections of $K\nu$ Gaussian processes, where $f_{ki} \sim \GP(\bm{0}, \kappa_\theta)$ with (shared) kernel function $\kappa_\theta$, where $\theta$ denotes any parameters of the kernel function, with $\nu \geq K$ degrees of freedom.
% Let $F_{\tV_d ki} := f_{ki}(\tV_d)$, and let $F_{\tV_d} := (F_{\tV_dki}, k \leq K, i \leq \nu)$ denote the matrix of collected function values, for every $d \geq 1$. Then, consider
% \begin{align}\label{eq:wp}
%     \cov_{\tV_d} = LF_{\tV_d}F_{\tV_d}^\top L^\top, d\geq1,
% \end{align}
% where $L\in\R^{K\times K}$ is a lower triangular matrix with the diagonal entries being positive, which satisfies the condition that $LL^\top$ is positive definite. 
% With such a construction,
% $\cov_{\tV_d} \sim \GWP(V, \nu, \kappa_\theta)$ follows a Wishart process.


\paragraph{Word Correlation.}
In dynamic topic models \citep{jahnichen2018scalable} the topic representations $\bm\beta$ are allowed to change over time. This is done by defining a GP prior over time independently for each word in each topic, so that there will be $KP$ independent GPs, where $P$ is the number of words in the vocabulary. This does not allow information sharing among similar words and results in a large number of variational parameters for inference.
% 
In this paper, we introduce correlation among words by defining a correlated temporal process for all words. 
First, we define a latent representation $\bm h_i \in \R^Q$ for each word in the vocabulary. 
The latent representations are given an uninformative prior $\bm{h}_i \sim \N(0, \mathbf{I})$. 
Then, a MOGP is defined for the topic representations over time for each topic:
\begin{align}\label{eq:mogp}
p((\bm\beta_k)_{:} | H, \tV) &= \N((\bm\beta_k)_{:} | 0, K^H \otimes K^\tV),
\end{align}
where $(\cdot)_{:}$ denotes a matrix vectorisation, $\otimes$ denotes the Kronecker product, $\bm\beta_k$ is a $T\times P$ matrix representing the unnormalised word probabilities over time for the topic $k$ ($T$ is the number of unique time points in the corpus). 
The covariance matrix $K^\tV$ is computed using the kernel function $\kappa_\tV$ over all the time points $\tV$ and the covariance matrix $K^H$ is computed using the kernel function $\kappa_H$ over all the word representations $H=(\bm{h}_1, \ldots, \bm{h}_P)$. 
With this formulation, all the words at all the time points are jointly modeled with a single GP, in which the word correlation is encoded in the $TP \times TP$ covariance matrix.
The prior distributions among different topics are assumed to be independent: $p(\bm\beta| \tV, H) = \prod_{k=1}^K p(\bm\beta_k| \tV, H)$. 
Note that the word correlations are encoded through the latent representations of words, which are static over time and shared across all the topics.
Although the number of those latent vectors is relatively large, they can be reliably estimated by conditioning on the whole corpus.

% The $\bm\beta$ parameter models the probability of words belonging to topics. Previous works consider the prior probability of $\bm\beta$ to be a Gaussian process, to model the temporal evolution of the word-topic association. 
% This requires $KW$ independent variables, one for each topic-word association, which may pose problems in practice, especially when the number of topics (or words) increases. 
% % 
% The idea behind this work is the fact that, in real applications, 
% not all of the words in the corpus under analysis are independent from each other.
% For this reason, incorporating the similarity between the words in the model, and model \emph{groups} of words through Gaussian processes (instead of modeling each word independently) enables robustness and efficient inference.
% As an example, suppose a word $w_2$ is introduced in the model as a synonym of another word $w_1$. The model should be already equipped to use $w_2$, in the same way it has been using $w_1$. This is straightforward in our model, which can use the same embedding for both $w_1$ and $w_2$.

% Conversely to standard topic models, we do not assume independence between the words in the vocabulary. We consider a MOGP prior for $p(\bm\beta)$: we regard each word as an \emph{observation}, and we can keep the complexity constant with respect to the number of words by using a sparse GP approach.
% For simplicity we assume that the vocabulary distribution of each topic is independent, so that $p(\bm\beta) = \prod_{k=1}^K p(\bm\beta_k)$, where $K$ is the number of topics.
% The probability distribution of $\bm\beta_k$ given two sets of latent variables, $\tV$ and $H$, is \citep{dai2017efficient}:
% \begin{align}
% p(\beta_{k:} | \tV, H) &= \N(\beta_{k:} | 0, K^H \otimes K^\tV),
% \end{align}
% where the latent variables $H$ have Gaussian priors $h_d \sim \N(0, I)$, $\otimes$ denotes the Kronecker product and the ``:" notation is a matrix vectorisation.
% The model assumes that the covariance matrix can be decomposed as a Kronecker product of the covariance matrix of the latent variables $K^H$ and the covariance matrix of the inputs $K^\tV$.
% In previous works $\tV$ was considered to be a vector (\eg, representing temporal dynamics).
% While our experiments consider only temporal dynamics, our model allows for an arbitrary $\tV$.


The topic assignment variables $\{z_n\}_{n=1}^{N_d}$ for individual words of each document are latent and discrete, which makes them difficult to infer with variational inference. We marginalise out these discrete variables and obtain a closed form likelihood distribution,
\begin{equation}\label{eq:word-probability}
p(W_d|\bm\eta_d, \bm\beta) = \prod_{n=1}^{N_d}\text{Cat}(w_{dn} \mid \sigma(\bm\beta^{(\tV_d)}\sigma(\bm\eta_d))),
\end{equation}
where $\bm\beta^{(\tV_d)}$ denotes the representations of all the topics at the time $\tV_d$.
With this formulation, a document can be represented in the form of word-count, allowing for a simplified formulation of our variational inference procedure.

\section{Variational Inference}\label{sec:vi}
The MOGP formulation provides an elegant framework to correlate both the temporal dimension and the words in the vocabulary for the topic representations under a single GP. 
It also brings a significant challenge for inference, because the computational complexity for calculating the probability density function (PDF) of Equation~\eqref{eq:mogp} alone is $O(P^3T^3)$.
To overcome this challenge, we develop an efficient variational inference method based on the stochastic variational sparse GP formulation~\citep[SVGP;][]{hensman2013gaussian,hoffman2013stochastic}, reducing the computational complexity to be linear with respect to $P$ and $T$. 

% to perform mini-batch training with stochastic gradient descent methods.
% Our inference method include two main features: amortised inference and reparameterisation trick~\citep{KingmaWelling2014}. The use of amortised inference enables mini-batch training in presence of local latent variables such as deep Gaussian processes~\citep{DaiEtal2015} and variational autoencoder~\citep{KingmaWelling2014}. The reparameterisation trick is used to obtain low-variance gradient estimates with Monte Carlo sampling for intractable variational lower bounds. 

% While SVI can only be applied on i.i.d. data (conditioned on a global set of parameters), assumption that is typically broken when using Gaussian processes (GPs),~\citep{hensman2013gaussian} derive a tractable lower bound to the marginal likelihood of the data to allow for data subsampling.
% Even though the log marginal likelihood of MOGP cannot be easily approximated with data sub-sampling, we use the stochastic variational sparse GP formulation~\citep{hensman2013gaussian} where an unbiased estimate of the variational lower bound could be derived from data sub-sampling, which is essential for mini-batch training. 

\subsection{Variational Inference for Word Correlation}

The word correlation is encoded by the latent representations of individual words $H$. We parameterise the variational posterior of $H$ as $q(H) = \N(m_H, S_H)$ and derive a variational lower bound,
\begin{equation}\label{eq:bound_H}
\log p(W | \tV) \geq \mathbb{E}_{q(H)} [\log p(W | \tV, H)] - \kl(q(H) || p(H)),
\end{equation}
where $\kl(\cdot|| \cdot)$ denotes the Kullback-Leibler divergence. The KL term in \eqref{eq:bound_H} can be computed in closed form because both $q(H)$ and $p(H)$ are normal distributions, but $p(W | \tV, H)$ is intractable.

To derive a lower bound for the marginalised likelihood $p(W | \tV, H)$, we first derive a variational lower bound for $\log p(\bm\beta_k | \tV, H)$ by taking the SVGP formulation. 
To take advantage of the Kronecker product structure in the covariance matrix, \ie, $K^H \otimes K^\tV$, we define the inducing variables to be on a grid in the joint space of the word embedding and the temporal dimension. Let $U_{\bm\beta_k}$ be an $M_x \times M_H$ matrix, which follows the distribution 
$%\begin{equation}
p(\uM_{\bm\beta_k} | Z_\tV, Z_H) = \N((\uM_{\bm\beta_k})_{:}|0, K_{uu}), 
$%\end{equation}
where $K_{uu} = K_{uu}^H \otimes K_{uu}^\tV$. The rows of $U_{\bm\beta_k}$ correspond to a set of inducing inputs in the temporal dimension, denoted as $Z_\tV$, and the columns of $U_{\bm\beta_k}$ correspond to a set of inducing inputs in the word embedding space, denoted as $Z_H$. Then, $K_{uu}^H$ is computed on the set of inducing inputs $Z_H$ with $\kappa_H$, while $K_{uu}^\tV$ is computed on the set $Z_\tV$ with $\kappa_\tV$.

After defining the inducing variable $U_{\bm\beta_k}$, we reformulate $p(\bm\beta_k | \tV, H)$ as
\begin{align}
p(&\bm\beta_k|\tV, H) = \int p(\bm\beta_k| \uM_{\bm\beta_k}, \tV, H, Z_\tV, Z_H) 
p(\uM_{\bm\beta_k}| Z_\tV, Z_H) \dif \uM_{\bm\beta_k}.
\end{align}
The conditional distribution of $\bm\beta_k$ is
\begin{align}\label{eq:beta-prior}
    p(\bm\beta_k|&\uM_{\bm\beta_k}, Z_\tV,  Z_H, \tV, H) = \nonumber\\
    &\N(\bm\beta_k| K_{fu}K_{uu}^{-1}\uM_{\bm\beta_k}, K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}),
\end{align}
where $K_{fu} = K_{fu}^H \otimes K_{fu}^\tV$ and $K_{ff} = K_{ff}^H \otimes K_{ff}^\tV$. $K_{ff}^H$ is the covariance matrix computed on $H$ with $\kappa_H$, and $K_{ff}^\tV$ is computed on $\tV$ with $\kappa_\tV$.

% In our model, the topic distributions $p(\bm\beta)$ follow a multi-output Gaussian process formulation.
% The inference of this Gaussian process has cubic computational complexity with respect to the number of documents. To scale the inference to large datasets, we take a stochastic variational Gaussian process~\citep[SVGP;][]{hensman2013gaussian} approach to construct the variational lower bound of our model.
% Consider the usual formulation for sparse GPs.
% We first augment the Gaussian process with an inducing auxiliary variable $U_\beta$, such that $p(U_\beta) = \N(U_\beta|0, K_{u_\beta u_\beta})$ with a set of corresponding time stamps and inducing inputs, \ie,

% \vspace{-1em}\small\begin{align*}
% p(\bm\beta|\tV, H) = 
% \int p(\bm\beta| \uM_\beta, \tV, H, Z_\tV, Z_H) 
% p(\uM_\beta| Z_\tV, Z_H) \dif \uM_\beta,
% \end{align*}\normalsize
% where $\uM_\beta$ is the auxiliary variable for $\bm\beta$ and $Z_\tV$ is the corresponding index point; $Z_H$ are inducing inputs for the latent space $H$.

% The covariance matrix is defined as $K_{uu} = K_{uu}^H \otimes K_{uu}^\tV$. $K_{uu}^H$ is computed on the set of inducing inputs $Z_H$, while $K_{uu}^\tV$ is computed on the set $Z_\tV$.
% % 
% The conditional distribution of $\bm\beta_k$ is:
% \begin{align}\label{eq:beta-prior}
%     p(\bm\beta_k|\uM_\beta, Z_\tV, & Z_H, \tV, H) \nonumber\\
%     = \N(\bm\beta_k| & K_{fu}K_{uu}^{-1}\uM_\beta, K_{ff} - K_{fu}K_{uu}^{-1}K_{uf})
% \end{align}
% where $K_{fu} = K_{fu}^H \otimes K_{fu}^\tV$ and $K_{ff} = K_{ff}^H \otimes K_{ff}^\tV$. $K_{ff}^H$ is the covariance matrix computed on $H$ with $\kappa_H$, and $K_{ff}^\tV$ is computed on $\tV$ with $\kappa_\tV$.

With the augmented GP formulation, we can derive a variational lower bound following \citet{hoffman2013stochastic}. 
However, a naive parameterisation of the variational posterior $q(\uM_{\bm\beta_k})$ using a multivariate normal distribution has an $M_xM_H \times M_xM_H$ covariance matrix, which is too large for matrix inversion.
Instead, we define $q(\uM_{\bm\beta_k})$ with a Kronecker product covariance matrix similar to $p(\uM_{\bm\beta_k})$, 
\begin{equation}
q(U_{\bm\beta_k}) = \N((U_{\bm\beta_k})_{:}| M_:, \Sigma^H \otimes \Sigma^\tV),
\end{equation}
where $M$ is the mean of the variational posterior, $\Sigma^H$ is an $M_H \times M_H$ covariance matrix and $\Sigma^\tV$ is an $M_x \times M_x$ covariance matrix.
With this formulation, the covariance matrix can be inverted efficiently by only inverting the two smaller covariance matrices, $(\Sigma^H \otimes \Sigma^\tV)^{-1} = (\Sigma^H)^{-1} \otimes (\Sigma^\tV)^{-1}$. This parameterisation also dramatically reduces the number of variational parameters in the covariance matrix from $M_x^2M_H^2$ to $M_x^2+M_H^2$.

With the variational posterior $q(U_{\bm\beta_k})$, we derive the variational lower bound for any downstream variable that consumes $\bm\beta_k$,
\begin{align}\label{eqn:bound_beta}
\log p(\cdot | H) \geq \mathbb{E}_{q(\bm\beta_k | H) }[\log p(\cdot | \bm\beta_k)] - \kl(q(U_{\bm\beta_k}) || p(U_{\bm\beta_k})),
\end{align}
where $q(\bm\beta_k|H) = \int p(\bm\beta_k| \uM_{\bm\beta_k}, H) q(\uM_{\bm\beta_k}) \dif \uM_{\bm\beta_k}$.
As the expectation $\mathbb{E}_{q(\bm\beta_k | H) }[\log p(\cdot | \bm\beta_k)]$ has no closed-form solution for our model, we approximate it with Monte Carlo integration by drawing samples from $q(\bm\beta_k | H)$. 

% Assuming $q(\bm\beta_k|\uM_\beta) = p(\bm\beta_k | \uM_\beta, \tV, H)$,
% $q(\uM_{\beta:}) = \N(M_{\beta:}, \Sigma^\uM)$
% and $q(H) = \N(m_H, S_H)$, we can derive a lower bound:
% \small\begin{align}\label{eqn:bound_beta}
% \log p(\cdot | \beta) \geq \mathcal{F} - \kl(q(\uM_\beta) || p(\uM_\beta)) - \kl(q(H) || p(H)),
% \end{align}\normalsize
% where $\mathcal{F} = \mathbb{E}_{q(\bm\beta_k | \uM_\beta)q(\uM_\beta)q(H)}[p(\cdot | \bm\beta_k)]$, and $q(\bm\beta) = \int p(\bm\beta| \uM_\beta, H) q(\uM_\beta)q(H) \dif \uM_\beta \dif H$.

The multivariate normal distribution with a Kronecker product covariance matrix, like $p(U_{\bm\beta_k})$ and $q(U_{\bm\beta_k})$, is also called the matrix normal distribution \citep{gupta1999matrix}.
In matrix normal distribution notation, $q(U_{\bm\beta_k})$ becomes $\mathcal{MN}(M, \Sigma^H, \Sigma^\tV)$. Both sampling from this distribution and computing the $\kl$ divergence can be done efficiently (details in Supplementary Material). 

% $U_{\bm\beta_k}$ can be sampled efficiently following the procedure:
% \begin{enumerate*}[label=\emph{(\roman*)}]
%     \item sample ${C}\sim\mathcal{MN}_{h\times x}(\mathbf{0},{I},{I})$, $C\in\R^{h\times x}$, a collection of independent samples from a standard normal distribution; then  
%     \item let $U_{\bm\beta_k} = (M + {ACB})_:$, where $\Sigma^H = AA^\top$ and $\Sigma^\tV = B^\top B$.
% \end{enumerate*}
% %
% The $\kl$ divergence between $q(U_{\bm\beta_k})$ and $p(U_{\bm\beta_k})$ can also be computed efficiently (see Supplementary Material).
% \begin{align*}
%     &\kl(q(U_{\bm\beta_k}) || p(U_{\bm\beta_k})) = \frac12 \bigg( M_x\log \frac{|K^H_{uu}|}{|\Sigma^H|} + M_H\log \frac{|K^\tV_{uu}|}{|\Sigma^\tV|} \\
%     &\hspace{9mm}+ \tr(M^\top(K^\tV_{uu})^{-1}M(K^H_{uu})^{-1}) + \tr((K^H_{uu})^{-1}\Sigma^H) 
%     \tr((K^\tV_{uu})^{-1}\Sigma^\tV) - M_H M_x )\bigg).
% \end{align*}
% 
% To implement $\tr[M^\top (K^\tV)^{-1}M(K^H)^{-1}]$, we use $K^\tV = L_\tV L_\tV^\top$, 
% $K^H = L_H L_H^\top$, $A = L_\tV^{-1} M L_H^{-\top}$, then $\tr[M^\top (K^\tV)^{-1}M(K^H)^{-1}] = \tr(A^\top A)$.

\paragraph{Sampling from $q(\bm\beta_k|H)$.}
To compute the expectation in \eqref{eqn:bound_beta}, we need to draw samples from $q(\bm\beta_k|H)$.
As $q(\bm\beta_k|H)$ is a multivariate normal distribution with a full covariance matrix, drawing a correlated sample of $\bm\beta_k$ is computationally very expensive, $O(P^3T^3)$.
Usually, we can avoid drawing a fully correlated sample if the downstream log PDF, $\log p(\cdot | \bm\beta_k)$, can be decomposed into a sum over the individual entries of $\bm\beta_k$, \eg, when $p(\cdot | \bm\beta_k)$ is a normal distribution.
However, due to the softmax function that is applied to $\bm\beta_k$ in \eqref{eq:z-prob-model}, such decomposition is not applicable to our model.
% 
To efficiently sample from $q(\bm\beta_k|H)$, we apply another sparse GP approximation, the FITC approximation \citep{naish2008generalized}, to the conditional distribution of $\bm\beta_k$. 
The resulting formulation is
\begin{align}\label{eq:beta-prior-fitc}
    &p_{\text{FITC}}(\bm\beta_k |U, Z_\tV, Z_H, \tV, H) = \nonumber\\
    &\N(\bm\beta_k|  K_{fu}K_{uu}^{-1}(U^\top)_:, \diag{K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}}),
\end{align}
where $\diag{\cdot}$ returns a diagonal matrix that keeps only the diagonal entries of its argument. 
Since $K_{fu}$, $K_{ff}$ and $K_{uu}$ have a Kronecker structure, we can rewrite mean and covariance to compute them efficiently.
Sampling from \eqref{eq:beta-prior-fitc} is efficient because individual entries of $\bm\beta_k$ can be sampled independently. This reduces the computational complexity of sampling $\bm\beta_k$ from $O(P^3T^3)$ to $O(PTM_x^2M_H^2)$.

\subsection{Variational Inference for Mixture of Topics}

With a variational posterior $q(\bm\eta_d)$ for each document, we can derive a variational lower bound of the log probability over the documents as: 
% \begin{align}
% \log ~&p(W|\bm\mu, \cov, \bm\beta) \nonumber\\ 
% &\geq \sum_{d=1}^D \int q(\bm\eta_d) \log \frac{p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)}) p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})}{q(\bm\eta_d)} \dif \bm\eta_d\nonumber\\
% &= \sum_{d=1}^D \Big( \mathbb{E}_{q(\bm\eta_d)}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] \nonumber\\
% &\hspace{12mm}- \kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right) \Big) \nonumber\\
% &= \mathcal{L}_W.
% \label{eqn:bound_doc}
% \end{align}
\begin{align}
\log ~&p(W|\bm\mu, \cov, \bm\beta) \geq \nonumber \\
&\sum_{d=1}^D \Big(
\E_{q(\bm\beta | H)q(\bm\eta_d)}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] 
\nonumber\\&- \kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right) \Big) \nonumber\\
&= \mathcal{L}_W.
\label{eqn:bound_doc}
\end{align}
Since the lower bound is a summation over individual documents,
this formulation allows for a stochastic approximation by sub-sampling the documents. 
% ,
% \begin{equation}
% \begin{split}
% \mathcal{L}_W \approx \frac{D}{B} \sum_{j \in \mathcal{D_B}} \Big(& \mathbb{E}_{q(\bm\eta_j)}\left[\log p(W_j | \bm\eta_{j}, \bm\beta_{\tV_j})\right]\\
% &-\kl\left(q(\bm\eta_j)||p(\bm\eta_j| \bm\mu_{\tV_j}, \cov_{\tV_j})\right) \Big), \label{eqn:minibatch_eta}
% \end{split}
% \end{equation}
% where $\mathcal{D_B}$ is a random sub-sampling of the document indices with size $B$. 
% Such sub-sampling allows us to perform mini-batch training, where the gradients of the variational parameters are stochastically approximated from a mini-batch. 

\paragraph{Importance Sampling.}
Computing the expectation \(\E_{q(\bm\beta | H)q(\bm\eta_d)}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] \) is still problematic when the number of words in the vocabulary increases, as we need to sample each word to compute the normalisation constant of the softmax function as in \Cref{eq:word-probability}. 
First, let $\bm\xi_d = \bm\beta^{(\tV_d)}\sigma(\bm\eta_d)$, and $\bm\xi_{d,n} = (\bm\xi_d)_n$. We can rewrite \eqref{eq:word-probability} as 
$
    p(W_d|\bm\eta_d, \bm\beta) = \prod_{n=1}^{N_d}\text{Cat}(\sigma(\bm\xi_d)) = \mathcal{\tilde{L}}_W.
$
Then, we can explicitly write its derivative as (details in Supplementary Material):
\begin{align}
    \nabla &\mathcal{\tilde{L}}_W 
    = \nonumber\\
    &\E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    - \sum_{i=1}^{P}\frac{\exp(\bm\xi_d)_i}{\sum_{j=1}^{P}\exp(\bm\xi_d)_j}\nabla \bm\xi_{d,i} \right].
\end{align}

In the sum inside the brackets, it is clear that we need to sample from the whole vocabulary (which has size $P$). This is inefficient and may even be infeasible for a large vocabulary. 
% 
The key idea to solve this problem, and efficiently scale our topic model to an arbitrarily large set of words in the vocabulary, is to approximate the normalisation constant with a fixed number of words, using self-normalising importance sampling \citep{bengio2008adaptive}.
% 
Let us consider the words appearing in the batch of documents under analysis as \textit{positive} (\eg, as in a positive class in a classification problem). We then borrow from the ``extreme'' classification literature the idea of using importance sampling to approximate the normalisation constant, which consists in considering a random sample of $M$ classes (in our case, words from the vocabulary) and using those to approximate the normalisation constant \citep{Bamler2020Extreme}.

Consider a sample vector $\bm s \in \{1,\dots,P\}^{M+N_d}$, which represents a
sample of words in the vocabulary and stores the indices of the $N_d$ positive words (those appearing in document $d$) and the indices of the $M$ sampled words.
Let $\xi_{d,i}^{\prime} := \xi_{d,i} - \ln(Q_{di} / P)$ if $y_i=0$ (\ie, word $i$ does not appear in document $d$), 
and $\xi_{d,i}^{\prime} := \xi_{d,i} -\ln (Q_{di})$ otherwise, where $Q_{di}$ is the proposal distribution.
We shift the true logits by the expected number of occurrences of a word $i$, ensuring that the sampled softmax is asymptotically unbiased. In our experiments we choose $Q$ to be a uniform distribution over the subset of words considered, so $Q_{di} = 1 / (N_d + M)$ \citep{jean2014using}.
Then:
\begin{align}
    % \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    % - \sum_{i=1}^{P}\frac{\exp(\xi_{i}) }{\sum_{j=1}^{P}\exp(\xi_{j})}\nabla \xi_{d,i} \right] \\
\nabla \mathcal{\tilde{L}}_W  &\approx \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\bigg[ \nabla \bm\xi_{d,n} 
    \nonumber\\&- \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \bigg].
%     \\
% = \E_{q(\bm\beta | H)q(\bm\eta_d)}\left[ \sum_{n=1}^{N_d} \nabla \bm\xi_{d,n} 
%     - N_d \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \right]
\end{align}
In this way, we further reduce the complexity of computing the expectation from $O(PTM^2_xM^2_H)$ to $O((M+N_d)TM^2_xM^2_H)$.

\paragraph{Documents meta-encoder.}
We parameterise the variational posteriors $q(\bm\eta_d)$ for individual documents as:
\begin{equation}
q(\bm\eta_{d}) = \mathcal{N}(\phi_m([W_d~ M_{\beta,{\tV_d}}]),\phi_S([W_d~ M_{\beta,{\tV_d}}])),
\end{equation}
where $\phi_m$ and $\phi_S$ are parametric functions generating the mean and variance of $q(\bm\eta_d)$, respectively, 
% $M_{\beta,{\tV_d}}$ is the mean of the variational posterior $q(\bm\beta_k|H)$
$M_{\beta,{\tV_d}}$ is the mean of the GP prediction at the inducing point location $Z_H$, 
and $[A~B]$ denotes the concatenation of the matrices $A$ and $B$. 
Instead of implicitly learning the topic information into $\phi_m$ and $\phi_S$ as in \citet{tomasi2020stochastic}, 
we explicitly pass in a summary of all the topic representations at the time point $\tV_d$.
We define $M_{\beta,\tV_d}$ as the mean of the GP prediction at inducing point location $Z_H$, to keep the complexity constant with respect to the number of words in the vocabulary.
We treat such prediction as the summary of all the topic representations at input $\tV_d$ because the inducing variable in sparse GP can be viewed as a summary of all the data \citep{titsias2009variational}.
By having the topic representations as inputs, the encoder does not need to ``memorise'' the information about topics, but only to link a document to the relevant topic representations.
Therefore we refer to $\phi_m$ and $\phi_S$ as the meta-encoder.
%As they encode the output of the posterior of~$\beta$, $\phi_m$ and $\phi_S$ are considered meta-encoders.
%With such formulation, a common set of parameters are always updated no matter which documents are sampled into the mini-batch, thus overcoming the synchronisation issue. In addition, the index $\tV_d$ of the document is taken into account by $M_\beta_{\tV_d}$.

\paragraph{Lower bound.} Note that the lower bound $\mathcal{L}_W$ is intractable. We compute an unbiased estimate of $\mathcal{L}_W$ via Monte Carlo sampling. As $q(\bm\eta_d)$ are normal distributions, we obtain a low-variance estimate of the gradients via the reparameterisation strategy~\citep{KingmaWelling2014}.

% A lower bound for $\bm\beta$ can then be derived:
% \begin{align}
%     \log p(\cdot | \beta) \geq &~ \mathbb{E}_{q(\bm\beta)} [p(\cdot | \beta)] - KL(q(U_\beta) || p(U_\beta)) \nonumber\\
%     &- KL(q(H) || p(H)), \label{eqn:bound_beta}
% \end{align}
% where $q(\bm\beta) = \int p(\bm\beta| \uM_\beta, H) q(\uM_\beta)q(H) \dif \uM_\beta \dif H$.
% \subsection{Variational Inference for GP and GWP}
The document-topic proportion for each document $d$ follows a prior distribution $p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})$, 
where the Gaussian process $p(\bm\mu) = \GP(0, \kappa_\mu)$ provides the mean
and the Wishart process $p(\Sigma) = \GWP(V,\nu,\kappa_\theta)$ provides the covariance matrix at $\tV_d$. 
To enable efficient inference for both GP and GWP, we take an SVGP approach to construct the variational lower bound of our model \citep{tomasi2020stochastic}. 
% 
% \paragraph{Inference for $\bm\mu$.}
% We first augment the Gaussian process with a set of auxiliary variables with a set of corresponding time stamps, \ie, 
% \begin{align}
% p(\bm\mu|\tV) = \int p(\bm\mu| \uM_\mu, \tV, \zV_\mu) p(\uM_\mu| \zV_\mu) \dif \uM_\mu, \label{eqn:aux_mu}
% \end{align}
% where $\uM_\mu$ is the auxiliary variable for $\bm\mu$ and $\zV_\mu$ is the corresponding index. 
% Both $p(\bm\mu| \uM_\mu, \tV, \zV_\mu)$ and $p(\uM_\mu| \zV_\mu)$ follow the same Gaussian processes as the one for $p(\bm\mu|\tV)$, \ie, these Gaussian processes have the same mean and kernel functions. 
% As shown in \Cref{eqn:aux_mu}, the above augmentation does not change the prior distributions for $\bm\mu$.
% 
% The variational posterior of $\bm\mu$ is constructed in a special form to enable efficient inference \citep{pmlr-v5-titsias09a}: $q(\bm\mu, \uM_\mu) = p(\bm\mu| \uM_\mu) q(\uM_\mu)$. $q(\uM_\mu) = \mathcal{N}(\mM_{\mu},\sM_{\mu})$ is a multivariate normal distribution, in which the mean and covariance are variational parameters.
% $p(\bm\mu| \uM_\mu)$ is a conditional Gaussian process \citep{hensman2013gaussian}.
% When $\bm\mu$ is used in the down-stream distributions, a lower bound can be derived,
% \begin{align}
% \log p(\cdot|\bm\mu) \geq \mathbb{E}_{q(\bm\mu)}[p(\cdot|\bm\mu)] - \text{KL}\left(q(\uM_\mu)||p(\uM_\mu)\right), \label{eqn:bound_mu}
% \end{align}
% where $q(\bm\mu) = \int p(\bm\mu| \uM_\mu) q(\uM_\mu) \dif \uM_\mu$.
% 
% \paragraph{Inference for $\Sigma$.}
% We derive a similar stochastic variational inference method for the Wishart Process. 
% We augment each GP $p(\fV_{ij}|\tV)$ in the Wishart process with a set of auxiliary variables and a set of the corresponding inputs, 
% \begin{align}
% p(\fV_{ij}|\tV) = \int p(\fV_{ij}| \uV_{ij}, \tV, \zV_{ij}) p(\uV_{ij}| \zV_{ij}) \dif \uV_{ij}, 
% \end{align}
% where $\uV_{ij}$ is the auxiliary variable, $\zV_{ij}$ is the corresponding inputs and $p(\fV_{ij}| \uV_{ij})$
% is a conditional Gaussian process \citep{hensman2013gaussian}.
% %
% We define the variational posterior of $\fV_{ij}$ to be $q(\fV_{ij}, \uV_{ij}) = p(\fV_{ij}| \uV_{ij}) q(\uV_{ij})$, where $q(\uV_{ij}) = \N(\bm m_{ij}, \bm s_{ij})$. We also define the variational posterior of $\bm{\ell}$ to be $q(\bm{\ell}) = \N(\bm{m}_\ell, {S}_\ell)$, where ${S}_\ell$ is a diagonal matrix. As the diagonal elements of $\lM{}$ needs to be positive, we apply a {change of variable} to the variational posterior of the diagonal elements, \ie, $\ell_m = \log(1+\exp(\hat{\ell}_m)), q(\hat{\ell}_m) = \N(\bm m_{\ell_m}, S_{\ell_m})$.
% % 
% Note that $\bm z_\mu$ and $\bm z_{ij}$ are variational parameters instead of random variables. For this reason, we will omit them from the notation for convenience.
% 
% We can derive a variational lower bound with such a set of variational posterior for all the entries $\{\fV_{ij}\}$ and $\bm{\ell}$, when $\cov$ is used for some down-stream distributions,
% \begin{align}\label{eqn:bound_sigma}
% % &\log p(\cdot|\cov) \geq \mathbb{E}_{q(\fM)}[p(\cdot|\cov)] - \sum_{i,j} \text{KL}\left(q(\uV_{ij})||p(\uV_{ij})\right),\\
% &\log p(\cdot|\cov) \geq \mathbb{E}_{q(\fM)q(\bm\ell)}[p(\cdot|\cov)] - \sum_{i,j} \text{KL}\left(q(\uV_{ij})||p(\uV_{ij})\right) 
% - \text{KL}\left(q(\bm{\ell})||p(\bm{\ell})\right),
% \end{align}
% where $q(\fM) = \prod_{ij}\int p(\fV_{ij}| \uV_{ij}) q(\uV_{ij}) \dif \uV_{ij}$.
% 
% 
% 
We can derive the complete variational lower bound $\mathcal{L}$ of \us{} combining the lower bounds \eqref{eqn:bound_doc}, \eqref{eq:bound_H}, \eqref{eqn:bound_beta} and the lower bounds for GPs and GWP (details in Supplementary Material).

% \subsection{Lower Bound for \us{}}
% After deriving the variational lower bounds for the individual components of \us{}, we assemble these components together to form the final variational lower bound.
% The word distributions for individual topics are used in defining the distribution of individual words for each document $d$, $p(W_d|\bm\eta_d, \bm\beta^{(\tV_d)})$. 
% % To simplify the notation, we follow \citep{tomasi2020stochastic} and 
% % Let $q(\Sigma) \propto q(\bm\ell)q(F)$, and $\kl(q(\Sigma) || p(\Sigma)) = \sum_{i,j} \kl\left(q(\uV_{ij})||p(\uV_{ij})\right) + \kl\left(q(\bm{\ell})||p(\bm{\ell})\right)$.
% Combining the lower bounds \eqref{eqn:bound_doc}, \eqref{eq:bound_H}, \eqref{eqn:bound_beta}, 
% % \eqref{eqn:bound_mu} and \eqref{eqn:bound_sigma}, 
% and the lower bounds for GPs and GWP,
% we can derive the complete variational lower bound $\mathcal{L}$ of \us{}.
% \begin{align*}
% \begin{split}
% \log ~&p(W) \geq  \mathbb{E}_{q(\bm\mu)q(\bm\ell)q(F)q(\bm\beta)}\left[\mathcal{L}_W\right]
% % &\sum_{d=1}^D \Big( \mathbb{E}_{q(\bm\eta_d)q(\bm\beta^{(\tV_d)})}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] \\
% % &\hspace{2mm}-\mathbb{E}_{q(\bm\eta_d)q(\bm\mu_{\tV_d})q(\Sigma_{\tV_d})}\left[\kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right)\right] \Big) \\
% - \kl\left(q(\uM_\beta)||p(\uM_\beta)\right) - \kl\left(q(H)||p(H)\right) \\
% &- \kl\left(q(\uM_\mu)||p(\uM_\mu)\right)  - \kl\left(q(\bm{\ell})||p(\bm{\ell})\right) - \sum_{i,j} \kl\left(q(\uV_{ij})||p(\uV_{ij})\right)  = \mathcal{L}.
% \end{split}
% \end{align*}
% % 
% The first term of $\mathcal{L}$ can be further decomposed by plugging in \Cref{eqn:bound_doc},
% \begin{align}\label{eq:empirical-elbo}
% &\E_{q(\bm\mu)q(\bm\ell)q(F)q(\bm\beta)}\left[\mathcal{L}_W\right] = \nonumber\\
% &\hspace{2mm}\sum_{d=1}^D \Big( \E_{q(\bm\eta_d)q(\bm\beta)}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] - \mathbb{E}_{q(\bm\eta_d)q(\bm\mu_{\tV_d})q(\Sigma_{\tV_d})}\left[\kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right)\right] \Big). \nonumber
% \end{align}
% % This formulation allows us to easily perform mini-batch training by data sub-sampling. For each mini-batch, we randomly sub-sampling the data set and re-weight the term $\mathbb{E}_{q(\bm\mu)q(\Sigma)q(\bm\beta)}\left[\mathcal{L}_W\right]$ according to the ratio between the size of dataset and the size of the mini-batch as shown in \Cref{eqn:minibatch_eta}.
% Note that all variational parameters of $q(\bm\mu)$, $q(\bm\ell)$, $q(F)$, $q(\bm\beta)$, $q(\bm\eta)$ are optimised.


\section{Experiments}\label{sec:experiments}

We compared \us{} with several static and dynamic topic models:
\begin{enumerate*}[label=\emph{(\roman*)}]
  \item LDA with mean-field variational inference~\citep{hoffman2010online,scikit-learn};
  \item CTM with variational inference~\citep{blei2006correlated};\footnote{We additionally infer the variational posteriors for the topic representations $\bm\beta$, for which only point estimates are inferred by~\citet{blei2006correlated}.}
  \item dynamic embedded topic model~\citep[DETM;][]{dieng2019dynamic}, and 
  \item dynamic correlated topic model~\citep[DCTM;][]{tomasi2020stochastic}.
\end{enumerate*}
We do not compare with other previously proposed dynamic models that only cater for independent topics, \eg, DTM~\citep{blei2006dynamic}, FastDTM~\citep{bhadury2016scaling} (which do not handle continuous dynamics), and gDTM~\citep{jahnichen2018scalable} (which only considers dynamics for $\bm\beta$), as both DETM and DCTM have been shown to generalise and improve on such models.

\paragraph{Performance Analysis.}
We first empirically evaluated the benefit of \us{} using synthetic datasets.
%, using the importance sampling to scale the model to tens of millions of words.
% 
\begin{figure}[t]
  \centering
    \includegraphics[width=0.95\linewidth]{scalability_new.png}
    \caption{Average time to compute 5 epochs in a dataset with increasing number of words.}
    \label{fig:scalability}
\end{figure}
We show the performance of \us{} against the DCTM model \citep{tomasi2020stochastic} in \Cref{fig:scalability} on a dataset with an increasing number of words (from 100 to 10M), while keeping a fixed number of samples at different time points. % We include only dynamic topic models for fair comparison. 
Our proposed model consistently outperforms DCTM across all the vocabulary sizes.
In particular, the computational benefit of \us{} is more evident after reaching 100K (and 1M) words.
On the dataset with 10M words, DCTM could no longer be used, as it was computationally intractable. Instead, \us{} is able to scale to 10M words, with an average computational time that is lower than the time required by DCTM with 1M words. Similarly, \us{} with 1M words took less time than DCTM with 100K words.

\paragraph{Quantitative Analysis.}
We here showcase the benefit of incorporating word correlation in topic modelling by comparing \us{} with state-of-the-art topic models on public datasets. The common parameters across the models have been kept the same for a fair comparison (\eg, the number of topics). Here, we fix the number of topics to be small (30 topics for all datasets apart from SotU, which uses 20) so that we are able to compare with the baselines. In addition, our experiments do not show relevant differences across models when varying the number of topics (analysis in the Supplementary Material).
In all datasets, there is a timestamp associated with each document.
Static topic models (LDA and CTM) are optimised without considering the timestamps, while DCTM and \us{} incorporate the continuous timestamps into {the} inference. For DETM, which considers discrete times, we discretised the timestamps into 30 bins to make the inference computationally feasible on our machine.

We split each dataset considering 75\% of the samples as training and 25\% as test. Documents associated with the same time stamps were assigned to the same split.
% 
For each dynamic topic model we used a Mat\'ern 3/2 kernel for $\bm\beta$, to allow topics to quickly incorporate new words. This is important to incorporate neologisms, particularly for datasets such as NeurIPS conference papers and Elsevier (Abstracts) corpus, where the names of novel models become quoted in citations (for example, ``LDA'' starting to appear in publications together with ``topic modeling'' after its introduction in 2003). 
For the other parameters $\bm\mu$ and $\bm f$ we use a squared exponential kernel, as we expect a smooth temporal evolution of both topic probabilities and their correlation. 
Full details on data, model parameters and experimental settings can be found in the Supplementary Material.

% \paragraph{Model configurations.}
% For each dynamic topic model we used a Mat\'ern 3/2 kernel for $\bm\beta$, to allow topics to quickly incorporate new words. This is important especially to incorporate neologisms, and particularly for datasets such as NeurIPS conference papers and Elsevier corpus, where the names of novel models become quoted in citations (for example, "LDA" starting to appear in publications together as "topic modeling" after its introduction in 2003). 
% For the other parameters $\bm\mu$ and $\bm f$ we use a squared exponential kernel, as we expect a smooth temporal evolution of both topic probabilities and their correlation. 
% We initialise amplitude and length scale of kernels as $1$ and $0.5$ respectively, and we optimise for them %along with other prior parameters for the distribution of $\beta$, $\mu$ and $f$, 
% using the approximate empirical Bayes approach~\citep{maritz2018empirical}. 

% %\paragraph{Hyperparameters.}
% %We implemented our models using the TensorFlow framework \citep{dillon2017tensorflow}. 
% Experiments were conducted using Adam optimiser with learning rate $0.001$ and up to $10$k epochs until convergence.
% % 
% We experimented with different number of topics, and report the results using a default choice of 30 for all datasets (20 for SotU) to maintain consistency with previous works.
% We also experimented with a different number of inducing points for the three components $\bm\beta$, $\bm\mu$ and $\bm f$, thus controlling the complexity of the variational posterior used from both DCTM and our models (static models such as LDA and CTM do not have such dynamic components). The number of inducing points used for such components is 15, 20 and 15, respectively. \us{} has an additional component for the latent embedding of words in $\bm\beta$; we used $M_H = 200$ in $Q = 10$ dimensions.
% We initialised the posterior for $H$ by transforming the words in our vocabulary using ELMO embeddings \citep{peters2018deep} pre-trained on the 1 Billion Word Benchmark, and take the first $Q$ principal components using a PCA transformation.

% For the posterior of $\bm\eta$, when using a static encoder (\eg, for DCTM) we considered a dense neural network with three layers with size $500$, $300$ and $200$, respectively. To account for the increased input dimensionality in our meta-encoder we instead used a dense neural network with three layers, with size $1000$, $600$ and $400$, respectively.\footnote{Since the encoder is a collection of variational parameters, we emphasise that increasing its size does not overfit the model.}

\begin{table}[]
\caption{Average per-word perplexity (the lower the better) on public datasets.
% We indicate with $\dag$ the results computed on discretised timestamps.
}
\label{tab:results}
\centering
\resizebox{0.99\linewidth}{!}{%
\begin{tabular}{ccccccc}
\toprule
Dataset & \#words & LDA & CTM & DETM & DCTM & \us{} \\\midrule
% SotU & 1442 & 1578.93 & 1056.56 & 1353.24 &  957.52 & \textbf{846.81} \\
% \multirow{2}{*}{DoJ} & 1000 & 532.15  & 486.96  & 308.46 & 304.68 & \textbf{303.71}& \\
% DoJ & 2622 &  1043.77  &  937.97 &  & 523.65  &   \textbf{498.96} \\
% Abstracts & 3000 & 3461.39  &  2170.71  & 1597.34 & 1163.87  &  \textbf{1120.13}  \\
Blogs & 3000 & 1538.74 & 1525.01 & 1305.34 & 1013.71 &  \textbf{949.76}  \\%[0.5em]
% News & 3000 & 8721.69 & 3752.71 & & 1523.76 &  \textbf{1414.90}  \\ 
% Twitter & 3000 & 9221.66 & 1928.06 & & 946.09 &  \textbf{866.81} \\
SotU & 4583 & 3090.52 & 1937.19 & 3069.54 & 2205.01  & \textbf{1675.32} \\
NeurIPS & 4799 & 1321.51  & 1241.72 & 941.91 & 1012.51 &  \textbf{888.59}\\
DoJ & 9591 &  1459.10  &  931.23 & 936.00 & 928.08  &   \textbf{613.37} \\
Abstracts & 13126 & 3206.09  &  2918.36  & 2566.50 & -  & \textbf{1857.67}  \\
News & 22459 & 5351.83 & 3553.73 & 1957.25 & - &  \textbf{1703.77}  \\ 
Twitter & 83582 & 7101.83 & 8576.97 & 2721.05 & - & \textbf{2595.14} \\
\bottomrule
\end{tabular}
}
\end{table}

We report the average per-word perplexity computed on the held-out test set of the datasets for all the models in \Cref{tab:results}. The per-word perplexity is a goodness-of-fit measure for comparing models, computed as the exponential of the average negative predictive log-likelihood per word \citep{cdtm}:
\begin{align*}
    \text {perpl}_{\text{pw}}(t)=\exp \left\{-\frac{1}{\left|D_{t}\right|} \sum_{d \in D_{t}} \frac{\log p(W_d | \bm\eta_d, \bm\beta^{(\bm x_d)})}{N_{d}}\right\},
\end{align*}
% In cases where the perplexity is computationally intractable, we approximate it using a subset of words (details in Supplementary Material).
% 
where $\log p(W_d)$ is estimated as in \Cref{eqn:bound_doc}.
\us{} consistently outperforms all the baselines on all the datasets.
% We show that by considering a temporal dynamics, topic models generalise better to documents belonging to unseen time points.
% \us{} and \us-F are both better than the baselines, but the performance among them varies according to the dataset. In general, however, the per-word perplexity is similar among them, which validates \us-F as reliable approximation of \us{}.
%
The benefit in using our models is more evident for the datasets that have a larger time span (SotU) or a shorter document size (Blogs, News, Twitter). 
There is also a significant performance gap between static and dynamic topic models, which demonstrates the advantages of incorporating temporal information into topic modeling.
Comparing \us{} to DCTM, the perplexity decreases on average by 10\%, which shows that incorporating word correlation into topic modeling can significantly improve the quality of modeling.
When using large scale datasets (more than 10K words) we were not able to run DCTM as the size of the vocabulary was intractable. However, we were able to use our model \us{}, and obtain better % (approximate)
perplexity than static topic models and the discrete time topic model DETM (for which we had to discretise the time stamps of the documents into 30 bins).
% The approximate perplexity metric is a reasonable approximation of the real perplexity. We computed it with 1000 randomly sampled words. We also compared the approximate perplexity for NeurIPS with respect to the true perplexity with similar results (887.74 vs 888.59).

% Furthermore, we computed a coherence score as an additional measure to validate the topics as extracted by the compared models (Suppl. material).

We note that dynamic topic models allow us to keep the number of topics low with respect to static topic models, because topics are linked through time. Hence, while static models need many topics to be able to cluster words at different time stamps, dynamic models can adapt the words related to the same topic over time to achieve the same (or better) results with a small number of topics. Furthermore, our proposed model, through the use of the Kronecker decomposition, is able to keep the number of parameters low in terms of word representations. Instead of independently learning the representation of each word, we learn a representation of groups of similar words, which allows us to scale the model and train in the presence of few words in documents.

\paragraph{Qualitative Analysis on NeurIPS dataset.}
% \begin{figure}[t]
% \centering
% \begin{minipage}[b][][b]{.45\textwidth}
%   \centering
%     \includegraphics[width=\linewidth]{heatmap_correlation_2.png}
%     \caption{Heatmap of the correlation matrix for the top-10 frequent words for four topics inferred by \us{} for NeurIPS dataset.}
%     \label{fig:heatmap}
% \end{minipage}%
% \hfill
% \begin{minipage}[b][][b]{.45\textwidth}
%   \centering
%     \includegraphics[width=\linewidth]{scalability.png}
%     \caption{Average time to compute 5 epochs in a dataset with increasing number of words.}
%     \label{fig:scalability}
% \end{minipage}
% \end{figure}
\begin{figure}[t]
  \centering
    \includegraphics[width=0.95\linewidth]{heatmap_correlation_2.png}
    \caption{Correlation matrix for top-10 frequent words in four topics inferred by \us{} for NeurIPS dataset.}
    \label{fig:heatmap}
\end{figure}

To provide insights about the word correlation in \us{}, we visualise the inferred word correlation. 
We choose four inferred popular topics across all years on the NeurIPS dataset and collect the top-10 frequent words for each topic (Supplementary Material). %, shown in~\Cref{tab:word-topic-distribution}. 
Then, we compute the covariance matrix among these frequent words (duplicate words are removed) by applying the learnt kernel function $\kappa_H$ to the mean of the variational posterior of the word representations $m_H$.
The covariance matrix is converted into a correlation matrix for better interpretability.
We visualise the resulting correlation matrix using a heatmap (\Cref{fig:heatmap}).
The color map is shown in the left top corner of the plot. 
A lighter color indicates a stronger correlation and a darker color indicates a weaker correlation. 
Due to the choice of the kernel function (squared exponential) no anti-correlation is captured in the correlation matrix.
We also applied a simple hierarchical clustering to the correlation matrix. 
With only the word relation, the words associated with the same topic are roughly grouped together (topics are here unknown to the clustering algorithm).
% 
% Indeed, we notice how some of the words with embedding distant to other words in the same topic are ``\textit{learn}'', ``\textit{train}'' and ``\textit{statist}'', each having a general meaning in the machine learning literature which cannot identify (by themselves) a specific topic. 
% Indeed, we notice how most of the correlated words belong to the same topic. 
For example, \emph{network}, \emph{weight}, \emph{neural} and \emph{layer}, which identify the topic \emph{neural network}, have a very similar embedding. The word pairs that are often used together in some research area show interesting strong correlations such as \emph{input}-\emph{output}, \emph{imag}-\emph{pixel}, \emph{time}-\emph{state}. 
This indicates that the word correlation has contributed to the identification of these topics.

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{figures/word_proba4.png}
    \caption{Word counts in the dataset and estimated word probability in dynamic topic models for a few low-frequency words.}
    \label{fig:word-proba}
\end{figure}
% \begin{figure}[t]
%     \centering
%     \includegraphics[width=\linewidth]{figures/word_proba_twinx.png}
%     % \caption{Word probability in "Bayesian inference" topic.}
%     \label{fig:word-proba}
% \end{figure}

We analyse the performance of \us{} in terms of modeling infrequent keywords compared to previous methods (see \Cref{fig:word-proba}). We plot the per-year word counts of three keywords (``Wishart'', ``Kalman'' and ``Hebbian'') in the NeurIPS dataset in the top row. All three words appear on average fewer than 100 times a year, which makes them infrequent, but they are all clear indicators of ML and neuroscience sub-fields. 
We then compare the word probabilities of each word in the topic with the strongest connection to the word inferred by \us{}, DCTM and DETM. 
We plot the posterior of $\mu$ in the topic inferred by \us{} where the word is most prominent, and the posterior of $\beta$ for each one of the dynamic topic models we tested (we omit static topic models as the result would be a flat line of the word probability as it is independent of time). 
%For a fair comparison, we plot $\beta$ for the best topic in each model, as the order of topics may change across models.
\us{} is shown to be the most accurate in modelling these words, by capturing the general dynamics of the word in the dataset but without overfitting it.
% For example, note that the word "kalman" (which refers to the Kalman filter) is increasingly appearing over time (top plot). However, the topic related to the Kalman filters is decreasing over time, in line with the decreasing applications of the filter in recent years. \us{} is able to model an increasing probability after the Kalman filter was introduced, but decreasing in later years 

For example, consider the word ``Wishart'' corresponding to the Wishart distribution (and process). The word counts are very low (less than 50 counts in total each year). However, the word is very much indicative of the Bayesian inference topic, as the Wishart distribution is the conjugate prior of the inverse covariance matrix of a multivariate normal. Indeed, it has a high correlation with the much more common word ``posterior'' ($\rho=0.688$ using the learnt $\kappa_H$ from \us{}).
\us{} is able to accurately model the increasing dynamics of this word over time (middle row). Conversely, DCTM and DETM, which consider words independently, do not have enough data to accurately model its dynamics.
% DETM is able to learn accurate spikes of the words especially when the word is more common (such as in the "perceptron" example). However, 

% Interestingly, the most correlated word as found by our model \us{} is \emph{wishart} (correlation $0.947$), a word which is relatively rare in the dataset (\Cref{fig:word-count}). We argue this may be due to the fact that while only a restricted subset of documents mention \emph{wishart} (\eg, for Wishart distributions), that is a very relevant word in the Bayesian inference literature (for example as the conjugate prior for the covariance matrix of a multivariate normal distribution). Instead, words such as \emph{posterior} are slightly less correlated to \emph{bayesian} (correlation $0.850$), even though it still has a very high probability of appearing in the topic. 

% It is also worth noting how our model inherently makes stemming redundant. For example, consider the words \textit{classifi} and \textit{classif}. A language processing expert could argue that stemming failed in this case, as words such as \textit{classification} and the verb \textit{classify} could be considered equivalent. Indeed, we notice how both words have very close embedding: while the words are different, \us{} was able to automatically assign almost the same embedding to the words, understanding their interchangeability. We argue that, in real cases, our \us{} could be most beneficial to avoid a stemming preprocessing.


\section{Conclusion}\label{sec:conclusion}
We developed an efficient approach to model word correlation in dynamic topic modeling.
Our approach incorporates word dynamics through the use of multi-output Gaussian processes. 
We improved the amortised inference by proposing a meta-encoder, which allows \us{} to be more sensitive to the changes of topic representations. 
Finally, we enable scalable inference for large vocabularies by deriving an asymptotically unbiased estimator of the gradient that allows us to dramatically subsample the number of words in computation.
%
As shown in our experiments, incorporating word correlation into dynamic topic modeling significantly improves the modeling quality and allows topic models to leverage information from related words. 

\bibliography{main_bib}


% SUPPLEMENTARY
\appendix
\newpage
\onecolumn

\title{Supplementary: Efficient Inference for Dynamic Topic Modeling with Large Vocabularies}
\maketitle
\section{Importance Sampling}
We can write the probability of words in a document conditioned on the parameter $\bm\eta_d$ and $\bm\beta$ as:
\begin{align}
    p(W_d|\bm\eta_d, \bm\beta) = \prod_{n=1}^{N_d}\text{Multi}(1, \sigma(\bm\xi_d)) = \prod_{n=1}^{N_d}\text{Cat}(\sigma(\bm\xi_d)) = \mathcal{\tilde{L}}_W.
\end{align}
Its derivative can be derived as:
\begin{align}
    \nabla \mathcal{\tilde{L}}_W 
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \log \text{Cat}(\sigma(\bm\xi_d))\right] \\
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n}
    - \nabla \log \sum_{j=1}^{P}\exp(\bm\xi_d)_j\right] \\
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    - \frac{1}{\sum_{j=1}^{P}\exp({\bm\xi_{d}}_j)}\nabla \sum_{i=1}^{P}\exp(\bm\xi_d)_i\right] \\
    &= \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    - \sum_{i=1}^{P}\frac{\exp(\bm\xi_d)_i}{\sum_{j=1}^{P}\exp(\bm\xi_d)_j}\nabla \bm\xi_{d,i} \right].
\end{align}

To approximate this derivative, we consider a random sample of $M$ words from the vocabulary and use those to approximate the normalisation constant.
% 
Consider a sample vector $\bm s \in \{1,\dots,P\}^{M+N_d}$, which represents a
sample of words in the vocabulary and stores the indices of the $N_d$ positive words (words appearing in document $d$) and the indices of the $M$ sampled words.
Let $\xi_{d,i}^{\prime} := \xi_{d,i} - \ln(Q_{di} / P)$ if $y_i=0$ (\ie, word $i$ does not appear in document $d$), and
$\xi_{d,i}^{\prime} := \xi_{d,i} -\ln (Q_{di})$ otherwise, where $Q_{di}$ is the proposal distribution.
We shift the true logits by the expected number of occurrences of a word $i$, ensuring that the sampled softmax is asymptotically unbiased. In our experiment we choose $Q$ to be a uniform distribution over the subset of words considered, so $Q_{di} = 1 / (N_d + M)$ \citep{jean2014using}.
Then:
\begin{align}
    % \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    % - \sum_{i=1}^{P}\frac{\exp(\xi_{i}) }{\sum_{j=1}^{P}\exp(\xi_{j})}\nabla \xi_{d,i} \right] \\
\nabla \mathcal{\tilde{L}}_W  \approx \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[ \nabla \bm\xi_{d,n} 
    - \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \right].
%     \\
% = \E_{q(\bm\beta | H)q(\bm\eta_d)}\left[ \sum_{n=1}^{N_d} \nabla \bm\xi_{d,n} 
%     - N_d \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \right]
\end{align}

\section{FITC}
The FITC approximation for the multi-output Gaussian process results in the following formulation:
\begin{align*}
    p_{\text{FITC}}&(\beta|U, Z_X, Z_H, X, H) \\
    = \N(&\beta|  K_{fu}K_{uu}^{-1}(U^\top)_:, \diag{K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}}),
\end{align*}

where $\diag{\cdot}$ returns a diagonal matrix while keeping the diagonal entries, and $A_:$ denotes $\mathrm{vec}(A)$, the column-wise vectorisation of the matrix $A$.
Since $K_{fu}$, $K_{ff}$ and $K_{uu}$ have a Kronecker structure, we can rewrite mean and covariance to compute them efficiently as follows
\begin{align*}
    K_{fu}K_{uu}^{-1}(U^\top)_: = (K_{fu}^X{K_{uu}^X}^{-1}U^\top {K_{uu}^H}^{-\top}{K_{fu}^H}^\top)_:
\end{align*}
\begin{align*}
    & \diag{K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}} \\
    &= \diag{K_{ff}} \\
    &- \left(\diag{K_{fu}^H{K_{uu}^H}^{-1}{K_{fu}^H}^\top} \otimes \diag{K_{fu}^X{K_{uu}^X}^{-1}{K_{fu}^X}^\top}\right)_:
\end{align*}
Note that the last line becomes a vectorised outer product between vectors, which can be computed efficiently. We can use the same trick for $\diag{K_{ff}}$. 

The full derivation is the following:

\begin{align*}
    &K_{fu}K_{uu}^{-1}(U^\top)_: \\
    &=
    (K_{fu}^H \otimes K_{fu}^X)(K_{uu}^H \otimes K_{uu}^X)^{-1}(U^\top)_: \\
    &= (K_{fu}^H \otimes K_{fu}^X)({K_{uu}^H}^{-1} \otimes {K_{uu}^X}^{-1})(U^\top)_: \\
    &= (K_{fu}^H{K_{uu}^H}^{-1} \otimes K_{fu}^X{K_{uu}^X}^{-1})(U^\top)_: \quad \text{(matrix eq)} \\
    &= (K_{fu}^X{K_{uu}^X}^{-1}U^\top(K_{fu}^H{K_{uu}^H}^{-1})^\top)_:\\
    &= (K_{fu}^X{K_{uu}^X}^{-1}U^\top {K_{uu}^H}^{-\top}{K_{fu}^H}^\top)_:
\end{align*}

\begin{align*}
    & \diag{K_{ff} - K_{fu}K_{uu}^{-1}K_{uf}} \\
    &= \diag{K_{ff}} - \diag{K_{fu}K_{uu}^{-1}K_{uf}} \\
    &= \diag{K_{ff}} - \diag{(K_{fu}^H{K_{uu}^H}^{-1} \otimes K_{fu}^X{K_{uu}^X}^{-1})(K_{fu}^H \otimes K_{fu}^X)^{\top}} \\
    &= \diag{K_{ff}} - \diag{K_{fu}^H{K_{uu}^H}^{-1}{K_{fu}^H}^\top \otimes K_{fu}^X{K_{uu}^X}^{-1}{K_{fu}^X}^\top} \\
    &= \diag{K_{ff}} - \left(\diag{K_{fu}^H{K_{uu}^H}^{-1}{K_{fu}^H}^\top} \otimes \diag{K_{fu}^X{K_{uu}^X}^{-1}{K_{fu}^X}^\top}\right)_:
\end{align*}

\section{Matrix Normal Distribution}
The matrix normal is related to the multivariate normal distribution in the following way:
\begin{align}
\mathbf{X} \sim \mathcal{MN}_{n\times p}(\mathbf{M}, \mathbf{U}, \mathbf{V}),
\end{align}
if and only if
\begin{align}
\mathrm{vec}(\mathbf{X}) \sim \mathcal{N}_{np}(\mathrm{vec}(\mathbf{M}), \mathbf{V} \otimes \mathbf{U})
\end{align}
where $\otimes$  denotes the Kronecker product and $\mathrm{vec}(\mathbf{M})$ denotes the vectorization of $\mathbf {M}$.

Sampling from the distribution and the $\kl$ divergence can be computed efficiently.
$U_{\bm\beta_k}$ can be sampled efficiently following the procedure:
\begin{enumerate*}[label=\emph{(\roman*)}]
    \item sample ${C}\sim\mathcal{MN}_{h\times x}(\mathbf{0},{I},{I})$, $C\in\R^{h\times x}$, a collection of independent samples from a standard normal distribution; then  
    \item let $U_{\bm\beta_k} = (M + {ACB})_:$, where $\Sigma^H = AA^\top$ and $\Sigma^\tV = B^\top B$.
\end{enumerate*}
%
The $\kl$ divergence between $q(U_{\bm\beta_k})$ and $p(U_{\bm\beta_k})$ can also be computed efficiently (see the KL divergence subsection below).

Sampling from the matrix normal distribution is a special case of the sampling procedure for the multivariate normal distribution. Let $\mathbf{X}$  be an $n$ by $p$ matrix of $np$ independent samples from the standard normal distribution, so that

\begin{align}
\mathbf{X}\sim\mathcal{MN}_{n\times p}(\mathbf{0},\mathbf{I},\mathbf{I}).
\end{align}
Then let

$\mathbf{Y}=\mathbf{M}+\mathbf{A}\mathbf{X}\mathbf{B}$,
so that

\begin{align}
\mathbf{Y}\sim\mathcal{MN}_{n\times p}(\mathbf{M},\mathbf{AA}^T,\mathbf{B}^T\mathbf{B}),
\end{align}
where $A$ and $B$ can be chosen by Cholesky decomposition or a similar matrix square root operation.

\subsection{KL Divergence}
The KL divergence between two matrix-variate normal distributions, \eg, $q(U_{\bm\beta_k})$ and $p(U_{\bm\beta_k})$, can be analytically computed as:
\begin{align*}
    &\kl(q(U_{\bm\beta_k}) || p(U_{\bm\beta_k})) = \frac12 \bigg( M_x\log \frac{|K^H_{uu}|}{|\Sigma^H|} + M_H\log \frac{|K^\tV_{uu}|}{|\Sigma^\tV|} \\
    &\hspace{9mm}+ \tr(M^\top(K^\tV_{uu})^{-1}M(K^H_{uu})^{-1}) + \tr((K^H_{uu})^{-1}\Sigma^H) 
    \tr((K^\tV_{uu})^{-1}\Sigma^\tV) - M_H M_x \bigg).
\end{align*}

To implement $\tr[M^\top (K^\tV)^{-1}M(K^H)^{-1}]$, we use $K^\tV = L_\tV L_\tV^\top$, 
$K^H = L_H L_H^\top$, $A = L_\tV^{-1} M L_H^{-\top}$, then $\tr[M^\top (K^\tV)^{-1}M(K^H)^{-1}] = \tr(A^\top A)$.

First, recall that
\begin{align}
KL(q || p) = \int q(x) (\log q(x) - \log p(x)) dx.
\end{align}

which, in the case of two multivariate Gaussian distributions, say $q(x) = \N(m_1, S_1)$ and $p(x) = \N(m_2, S_2)$, is equal to
\begin{align}
    \int \big[ \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 (x-m_1)^\top S_1^{-1}(x-m_1) + \frac12 (x-m_2)^\top S_2^{-1}(x-m_2) \big] q(x) dx \\
    = \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 tr\big\{\E[(x-m_1)(x-m_1)^\top]S_1^{-1}\big\} + \frac12 \E[ (x-m_2)^\top S_2^{-1}(x-m_2)] \\
    = \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 tr\{\E[(x-m_1)(x-m_1)^\top]S_1^{-1}\} + \frac12 \E[(x-m_2)^\top S_2^{-1}(x-m_2)] \\
    = \frac12 \log\frac{|S_2|}{|S_1|} - \frac12 tr\{I_d\} + \frac12 (m_1-m_2)^\top S_2^{-1}(m_1-m_2) + \frac12 tr\{S_2^{-1}S_1\} \\
    = \frac12 [\log\frac{|S_2|}{|S_1|} - d + tr\{S_2^{-1}S_1\} + (m_1-m_2)^\top S_2^{-1}(m_1-m_2) ]
\end{align}

Now, we can use a Kronecker representation of $S_1$ and $S_2$ as
$S_1 = S_h \otimes S_x$ and $S_2 = K_h \otimes K_x$. Let $M = m_1 - m_2$. Also, we consider a vectorised version of $M$, and we indicate it as $M_:$.
Then, using $|V\otimes U| = |V|^n|U|^p$ and the mixed-product property of the Kronecker product, the KL divergence becomes:
\begin{align}
    &\frac12 [\log\frac{|K_h \otimes K_x|}{|S_h \otimes S_x|} - d + 
    tr\{(K_h \otimes K_x)^{-1}(S_h \otimes S_x)\}
    + M_:^\top (K_h \otimes K_x)^{-1}M_: ] \\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    tr\{(K_h^{-1} \otimes K_x^{-1})(S_h \otimes S_x)\}
    + M_:^\top (K_h^{-1} \otimes K_x^{-1})M_: ] \\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    tr\{(K_h^{-1} S_h)\otimes(K_x^{-1} S_x)\}
    + M_:^\top ((K_h^{-1} \otimes K_x^{-1})M_:)]  \quad \text{(associativity)}\\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    tr(K_h^{-1}S_h)tr(K_x^{-1} S_x)
    + M_:^\top (K_x^{-1}MK_h^{-1})_: ] \quad \text{(Kronecker matrix equations)} \\
    % 
    &= \frac12 [ n\log\frac{|K_h|}{|S_h|} + p\log\frac{|K_x|}{|S_x|} - np + 
    tr(K_h^{-1}S_h)tr(K_x^{-1} S_x)
    + tr[M^\top K_x^{-1}MK_h^{-1}] ]
\end{align}

\section{Variational Inference for Gaussian and Wishart process}
\paragraph{Inference for $\bm\mu$.}
We first augment the Gaussian process with a set of auxiliary variables with a set of corresponding time stamps, \ie, 
\begin{align}
p(\bm\mu|\tV) = \int p(\bm\mu| \uM_\mu, \tV, \zV_\mu) p(\uM_\mu| \zV_\mu) \dif \uM_\mu, \label{eqn:aux_mu}
\end{align}
where $\uM_\mu$ is the auxiliary variable for $\bm\mu$ and $\zV_\mu$ is the corresponding index. 
Both $p(\bm\mu| \uM_\mu, \tV, \zV_\mu)$ and $p(\uM_\mu| \zV_\mu)$ follow the same Gaussian processes as the one for $p(\bm\mu|\tV)$, \ie, these Gaussian processes have the same mean and kernel functions. 
As shown in \Cref{eqn:aux_mu}, the above augmentation does not change the prior distributions for $\bm\mu$.

The variational posterior of $\bm\mu$ is constructed in a special form to enable efficient inference \citep{pmlr-v5-titsias09a}: $q(\bm\mu, \uM_\mu) = p(\bm\mu| \uM_\mu) q(\uM_\mu)$. $q(\uM_\mu) = \mathcal{N}(\mM_{\mu},\sM_{\mu})$ is a multivariate normal distribution, in which the mean and covariance are variational parameters.
$p(\bm\mu| \uM_\mu)$ is a conditional Gaussian process \citep{hensman2013gaussian}.
When $\bm\mu$ is used in the down-stream distributions, a lower bound can be derived,
\begin{align}
\log p(\cdot|\bm\mu) \geq \mathbb{E}_{q(\bm\mu)}[\log p(\cdot|\bm\mu)] - \text{KL}\left(q(\uM_\mu)||p(\uM_\mu)\right), \label{eqn:bound_mu}
\end{align}
where $q(\bm\mu) = \int p(\bm\mu| \uM_\mu) q(\uM_\mu) \dif \uM_\mu$.

\paragraph{Inference for $\Sigma$.}
We derive a similar stochastic variational inference method for the Wishart Process. 
We augment each GP $p(\fV_{ij}|\tV)$ in the Wishart process with a set of auxiliary variables and a set of the corresponding inputs, 
\begin{align}
p(\fV_{ij}|\tV) = \int p(\fV_{ij}| \uV_{ij}, \tV, \zV_{ij}) p(\uV_{ij}| \zV_{ij}) \dif \uV_{ij}, 
\end{align}
where $\uV_{ij}$ is the auxiliary variable, $\zV_{ij}$ are the corresponding inputs, and $p(\fV_{ij}| \uV_{ij})$
is a conditional Gaussian process \citep{hensman2013gaussian}.
%
We define the variational posterior of $\fV_{ij}$ to be $q(\fV_{ij}, \uV_{ij}) = p(\fV_{ij}| \uV_{ij}) q(\uV_{ij})$, where $q(\uV_{ij}) = \N(\bm m_{ij}, \bm s_{ij})$. We also define the variational posterior of $\bm{\ell}$ to be $q(\bm{\ell}) = \N(\bm{m}_\ell, {S}_\ell)$, where ${S}_\ell$ is a diagonal matrix. As the diagonal elements of $\lM{}$ need to be positive, we apply a {change of variable} to the variational posterior of the diagonal elements, \ie, $\ell_m = \log(1+\exp(\hat{\ell}_m)), q(\hat{\ell}_m) = \N(\bm m_{\ell_m}, S_{\ell_m})$.
% 
Note that $\bm z_\mu$ and $\bm z_{ij}$ are variational parameters instead of random variables. For this reason, we will omit them from the notation for convenience.

We can derive a variational lower bound with such a set of variational posteriors for all the entries $\{\fV_{ij}\}$ and $\bm{\ell}$, when $\cov$ is used for some down-stream distributions,
\begin{align}\label{eqn:bound_sigma}
% &\log p(\cdot|\cov) \geq \mathbb{E}_{q(\fM)}[p(\cdot|\cov)] - \sum_{i,j} \text{KL}\left(q(\uV_{ij})||p(\uV_{ij})\right),\\
&\log p(\cdot|\cov) \geq \mathbb{E}_{q(\fM)q(\bm\ell)}[\log p(\cdot|\cov)] - \sum_{i,j} \text{KL}\left(q(\uV_{ij})||p(\uV_{ij})\right) 
- \text{KL}\left(q(\bm{\ell})||p(\bm{\ell})\right),
\end{align}
where $q(\fM) = \prod_{ij}\int p(\fV_{ij}| \uV_{ij}) q(\uV_{ij}) \dif \uV_{ij}$.

\subsection{Lower Bound for \us{}}
After deriving the variational lower bounds for the individual components of \us{}, we assemble these components together to form the final variational lower bound.
The word distributions for individual topics are used in defining the distribution of individual words for each document $d$, $p(W_d|\bm\eta_d, \bm\beta^{(\tV_d)})$. 
% To simplify the notation, we follow \citep{tomasi2020stochastic} and 
% Let $q(\Sigma) \propto q(\bm\ell)q(F)$, and $\kl(q(\Sigma) || p(\Sigma)) = \sum_{i,j} \kl\left(q(\uV_{ij})||p(\uV_{ij})\right) + \kl\left(q(\bm{\ell})||p(\bm{\ell})\right)$.
Combining the lower bounds (10), (4), (8), %\eqref{eqn:bound_doc}, \eqref{eq:bound_H}, \eqref{eqn:bound_beta}, 
\eqref{eqn:bound_mu} and \eqref{eqn:bound_sigma}, 
% and the lower bounds for GPs and GWP,
we can derive the complete variational lower bound $\mathcal{L}$ of \us{}.
\begin{align*}
\begin{split}
\log ~&p(W) \geq  \mathbb{E}_{q(\bm\mu)q(\bm\ell)q(F)q(\bm\beta)}\left[\mathcal{L}_W\right]
% &\sum_{d=1}^D \Big( \mathbb{E}_{q(\bm\eta_d)q(\bm\beta^{(\tV_d)})}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] \\
% &\hspace{2mm}-\mathbb{E}_{q(\bm\eta_d)q(\bm\mu_{\tV_d})q(\Sigma_{\tV_d})}\left[\kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right)\right] \Big) \\
- \kl\left(q(\uM_\beta)||p(\uM_\beta)\right) - \kl\left(q(H)||p(H)\right) \\
&- \kl\left(q(\uM_\mu)||p(\uM_\mu)\right)  - \kl\left(q(\bm{\ell})||p(\bm{\ell})\right) - \sum_{i,j} \kl\left(q(\uV_{ij})||p(\uV_{ij})\right)  = \mathcal{L}.
\end{split}
\end{align*}
% 
The first term of $\mathcal{L}$ can be further decomposed by plugging in (3),%\Cref{eqn:bound_doc},
\begin{align}\label{eq:empirical-elbo}
&\E_{q(\bm\mu)q(\bm\ell)q(F)q(\bm\beta)}\left[\mathcal{L}_W\right] = \nonumber\\
&\hspace{2mm}\sum_{d=1}^D \Big( \E_{q(\bm\eta_d)q(\bm\beta)}\left[\log p(W_d | \bm\eta_{d}, \bm\beta^{(\tV_d)})\right] - \mathbb{E}_{q(\bm\eta_d)q(\bm\mu_{\tV_d})q(\Sigma_{\tV_d})}\left[\kl\left(q(\bm\eta_d)||p(\bm\eta_d| \bm\mu_{\tV_d}, \cov_{\tV_d})\right)\right] \Big). \nonumber
\end{align}
% This formulation allows us to easily perform mini-batch training by data sub-sampling. For each mini-batch, we randomly sub-sampling the data set and re-weight the term $\mathbb{E}_{q(\bm\mu)q(\Sigma)q(\bm\beta)}\left[\mathcal{L}_W\right]$ according to the ratio between the size of dataset and the size of the mini-batch as shown in \Cref{eqn:minibatch_eta}.
Note that all variational parameters of $q(\bm\mu)$, $q(\bm\ell)$, $q(F)$, $q(\bm\beta)$, $q(\bm\eta)$ are optimised.

\section{Datasets}
We include a complete list of details for the datasets we used in our analysis.

We considered the following datasets: State of the Union corpus (\emph{SotU}), department of justice press releases (\emph{DoJ}), Elsevier corpus (\emph{Abstracts}) \citep{kershaw2020elsevier}, Blog Authorship Corpus (\emph{Blogs}) \citep{schler2006effects}, NeurIPS conference papers (\emph{NeurIPS}) \citep{perrone2017poisson}, A Million News Headlines (\emph{News}), Twitter sentiment classification (\emph{Twitter}) \citep{go2009twitter}.

For each dataset, we consider the total indicated number of samples (if not otherwise specified), and divide the dataset into 75\% for training and the rest for testing.


\paragraph{Blog Authorship Corpus \citep{schler2006effects}.} 
The corpus\footnote{\url{https://u.cs.biu.ac.il/~koppel/BlogCorpus.htm}} consists of the posts of 19k bloggers gathered from \texttt{blogger.com} from June 1999 to August 2004. The corpus incorporates a total of 681k posts, from which we draw a random sample of 5649 for training and 5650 for testing. After our preprocessing, we considered 3000 words in our vocabulary. License: free use for non-commercial research purposes.

\paragraph{State of the Union corpus (1790-2018).}
The dataset\footnote{\url{https://kaggle.com/rtatman/state-of-the-union-corpus-1989-2017}} includes a yearly address of the US president, from 1790 to 2018 (229 years).
Our vocabulary includes 4583 words after preprocessing. We split the data into 170 documents as training and 57 documents as test data. License: CC BY-SA 4.0

\paragraph{NeurIPS conference papers (1987-2015) \citep{perrone2017poisson}.}
The dataset\footnote{\url{https://archive.ics.uci.edu/ml/datasets/NIPS+Conference+Papers+1987-2015}} includes 5804 conference papers from 1987 to 2015 including an average of 34 papers per year. 
We preprocessed the dataset leading to 4799 words. We used 4237 documents as training data and 1567 as test data.

\paragraph{Department of justice press releases (2009-2018).}
The dataset\footnote{\protect\url{https://kaggle.com/jbencina/department-of-justice-20092018-press-releases}} includes 13087 press releases from the Department of Justice 
from 2009 to 2018 (115 unique timestamps), preprocessed to include 9591 unique words. Documents were split into 9674 for training and 3413 for testing. License: CC0: Public Domain

\paragraph{Elsevier OA CC-BY Corpus \citep{kershaw2020elsevier}.} 
The dataset\footnote{\protect\url{https://data.mendeley.com/datasets/zm33cdndxs/2}} includes 40k open access (OA) CC-BY abstracts taken from articles from across Elsevier’s journals, published from 2010 to 2019. After our preprocessing, we considered 13126 words in the vocabulary. License: CC BY 4.0
% We also consider a smaller subset for comparison purposes, comprising 2929 documents in the training set and 2929 in the test set, including only 1000 words in the vocabulary.

\paragraph{A Million News Headlines.} 
The dataset\footnote{\url{https://kaggle.com/therohk/million-headlines}} includes 1.2M news headlines published over a period of 17 years (from 2003 to 2019). We took a random sample of 1M. After our preprocessing, we considered a vocabulary of size 22459. License: CC0: Public Domain
% We also consider a smaller dataset with vocabulary size of 1000, comprising 3732 training and 1207 test samples.

\paragraph{Twitter sentiment classification \citep{go2009twitter}.}
The dataset\footnote{\url{https://www.kaggle.com/kazanova/sentiment140}} contains 1.6M tweets, from April to May 2009. We randomly sampled 1M tweets.
We preprocessed samples using a tweet tokenizer, removing usernames and replacing repeated character sequences (length 3 or more) with sequences of length 3 \citep{bird2009natural}.
After our preprocessing we considered 83582 tokens. 

\subsection{Experiment settings.}
We split each dataset considering 75\% of the samples as training and 25\% as test. Documents associated with the same time stamps were assigned to the same split.

For each dynamic topic model we used a Mat\'ern 3/2 kernel for $\bm\beta$, to allow topics to quickly incorporate new words. This is important especially to incorporate neologisms, and particularly for datasets such as NeurIPS conference papers and Elsevier corpus, where the names of novel models become quoted in citations (for example, ``LDA'' starting to appear in publications together with ``topic modeling'' after its introduction in 2003). 
For the other parameters $\bm\mu$ and $\bm f$ we use a squared exponential kernel, as we expect a smooth temporal evolution of both topic probabilities and their correlation. 
We initialise amplitude and length scale of kernels as $1$ and $0.5$ respectively, and we optimise for them %along with other prior parameters for the distribution of $\beta$, $\mu$ and $f$, 
using the approximate empirical Bayes approach~\citep{maritz2018empirical}. 

%\paragraph{Hyperparameters.}
%We implemented our models using the TensorFlow framework \citep{dillon2017tensorflow}. 
Experiments were conducted using the Adam optimiser with learning rate $0.001$ and up to $10$k epochs until convergence. With our configuration, DCTM took around 6s/epoch to analyse 7000 training samples in 3000 dimensions using a single NVIDIA Tesla V100 GPU, completing 5000 epochs in 8 hours (on average). Using MIST we achieved a runtime of $\sim$2.5s/epoch, completing 5000 epochs in 3.5 hours.
% 
We experimented with different numbers of topics, and report the results using a default choice of 30 for all datasets (20 for SotU) to maintain consistency with previous work.
We also experimented with a different number of inducing points for the three components $\bm\beta$, $\bm\mu$ and $\bm f$, thus controlling the complexity of the variational posterior used by both DCTM and our models (static models such as LDA and CTM do not have such dynamic components). The number of inducing points used for such components is 15, 20 and 15, respectively. \us{} has an additional component for the latent embedding of words in $\bm\beta$; we used $M_H = 200$ in $Q = 10$ dimensions.
We initialised the posterior for $H$ by transforming the words in our vocabulary using ELMo embeddings \citep{peters2018deep} pre-trained on the 1 Billion Word Benchmark, and taking the first $Q$ principal components using a PCA transformation.

For the posterior of $\bm\eta$, when using a static encoder (\eg, for DCTM) we considered a dense neural network with three layers of sizes $500$, $300$ and $200$, respectively. To account for the increased input dimensionality in our meta-encoder we instead used a dense neural network with three layers of sizes $1000$, $600$ and $400$, respectively.\footnote{Since the encoder is a collection of variational parameters, we emphasise that increasing its size does not cause the model to overfit.}

% \section{Approximate Perplexity}
% The perplexity metric is computed as:
% \begin{align}
%   \text { perplexity }=\exp \left\{-\frac{1}{\left|D\right|} \sum_{d \in D} \frac{1}{N_d}\E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[\log p(W_d | \bm\xi_{d})\right]\right\}
% \end{align}
% However, in case of a large vocabulary the log probability cannot be computed exactly, so we need to resort to an approximation by sampling $M$ random negative words, which do not appear in the document:
% \begin{align}\label{eq:approx-perplexity}
%     \E_{q(\bm\beta | H)q(\bm\eta_d)}\sum_{n=1}^{N_d}\left[\log p(W_d | \bm\xi_{d})\right]
% \approx \E_{q(\bm\beta | H)q(\bm\eta_d)}\left[\sum_{n=1}^{N_d} \bm\xi_{d,n} 
%     - \log\sum_{n=1}^{M+N_d} \exp\left(\bm\xi_{d,n} - \log(r_{nd})\right) \right],
% %     \\
% % = \E_{q(\bm\beta | H)q(\bm\eta_d)}\left[ \sum_{n=1}^{N_d} \nabla \bm\xi_{d,n} 
% %     - N_d \sum_{i=1}^{M+N_d}\frac{\exp(\xi^{\prime}_i) }{\sum_{j=1}^{M+N_d}\exp(\xi^{\prime}_j)}\nabla \xi_{d,i} \right]
% \end{align}
% where $r_{nd} = M / (P - N_d)$ if $n$ is one of the negative words (and $r_{nd} = 1$ otherwise) is the uniform probability of picking word $n$.
% % . The ELBO for a test document $d^*$ is computed using \Cref{eq:empirical-elbo}. 


\section{Additional Results}
\paragraph{Varying number of topics.}
\begin{figure}
    \centering
    \includegraphics[width=0.5\linewidth]{figures/neurips_change_k.png}
    \caption{Perplexity when varying the number of topics for the NeurIPS dataset.}
    \label{fig:varying-topics}
\end{figure}

Our analysis does not show substantial differences when varying the number of topics. We experimented with the number of topics varying between 10 and 100 and show some examples in \Cref{fig:varying-topics}.

% % \paragraph{Meta-encoder effect.}
% % \begin{table}[t]
% %     \caption{Effect of using meta-encoder on the perplexity.}
% %     \label{tab:encoder-effect}
% %     \centering
% % \resizebox{0.99\linewidth}{!}{%
% % \begin{tabular}{cccccccc}\toprule
% %     Meta-Enc.  & SotU & DoJ & Abstracts & Blogs & News  & Twitter & NeurIPS \\\midrule
% %     Yes
% %     & \textbf{846.81} & 498.96 & \textbf{1120.13} & \textbf{949.76} & 1414.90 & \textbf{866.81} & \textbf{888.59} \\
% %     No & 852.86 & \textbf{493.66} & 1158.02 & 951.95 & \textbf{1414.37} & 871.13 & 967.97\\
% %     \bottomrule
% %     \end{tabular}}
% % \end{table}
% % % \begin{table}[t]
% % %     \caption{Effect of using meta-encoder on the perplexity.}
% % %     \label{tab:encoder-effect}
% % %     \centering
% % % \resizebox{0.5\linewidth}{!}{%
% % % \begin{tabular}{ccc}\toprule
% % %     Meta-Encoder  & Yes & No \\\midrule
% % %     SotU  & \textbf{846.81} & 852.86 \\
% % %     DoJ & 498.96 & \textbf{493.66}  \\
% % %     Abstracts & \textbf{1120.13}& 1158.02  \\
% % %     Blogs & \textbf{949.76} & 951.95  \\
% % %     News  & 1414.90 & \textbf{1414.37}  \\
% % %     Twitter & \textbf{866.81} & 871.13  \\
% % %     NeurIPS & \textbf{888.59} & 967.97 \\
% % %     \bottomrule
% % %     \end{tabular}
% % % }
% % % \end{table}

% % To highlight the benefit of using the meta-encoder, we also trained a set of \us{} with the encoders as defined in~\citep{tomasi2020stochastic}, in which the encoder only takes the document representation as inputs.
% % The comparison is shown in \Cref{tab:encoder-effect}.
% % %
% % \us{}, with the meta-encoder, performs better on five out of seven datasets than the one without the meta-encoder.
% % DoJ is the only dataset on which without the meta-encoder \us{} performs noticeably better.
% % It is special because the topics in DoJ change very little across time. 
% % % This result shows that our meta-encoder would not clearly benefit from encoding such dynamics when the temporal dynamics of $\bm\beta$ does not change much.
% % Indeed, this result suggests how the benefit of using our meta-encoder is limited when the temporal dynamics of $\bm\beta$ does not change much.
% % % \mounia{We argue that our meta-encoder would not clearly benefit from encoding such dynamics when the temporal dynamics of $\bm\beta$ does not change much.}

% % % The benefits are especially clear in documents with larger time span and large amount of documents. \todo{other}
% % % The results on NeurIPS allows for interesting considerations.
% % % The best performing model on the small dataset (1047 words) is \us-E, a variant of our model with no meta-encoder. This suggests that, in this case, incorporating dynamics in the posterior of $\bm\eta$ parameter may be redundant. Indeed, a closer inspection reveals that static models such as LDA and CTM perform already quite well, without considering temporal dynamics at all. In such cases, the number of documents over time is enough for a reliable inference.
% % % However, this does not hold when the dimension of the dataset increases. With 4799 words in the vocabulary (almost 5 times than the small dataset), the perplexity of the baselines more than doubles for LDA and CTM, and the performance gap between static and dynamic models increases. Here, the best performing model is \us{}, that includes our meta-encoder. Indeed, the benefit of our model is most visible when the number of words is higher, as their co-occurrence in documents is not enough to capture low frequency words.

% % % The performance difference between \us, \us-F and \us-E largely depends on the dataset at hand. Notably, \us-E outperforms the other versions in only two cases, NeurIPS (1047) and News (1000), the two small versions of the correspondent datasets. This suggests how, in presence of a restricted vocabulary, encoding a temporal information for every document is not necessarily beneficial. Vice versa, results on larger datasets emphasise how, incorporating additional contexts for the documents allows for a better inference of the words in the vocabulary.
% % % 
% % % \todo{The performance difference between FITC and not can go in the suppl if we have time}
% % % We note how in general the performance of \us{} is comparable to \us-F. As \us-F considers the FITC approximation the model is much faster than \us, but achieves comparable and sometimes even better results than the former. 

% \paragraph{Coherence.}
% To evaluate topics extracted from our model, we also compute a measure of coherence \citep{roder2015exploring}.
% The coherence is a measure which quantifies how much a set of texts is coherence based on the topics extracted by the topic model. We compute the normalised pointwise mutual information (NPMI) score (the higher the better).
% A limitation of such metric is that it requires a fixed set of topics to compute against a set of documents. However our topic model is dynamic, meaning each document could potentially have a different set of topics (in terms of the appearing words) based on the timestamp of the document. We approximate this measure to retrieve a static set of topics based on the most frequent words of a topic across all time points (\cref{tab:coherence}).
% The results show that, while the metric is more appropriate to static models which infer independent topics (such as LDA), our model is able to outperform the state of the art on 4 out of 8 datasets.

% \begin{table}[]
%     \centering
%     \caption{Coherence metric across datasets and methods. The higher the better.}
%     \label{tab:coherence}
%     \begin{tabular}{lcrrrr}
% \toprule
% {} &       \#words & LDA &       CTM &      DCTM &      MIST \\
% \midrule
% SotU          & 1442  & -0.592569 & -0.578372 & \textbf{-0.550996} & -0.561410 \\
% DoJ           & 2622  & \textbf{-0.359855} & -0.413999 & -0.363901 & -0.366874 \\
% Abstracts     & 3000  & -0.468714 & -0.446985 & -0.432409 & \textbf{-0.402740} \\
% Blogs         &  3000 & \textbf{-0.222944} & -0.365012 & -0.387720 & -0.238367 \\
% News    &  3000 & -0.343660 & -0.562780 & -0.592340 & \textbf{-0.297206} \\
% Twitter & 3000  & -0.173941 & -0.547871 & -0.721763 & \textbf{-0.119556} \\
% NeurIPS       &  4799 & -0.633054 & -0.400259 & -0.501386 & \textbf{-0.365073} \\
% Twitter (ext.)   & 83582  &  \textbf{0.002190 }& -0.226310 &       - & -0.122431 \\
% \bottomrule
% \end{tabular}
% \end{table}

% \begin{table}[t]
%     \caption{Top 10 words associated with four popular topics.}\label{tab:word-topic-distribution}
%     \centering
%     \resizebox{0.6\columnwidth}{!}{%
%     \begin{tabular}{cp{2em}l}
%         \toprule
%         Topic & Color & Top Words \\
%         \midrule
% 18 & \cellcolor[HTML]{1f77b4} &      state time dynam sequenc model transit use process observ trajectori \\
% 6  & \cellcolor[HTML]{ff7f0e} &  estim distribut inform sampl probabl statist densiti mean entropi measur\\
% 8  & \cellcolor[HTML]{2ca02c} &          network unit layer input train output hidden learn neural weight\\
% 2  & \cellcolor[HTML]{d62728} &           imag object featur use pixel visual model segment recognit face\\
% \bottomrule
% \end{tabular}}
% \end{table}
% \begin{figure}[t]
%     \centering
%     \includegraphics[width=\linewidth]{neurips_p_eta.png}
%     \caption{\protect Mean of the mixture of topics $\sigma(\bm\eta)$ for a few selected topics throughout the time span predicted using \us{}.\footnotemark{}}\label{fig:neurips-topics}
% \end{figure}
% \footnotetext{Computed as $\langle\sigma(\bm\eta_{t_*}) \rangle_{p(\bm\eta_{t_*}|D)}$, where $p(\bm\eta_{t_*}|D) = \int p(\bm\eta_{t_*}|\bm{\mu_{t_*}},\cov_{t_*} ) p(\bm{\mu_{t_*}}|D) p(\cov_{t_*}|D) \dif \bm{\mu_{t_*}} \dif  \cov_{t_*}$.}

% \Cref{fig:neurips-topics} shows the mean of topics over time as computed by \us{} on NeurIPS dataset.
% % 
% The distribution shows a decreasing trend for topic 8, associated with \emph{neural networks} (consistent with prior knowledge and previous results on this dataset).
% Similar considerations can be made for topic 1, associated with \emph{neuroscience}.
% Topic 19 (associated with \emph{topic modeling}) has a spike between the years 2004 and 2007. We can attribute this to the interest for topic modeling after the introduction of LDA in 2003, and the following publications of CTM and DTM in 2006.


\bibliography{main_bib}
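% \bibliographystyle{abbrvnat} % disabled: the preamble already sets \bibliographystyle{plainnat}; a second declaration makes BibTeX report "Illegal, another \bibstyle command"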

\end{document}
