\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage[capitalize]{cleveref}
\usepackage{float}
\usepackage{xcolor}
\usepackage{bbm}
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{amsmath}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Example macro: typesets #3, then the optional separator #1 (default "-"),
% then #2.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% Blackboard-bold symbols.
\newcommand{\R}{\mathbb{R}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\N}{\mathbb{N}}
% Calligraphic symbols. \hist is the one used in the body for event
% histories, e.g. \hist(t) and \hist_\mathbb{O}(t).
\newcommand{\B}{\mathcal{B}}
\newcommand{\hist}{\mathcal{H}}
\newcommand{\F}{\mathcal{F}}
% Expectation operator, used as \E[...].
\newcommand{\E}{\mathbb{E}}
% Probability symbols: currently the plain letters "p" and "q". The
% blackboard-bold alternatives are kept commented out at the end of each line.
\newcommand{\Prob}{p}%\mathbb{P}}
\newcommand{\q}{q}%\mathbb{Q}}
% \prob is an alias for \Prob, so both change together.
\newcommand{\prob}{\Prob}
% Upright "Var" and "Cov" operator names for math mode.
\newcommand{\var}{\text{Var}}
\newcommand{\cov}{\text{Cov}}
% Indicator symbol: a blackboard-bold 1 via \mathbbm (bbm package).
\newcommand{\ind}{\mathbbm{1}}
% Conditioning bar with adjusted spacing, as in \lambda_k(t \sep \hist(t)).
\newcommand{\sep}{\!\;|\;\!}
% Upright "hit" label for math mode.
\newcommand{\hit}{\text{hit}}

\usepackage{multirow}
% \mc{<text>}: a single centered cell. Used in the table header to center
% headings over columns whose body entries are left- or right-aligned.
\newcommand{\mc}[1]{\multicolumn{1}{c}{#1}}

% Offset the equation and figure counters so numbering here does not restart
% at 1 (presumably continuing from the last equation and figure of the main
% paper -- TODO confirm the offsets match the final main text).
\setcounter{equation}{5}
\setcounter{figure}{5}
% The table counter is reset to 0, so table numbering starts afresh here.
\setcounter{table}{0}


\allowdisplaybreaks
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

\title{Inference for Mark-Censored Temporal Point Processes:\\Supplementary Material}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<alexjb@uci.edu>}{Alex~Boyd}{}}
\author[2]{Yuxin~Chang}
\author[1,2]{Stephan~Mandt}
\author[1,2]{Padhraic~Smyth}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    University of California, Irvine
}
\affil[2]{%
    Department of Computer Science\\
    University of California, Irvine
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix

\section{Bias and Variance Analysis of Censored Intensity Estimator}
In practice, the numerator and denominator of 
Eq. (4) %\cref{eq:censored_intensity_result} 
are estimated with Monte-Carlo samples, resulting in the following approximation:
\begin{align*}
\underline{\lambda}^*_k(t) & \approx \frac{\frac{1}{M}\sum_{i=1}^M \lambda_k(t \sep \hist^{(i)}(t))\exp\left(-\int_0^t\sum_{k'\in\mathbb{O}}\lambda_{k'}(s\sep \hist^{(i)}(s))ds\right)}{\frac{1}{M'}\sum_{j=1}^{M'} \exp\left(-\int_0^t\sum_{k'\in\mathbb{O}}\lambda_{k'}(s\sep \hist^{(j)}(s))ds\right)}
\end{align*}
where  $\hist^{(i)}(t), \hist^{(j)}(t) \overset{\text{iid}}{\sim} q$ for $i=1,\dots,M$ and $j=1,\dots,M'$. For simplicity, we typically set $M=M'$. This estimator is what is typically referred to as a ratio estimator; while it is consistent, it is unfortunately biased for finite samples.

To see in what way this is biased, we will recast this form into a more general format. Consider random variables $X, \{X_i\}_{i=1}^M, \{X_j'\}^{M'}_{j=1} \overset{\text{iid}}{\sim} p_X$ with support $\mathcal{X}$, and functions $f: \mathcal{X} \rightarrow \mathbb{R}^{+,0}$ and $g: \mathcal{X} \rightarrow [0,1]$. We assume the mean and variance of both $f(X)g(X)$ ($\mu_{fg}$ and $\sigma^2_{fg}$ respectively) and $g(X)$ ($\mu_g$ and $\sigma^2_g$) exist and $\mu_g \in (0,1)$. This implies that the quantity of interest $\frac{\mu_{fg}}{\mu_g}:=\frac{\E[f(X)g(X)]}{\E[g(X)]}$ is well defined. We can now investigate the bias of a finite sample ratio estimator through a second-order Taylor series expansion of the ratio around $(\mu_{fg}, \mu_g)$:
\begin{align*}
\E\left[\frac{\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)}{\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')}\right] & \approx \frac{\E\left[\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)\right]}{\E\left[\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right]} - \frac{\cov\left(\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i), \frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right)}{\E\left[\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right]^2} \\
& \quad + \frac{\var\left(\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right)\E\left[\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)\right]}{\E\left[\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right]^3} \\
& = \frac{\mu_{fg}}{\mu_g} - \frac{\sum_{i=1}^M\sum_{j=1}^{M'}\cov\left(f(X_i)g(X_i), g(X_j')\right)}{MM'\mu_g^2} + \frac{\var\left(g(X)\right)\mu_{fg}}{M'\mu_g^3} \\
& = \frac{\mu_{fg}}{\mu_g} + \frac{\sigma^2_g\mu_{fg}}{M'\mu_g^3} \text{ since } X_i \perp X_j' 
\end{align*}
Likewise, the variance of the ratio estimator can also be approximated with a Taylor series expansion of the ratio around $(\mu_{fg}, \mu_g)$:
\begin{align*}
\var\left(\frac{\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)}{\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')}\right) & \approx \frac{\var\left(\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)\right)}{\E\left[\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right]^2} \\
& \quad - \frac{2\cov\left(\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i), \frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right)\E\left[\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)\right]}{\E\left[\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right]^3} \\
& \quad + \frac{\var\left(\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right)\E\left[\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)\right]^2}{\E\left[\frac{1}{M'}\sum_{j=1}^{M'} g(X_j')\right]^4} \\
& = \frac{\sigma^2_{fg}}{M\mu_g^2} - \frac{2\mu_{fg}\sum_{i=1}^M\sum_{j=1}^{M'} \cov\left(f(X_i)g(X_i), g(X_j')\right)}{MM'\mu_g^3} + \frac{\sigma_g^2\mu_{fg}^2}{M'\mu_g^4} \\
& = \frac{\sigma^2_{fg}}{M\mu_g^2} + \frac{\sigma_g^2\mu_{fg}^2}{M'\mu_g^4} \text{ since } X_i \perp X_j'.
\end{align*}

It can be tempting to consider reusing samples for both the numerator and the denominator (i.e., $M=M'$ and $X_i=X_i'$ for $i=1,\dots,M$) as this would reduce the amount of computation needed for the ratio estimate. This would result in the following expected value and variance of the estimator:
\begin{align*}
\E\left[\frac{\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)}{\frac{1}{M}\sum_{j=1}^M g(X_j)}\right] & \approx \frac{\mu_{fg}}{\mu_g} - \frac{\sum_{i=1}^M\sum_{j=1}^{M}\cov\left(f(X_i)g(X_i), g(X_j)\right)}{M^2\mu_g^2} + \frac{\sigma^2_g\mu_{fg}}{M\mu_g^3} \\
& = \frac{\mu_{fg}}{\mu_g} - \frac{\cov\left(f(X)g(X), g(X)\right)}{M\mu_g^2} + \frac{\sigma^2_g\mu_{fg}}{M\mu_g^3} \\
\var\left(\frac{\frac{1}{M}\sum_{i=1}^M f(X_i)g(X_i)}{\frac{1}{M}\sum_{j=1}^{M} g(X_j)}\right) & \approx  \frac{\sigma^2_{fg}}{M\mu_g^2} - \frac{2\mu_{fg}\sum_{i=1}^M\sum_{j=1}^{M} \cov\left(f(X_i)g(X_i), g(X_j)\right)}{M^2\mu_g^3} + \frac{\sigma_g^2\mu_{fg}^2}{M\mu_g^4}  \\
& = \frac{\sigma^2_{fg}}{M\mu_g^2} - \frac{2\mu_{fg} \cov\left(f(X)g(X), g(X)\right)}{M\mu_g^3} + \frac{\sigma_g^2\mu_{fg}^2}{M\mu_g^4}.
\end{align*}

The expected value of either estimator can be used to help us correct for the bias by simply moving all terms on the right that are not $\frac{\mu_{fg}}{\mu_g}$ to the left. Interestingly, we can see that there is potential for reusing samples to not only save on computation, but to also reduce the variance of the estimator. Should $\cov(f(X)g(X),g(X)) > 0$, which is often the case in practice, then the variance will be reduced.

\section{Further Experimental Details and Results}

\subsection{Datasets}

The following are more in-depth descriptions of the different real-world datasets used in the experiments. All sequences used for both training and inference are preprocessed to only allow sequences with at least 5 events and at most 200. Summary statistics can be found in \cref{tab:data_statistics}.

\paragraph{Taobao}
The Taobao user behavior dataset \citep{zhu2018learning} was originally intended for recommendations during online shopping sessions, which includes four different behaviors: page viewing, purchasing, adding items to the cart, and adding items to a wishlist. We focus on modeling the page viewing of users as events, and let the item category be the associated event mark. Modeling this information has various marketing implications such as the click-through rate of recommending some types of items. Due to the large scale of the dataset, we use a subset of 2,000,000 events on 8 consecutive calendar days inclusive (November 25th, 2017 -- December 2nd, 2017), as well as the most frequent 1,000 marks (item categories). All user sequences have the same time length of $T=192$ hours.

\paragraph{Reddit} The Reddit comments dataset \citep{baumgartner2020pushshift} contains records of comments made by different users on various posts listed in the social media site \url{reddit.com}. One month's worth of data (October 2018) was used to extract user sequences, and the mark vocabulary was defined as the top 1000 communities (subreddits) determined by marginal comment volume. The month was divided into multiple week-long sequences for each user, with event times in units of hours ($T=168$ hours).

\paragraph{MemeTracker} The MemeTracker dataset \citep{leskovec2009meme} tracks common phrases (memes) as they appear on various websites. We compile these records into sequences, each pertaining to a single meme with events defined as the time of mention and the website they appeared on as the mark. Only the mentions in the top 5000 websites by marginal volume were considered. Sequences were defined as one-week-long chunks spanning August 2008 to April 2009, and event times were measured in hours ($T=168$ hours).

\paragraph{Email} 
Lastly, the Email dataset \citep{paranjape2017motifs} contains the email records for a research organization over the course of 803 days. Sequences were defined as the collection of incoming emails for a given user where each mark was the address of the original sender. These sequences were defined over four-week intervals and event times were measured in days ($T=28$ days). After preprocessing the sequences, we were left with 808 unique addresses.

% Insert table of statistics
\begin{table}
    \caption{Summary Statistics for the Four Real-World Datasets}
    \label{tab:data_statistics}
    \centering
    \begin{tabular}{llrrrrr}
    \toprule
    & & & Mean & \multicolumn{3}{c}{\# Sequences}  \\
    Dataset & \mc{$T$} & \mc{$M$} & \mc{$|\mathcal{H}|$} &  Train & Valid & Test \\
    \midrule
    Taobao & 8 Days & 1000  & 62.6 & 13.3K & 1.8K  & 2.7K  \\
    Reddit & 1 Week  & 1000 & 65.2 & 343K & 15K & 34K \\
    MemeTracker & 1 Week  & 5000 & 23.4 & 271K & 9K  & 21K  \\
    Email & 28 Days   & 808   & 31.1 & 6.9K & 1.5K & 1.5K \\
    \bottomrule
    \end{tabular}
\end{table}

\subsection{Model \& Training Details}

For each of the real-world datasets, a neural Hawkes process model \citep{mei2017neural} was trained on fully observed sequences for a given dataset. 
Each model was trained using the Adam stochastic gradient optimization algorithm \citep{DBLP:journals/corr/KingmaB14} with default hyperparameters, a learning rate of 0.001, and a linear warm-up learning rate schedule over the first 1\% of training iterations. Each iteration optimized the parameters against the average log-likelihood for a batch of 128 training sequences. Gradients were clipped to have a maximum norm of $10^4$ for stability. All models were trained for a fixed number of epochs; however, each one was confirmed to have converged based on average held-out validation log-likelihood.

Models possessed different hyperparameters depending on the dataset due to differences in the amount of data and total possible marks. Details can be found in \cref{tab:model_details}.

\begin{table}
    \centering
    \caption{Model Hyperparameters for Real-World Datasets}
    \begin{tabular}{lcccc}
    \toprule
    Hyperparameter & Taobao & Reddit & MemeTracker & Email \\
    \midrule
    \# Training Epochs & 300 & 50 & 50 & 300 \\
    Mark Embedding Size & 64 & 64 & 64 & 32 \\
    Recurrent Hidden State Size & 128 & 128 & 128 & 64 \\
    \bottomrule
    \end{tabular}
    \label{tab:model_details}
\end{table}


\subsection{Next Event Prediction}

Alongside likelihood, we are also interested in making predictions for next events in the presence of censored data. The following section details the prediction experiments conducted for both synthetic and real-world settings.

\paragraph{Setup}
We follow the same settings for the next event prediction as \citet{du2016recurrent, mei2017neural} on both event time and event mark. The predicted time is chosen to be the expected time of the next event occurrence, which is defined as 
\[\hat{\tau}_i=\mathbb{E}\left[\tau_i \mid \mathcal{H}[0,\tau_{i-1}]\right]=\int_{\tau_{i-1}}^{\infty} t \lambda^*(t) \exp\left(-\int_{\tau_{i-1}}^t \lambda^*(s) ds\right) d t.\]
We measure predictive performance for this with the mean absolute error between predicted and true next event time. Without the knowledge of the event time $\tau_i$, the predicted distribution of the next event type is defined to be
\[\prob(\hat{\kappa}_i=k)=\int_{\tau_{i-1}}^{\infty}\lambda_k^*(t) \exp\left(-\int_{\tau_{i-1}}^t \lambda^*(s) ds\right) d t,\]
and is evaluated via top-10 accuracy (i.e., the proportion of predictions in which true mark $\kappa_i$ appears in the set of top-10 highest probability predicted marks). Both predictions can be achieved by approximating integrals numerically, for both the censored and baseline methods.

Similar to the likelihood ratio experiments, we evaluated these methods on sequences that have been artificially censored. For the synthetic experiments, we evaluate 1000 sequences $\hist(T)$ sampled from their respective models and then randomly choose a subset of unique marks that appear in each sequence to be censored $\mathbb{C}$, the proportion of which is determined for each value $\gamma \in \{0.2, 0.4, 0.6, 0.8\}$. For real-world experiments, the same is done except the sequences originate from held-out sets and $\gamma$ is also allowed to be $0.5$.

We condition each method on the occluded sequence $\hist_\mathbb{O}[0,\tau_{\lfloor\frac{n}{2}\rfloor}]$ where $|\hist(T)|=n$ and have each produce predictions for the next time $\hat{\tau}_{\lfloor\frac{n}{2}\rfloor+1}$ and the next mark $\hat{\kappa}_{\lfloor\frac{n}{2}\rfloor+1}$.

\paragraph{Synthetic Results}
Figure~\ref{fig:synth_next_event} reports the results evaluated on three parametric point process models with 20 distinct marks. When predicting next time to event, both versions of Hawkes processes achieve less error under our framework compared to the baseline. The performance gap between methods widens as more information is censored. However, the baseline outperforms our method for self-correcting models, which may be due to the fact that the occurrence of an event has an inhibiting effect on future events. This results in the baseline always overestimating the intensity as it lacks the censored events to correct it. For this model, this leads to always underestimating the next time to event which is favorable as this will be bounded between $\tau_{\lfloor\frac{n}{2}\rfloor}$ and $\tau_{\lfloor\frac{n}{2}\rfloor+1}$. This can be seen as a systematic bias inherent to the specific model parameterization.

As for the prediction of the next event type, both self-correcting processes and Hawkes processes with dense interaction between events have similar performances as random guesses that will have an accuracy of around 0.5 for top-10 accuracy. This is expected for both models, as there is not much imposed correlation between events of different types due to how the models were instantiated. 
However, the Hawkes processes with block-diagonal interactions better model the structure in sequential events, where the prediction accuracy is much higher than 0.5, which in general decreases as more marks are censored. It is clear that our method is less sensitive to the amount of censored information and significantly outperforms random guesses, as long as the model is able to capture the underlying structured dynamics of the event sequences. 


\paragraph{Real-World Results}
Real-world datasets naturally have more meaningful structures and larger vocabulary sets compared to synthetic experiments. We evaluate the results on all four datasets that have different numbers of marks ranging from 808 to 5000 (see \cref{fig:next_event}). The prediction of the next event time of our method is on par with the baseline, while we see consistent improvements in the next event mark prediction as evaluated by top-10 accuracy. Furthermore, the accuracy in general, regardless of method, tends to decrease with more information being censored, which is expected.
% The top-10 accuracy reduces to around 0 when approximately 100 marks are censored on all datasets because the number of unique marks can be much less than the total number of possible marks on a sequence level. We leave this investigation to future work for personalized predictions in the presence of censoring.




\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/synth_next_event_plots.pdf}
    \caption{Next event prediction results for censored and baseline methods across the three different parametric MTPPs. Top plots indicate the mean absolute error in next time prediction, middle plots indicate top-10 accuracy in next mark prediction, and bottom plots show density of the number of marks censored across the sequences used for the experiments.}
    \label{fig:synth_next_event}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/next_event_plots.pdf}
    \caption{Same format as \cref{fig:synth_next_event} except using held-out sequences from real-world datasets with respectively trained neural-based models.}
    \label{fig:next_event}
\end{figure}

\subsection{Model Misspecification} \label{sec:appendix_misspecification}
Recall in the synthetic experiments that we evaluated the log-likelihood for the mark-censored model $\prob_\text{Cen}$ and the baseline method $\prob_\text{Base}$ on censored sequences that were originally sampled from the same model $p$ used in both methods. Under this setting, for a given mark-censoring scheme $\mathbb{C}$ and $\mathbb{O}$ and sampled sequences $\hist_\mathbb{O}(t) \sim p$, it is guaranteed that
\begin{align*}
\E_{\prob(\hist_\mathbb{O}(t))}\left[\prob_\text{Cen}(\hist_\mathbb{O})\right] \geq \E_{\prob(\hist_\mathbb{O}(t))}\left[\prob_\text{Base}(\hist_\mathbb{O})\right] 
\end{align*}
with the inequality being strict so long as $\prob(\hist_\mathbb{C}(t)=\emptyset \sep \hist_\mathbb{O}(t)) > 0$. This is due to the fact that the mark-censored model is simply a marginalized version of the original model, thus resulting in no model misspecification for this setup.

That being said, we no longer have this guarantee once we start considering sequences that are drawn from a different distribution from the model we are using. This is inherently the same scenario that was evaluated in the real-world data experiments, as all of the sequences used there came from some other source $\prob_\text{data}$ whereas the models $p$ were learned to best approximate this distribution. Naturally, the closer $p$ is to $\prob_\text{data}$ (i.e., the less model misspecification there is) the more we can start to trust that the censored method will produce superior results to the baseline.


\subsection{Sensitivity Analysis}
We perform an ablation study for synthetic experiments using different numbers of samples and integration points. The parameters of the Hawkes process are drawn from the same distributions as described in Section 4.1, where we used 128 MC samples and 1024 integration points. Figure~\ref{fig:diff_seqs} shows the results of the same experiment but varies the number of Monte Carlo sampled sequences and keeps the number of integration points as 1024, while Figure~\ref{fig:diff_int} shows the same results but varies the number of integration points while keeping the number of Monte Carlo samples fixed to 128. Aside from slight deviations on the lower end of the values tested (e.g., number of sampled sequences $=2$ and number of integration points $=8$), the results across the board are roughly consistent. This indicates that our method is fairly robust and does not necessitate prohibitive amounts of computing resources to employ.

That being said, we do recommend evaluating this on a case-by-case basis as each model and dataset are different. In critical applications, this concern can be taken care of by iteratively sampling sequences and monitoring the convergence of the resulting censored intensity function. 

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/log_likelihood_plots_diff_seqs.pdf}
    \caption{Distributions of likelihood ratios across number of marks censored for the duration of the sequences used for synthetic experiments. The number of integration points is fixed at 1024, with varying numbers of MC samples used for estimation. Values greater than 1 indicate higher likelihoods under the mark-censored model.}
    \label{fig:diff_seqs}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/log_likelihood_plots_diff_int.pdf}
    \caption{Same format as Fig.~\ref{fig:diff_seqs} except using 128 MC samples and different numbers of integration points for estimation.}
    \label{fig:diff_int}
\end{figure}




% \clearpage
\bibliography{references}

\end{document}
