\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage[capitalize]{cleveref}
\usepackage{float}
\usepackage{xcolor}
\usepackage{bbm}
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{amsmath}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\newcommand{\R}{\mathbb{R}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\B}{\mathcal{B}}
\newcommand{\hist}{\mathcal{H}}
\newcommand{\F}{\mathcal{F}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\Prob}{p}%\mathbb{P}}
\newcommand{\q}{q}%\mathbb{Q}}
\newcommand{\prob}{\Prob}
\newcommand{\var}{\text{Var}}
\newcommand{\cov}{\text{Cov}}
\newcommand{\ind}{\mathbbm{1}}
\newcommand{\sep}{\!\;|\;\!}
\newcommand{\hit}{\text{hit}}
\allowdisplaybreaks




% \title{Temporal Point Processes with Censored Marks}
\title{Inference for Mark-Censored Temporal Point Processes}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \aistatsauthor{Alex Boyd${}^1$ \And Yuxin Chang${}^2$ \And  Stephan Mandt${}^{1,2}$ \And Padhraic Smyth${}^{1,2}$}
% \runningauthor{Alex Boyd, Yuxin Chang, Stephan Mandt, Padhraic Smyth}

% \aistatsaddress{${}^1$Department of Statistics\quad ${}^2$Department of Computer Science \\ University of California, Irvine} ]



\author[1]{\href{mailto:<alexjb@uci.edu>}{Alex~Boyd}{}}
\author[2]{Yuxin~Chang}
\author[1,2]{Stephan~Mandt}
\author[1,2]{Padhraic~Smyth}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    University of California, Irvine
}
\affil[2]{%
    Department of Computer Science\\
    University of California, Irvine
}
  
\begin{document}
\maketitle

\begin{abstract}
Marked temporal point processes (MTPPs) are a general class of stochastic models for modeling the evolution of events of different types (``marks'') in continuous time. These models have broad applications in areas such as medical data monitoring, financial prediction, user modeling, and communication networks. Of significant practical interest in such problems is the issue of missing or censored data over time. In this paper, we focus on the specific problem of inference for a trained MTPP model when events of certain types are not observed over a period of time during prediction.
We introduce the concept of mark-censored sub-processes and use this framework to develop a novel marginalization technique for inference in the presence of censored marks. The approach is model-agnostic and applicable to any MTPP model with a well-defined intensity function. We illustrate the flexibility and utility of the method in the context of both parametric and neural MTPP models, with results across a range of datasets including data from simulated Hawkes processes, self-correcting processes, and multiple real-world event datasets. 
\end{abstract}


\section{Introduction}\label{sec:intro}

Stochastic models for event data evolving in continuous time are typically referred to as temporal point processes. An important class within this general family is \emph{marked temporal point processes} (MTPPs), where each event in time is associated with a random outcome known as a mark. In general, the mark can be either discrete or continuous; in this work we focus on discrete marks. The flexibility of MTPPs has allowed them to be used in a broad range of applications, including medical diagnosis \citep{islam2017marked}, epidemic spread models \citep{marmarelis2022metric}, environmental data analysis \citep{brillinger2000}, financial data prediction \citep{zhu2021probabilistic,shi2022state}, communication network modeling \citep{mishra2013anonymity}, user behavior analysis \citep{yang2021atpp, hatt2020early}, misinformation spread models \citep{zhang2021vigdet}, and activity prediction \citep{fortino2020exploiting}. 

The foundations of MTPP models originate in the statistical literature \citep[e.g.,][]{lewis1972multivariate,daley2003introduction, andersen2012statistical}, with subsequent development of specific classes of MTPPs such as multivariate self-exciting Hawkes processes \citep{hawkes1971spectra} and multivariate self-correcting processes \citep{zheng1991application}. 
More recently, there has been significant activity in  the development of machine learning methods for MTPPs, with a significant emphasis on approaches
%including variants of multivariate Hawkes processes \citep{ghassemi2022online},
that take advantage of neural representation learning, such as recurrent MTPPs \citep{du2016recurrent},  neural Hawkes processes \citep{mei2017neural}, stochastic variants of deep MTPPs \citep{hong2022deep}, 
 %latent MTPPs \citep{linderman2017bayesian}, 
scalable deep MTPPs \citep{turkmen2020fastpoint}, as well as general approaches to forecasting with deep MTPP models \citep{deshpande2021long}.


%Marked temporal point processes (MTPP) are a powerful class of probabilistic models that are able to capture the dynamics of sequences of marked events (i.e., pairs of times and categorical labels). In doing so, they enable straightforward ways of generating and predicting future trajectories of sequences (either from scratch or conditioning on partially completed sequences). 
 
An important practical aspect of working with real-world event data is that censoring of observations can occur in a number of different ways. 
For example, right-censoring commonly occurs in survival analysis (a sub-field of temporal point processes) when a patient's event of interest is unobserved due to the end of a data collection period. This particular type of censoring is well studied, and there are well-known methods for accommodating it during training and inference. More recently, there has been work on handling broader categories of censoring for neural MTPP models, for example, censoring where each event has a type-specific probability of being missing \citep{mei2019imputing}.
% all events are missing during some time-period (e.g., \cite{mei2019imputing}).

In this paper we focus on a different problem, the problem of making predictions when some, 
% \emph{but potentially not all}, 
or all,
marks are censored over (potentially open-ended) intervals of time, i.e., there is partial censoring of a specific subset of marks. We will refer to this type of censoring as \emph{mark-censoring}. 
% To our knowledge, there has been no prior work that addresses this problem of inference with mark-censored sequences for MTPPs in general. 
To our knowledge, there has been no prior work that addresses this problem of adapting MTPPs to mark-censored sequences at inference time.
The problem is motivated by the real-world scenario where an MTPP model has been trained on a known set of marks with fully-observed data, but where at prediction time some of the marks (and their associated timing) are no longer observable. 
For example, in medical data analysis, certain types of events that were measured in the training dataset at a particular hospital might no longer be recorded when the model is deployed at a different hospital. % due to a lack of equipment. 
%Or, in modeling user behavior with apps on a mobile phone, event types from certain apps might not be available due to software changes when the model is deployed (but were available during the training period). 
%Lastly, 
Or, in system monitoring, all events of a certain type could be censored over a window of time  due to events such as network and power outages, and accommodating such gaps is important for modeling future dynamics once outages are resolved.
% For instance, the classification of categories of items on Amazon or the genres of Netflix may change over time, then the previous category label is no longer available if we remove some unpopular categories with fewer items or combine two sub-categories into a single one.

Previous work such as \citet{linderman2017bayesian} focuses on special cases of missingness patterns and/or only applies to specific model architectures (as will be discussed in more detail in \cref{sec:related}). In contrast, our work is able to handle all of the scenarios shown in \cref{fig:example_censoring}.
%where we focus on the problem of censoring only occurring at inference time but not during training.
The basis of our approach is a novel marginalization technique that can correct the intensity for the censoring of marks. Our proposed method is \emph{model-agnostic} in that it can be applied to any MTPP with a well-defined intensity function. We demonstrate this by employing our method on different types of MTPP models and evaluating predictive performance and simulation behavior under a censored-mark regime.

\begin{figure}
    \centering
    \includegraphics[width=0.975\linewidth]{figures/example_censoring.pdf}
    \caption{Visualization of an example sequence with four possible marks. $\mathbb{M}$ is the vocabulary of possible event types, $\hist_k$ is the history of events with types equal to $k$. Boxes  over  sequences represent different  modes of censoring that could occur during generation: (1) mark-agnostic censoring for a particular interval, (2) censoring of green and red marks over an open interval, and (3) censoring of blue and orange marks over a finite interval. The occurrence of an event or the total count of events during an interval is not known, differentiating our scenarios from the typical ``interval censoring'' in survival analysis or MTPPs.}% and survival analysis.} 
    \label{fig:example_censoring}
\end{figure}




\section{Related Work}\label{sec:related} 

A broad range of temporal censoring scenarios 
%of temporal point processes or general time series data 
have been studied in the literature, such as asynchronous event times \citep{upadhyay2018deep, trouleau2019learning} and interval-censored point process data \citep{fan2009local, rizoiu2022interval}. 
Here we restrict the discussion of related work to MTPPs where the marks are from a fixed vocabulary.
Existing work on missingness %that studied different missingness mechanisms 
in this context can broadly be divided into three categories.

The first category considers various incomplete intervals, regardless of event types, and focuses on novel tasks such as imputing missing events and sequential representation learning. For example, \citet{shchur2019intensity} proposed a flow-based mixture model that enables closed-form sampling and handles missing data through imputation. \citet{xu2017learning} assumes that a proportion of each short doubly-censored event sequence is observed, and in turn proposes a sampling-stitching data synthesis method based on parametric Hawkes processes to sample long training sequences that improve predictions. 
% that conditioned on future observations and ensemble possible complete sequences (particles) for particle smoothing using a sequential Monte Carlo approach.
% cite particle filtering? http://approximateinference.org/2017/accepted/LindermanEtAl2017.pdf

% The second category increases the modeling power by allowing any latent unobserved event, regardless of event type, in point processes. 
The second category considers the scenario in which each individual event, regardless of mark or time of occurrence, has a chance of being censored.
For the Hawkes process, for example, sampling methods were developed to identify latent structure in the data \citep{shelton2018hawkes} or to correct for biased marks that are underrepresented \citep{zhou2021multivariate}. In  neural settings, \citet{gupta2021learning, gupta2022modeling} proposed the use of two MTPPs to model missing events in order to make better predictions. 
%Assuming each event is independently censored with event-type specific probabilities, 
\citet{mei2019imputing} proposed bidirectional-LSTM models that are conditioned on future observations to apply particle smoothing to impute unobserved events.

The third category of prior work assumes that events are observed but the mark and/or the exact event time is unknown. For instance, \citet{deutsch2020abc} developed an approximate Bayesian
 algorithm to fit Hawkes processes in the presence of noisy event times, and \citet{calderon2021linking} 
 %proposed a Partial Mean Behavior Poisson process for 
 addressed partially interval-censored Hawkes processes, where the total event counts on the censored intervals are available. For the case of Hawkes models, \citet{linderman2017bayesian} imputed latent marks and developed a sequential Monte Carlo approach for latent Hawkes processes that can also be applied to multiple types of censoring.


In summary, previous approaches to censoring in MTPPs either focus on specific types of missingness mechanisms during training time or focus on one specific type of model such as parametric or neural Hawkes process models. In contrast, our approach considers a broad range of interval- and mark-censoring mechanisms (see \cref{fig:example_censoring}) and is model-agnostic in that it can work with any MTPP model with a marked intensity function at prediction time. Furthermore, our method yields a well-defined intensity function of an MTPP that can be used in the same way as that of any other MTPP, meaning that various statistics can be computed, such as the expected next event (time and mark), the log-likelihood of partially observed sequences, etc.
% , and allows censoring to occur at inference time on models trained %with fully-observed data 
% without any additional finetuning.
% without training a new model even if the model is trained on fully-observed data.
%marks. 


\section{Mark-Censored Temporal Point Processes}\label{sec:censoring}
% \section{Censoring of MTPPs}\label{sec:censoring}
\subsection{Preliminaries}
\paragraph{Notation}
Let $\tau_1, \tau_2, \dots \in \R_{\geq 0}$ be a sequence of continuous random variables that are strictly ordered, i.e., $\tau_i < \tau_{i+1}$ for all $i$. These variables represent the times of occurrence of events of interest. Accompanying each event time is a piece of information, such as a label or location, that is commonly referred to as a \emph{mark}. We will represent each mark as a random variable that takes on discrete values from a fixed set of $M$ values: $\kappa_i \in \mathbb{M}\equiv\{1, \dots, M\}$.
 
Let the \emph{history} of events up until, but not including, time $t$ be denoted as $\hist(t)=\{(\tau_i,\kappa_i)\sep \tau_i < t \text{ for } i=1,2,\dots\}.$
This implies that $\hist(\tau_i)=\{(\tau_1,\kappa_1),\dots,(\tau_{i-1},\kappa_{i-1})\}$. 
For our purposes, we will often refer to histories over specific ranges of time such as $\hist[a,b)$ for all events with times occurring in the interval $[a,b)$.
% Realizations of these sequences of events are referred to as \emph{marked temporal point patterns} and represented with lower-case letters: $h(t)=\{(t_i,k_i)\sep t_i < t \text{ for } i\in\Z^+\}$.
Additionally, it is often convenient to consider \emph{mark-specific} histories (i.e., sequences that only contain events of specified marks). These will be denoted as either $\hist_A:=\{(\tau,\kappa)\in\hist\sep \kappa \in A\}$ or $\hist_k:=\{(\tau,\kappa)\in\hist\sep \kappa=k\}$ for $A\subset\mathbb{M}$ and $k\in\mathbb{M}$. 

\paragraph{Marked Temporal Point Processes}
The generative mechanisms for these event sequences are generally referred to as \emph{marked temporal point processes} (MTPPs). 
MTPP models define a probability distribution over a given sequence of $N$ events, $\prob(\hist[0,\tau_N])$.\footnote{For brevity and consistent notation, we will be using $\prob(\cdot)$ in reference to both probability density and mass when appropriate.} These models are typically constructed in an autoregressive fashion,
\begin{align*}
\prob(&\hist[0,\tau_N])=\prod_{i=1}^{N}\prob(\tau_i,\kappa_i\sep\hist[0,\tau_{i-1}]), %\\
%& = \prod_{i=1}^{N}\prob(\tau_i\sep\hist[0,\tau_{i-1}])\prob(\kappa_i\sep\tau_i,\hist[0,\tau_{i-1}])
\end{align*}
where the joint distribution for the next event $(\tau_i,\kappa_i)$ conditioned on all prior events is modeled by the expected, instantaneous rate of change for each mark. This is referred to as the \emph{marked intensity function} and is defined formally as
\begin{align*}
\lambda_k(t\sep\hist(t))dt:=\E_p\left[\ind(|\hist_k[t,t+dt)|=1)\sep \hist(t)\right].
\end{align*}
For brevity, we typically use the following $*$ convention to suppress the conditional: $\lambda^*_k(t):=\lambda_k(t\sep\hist(t))$. Additionally, the following notation will be used to represent the sum of different marked intensities: $\lambda^*_A(t):=\sum_{k\in A}\lambda^*_k(t)$ for $A \subset \mathbb{M}$. Note that these functions not only condition on the preceding events, but also on the fact that no events have occurred since the last event up until time $t$, i.e., $\prob(\cdot\sep\hist[0,t)) \neq \prob(\cdot\sep\hist[0,\tau_{i-1}])$.

The \emph{total intensity function} $\lambda^*(t):=\lambda^*_\mathbb{M}(t)$, also referred to as the \emph{ground intensity}, is sufficient to describe the timing of the next event $\tau_i$. The distribution of the mark conditioned on the timing of the next event is naturally described as 
$\prob(\kappa_i=k\sep\tau_i=t,\hist(t))\equiv \frac{\lambda^*_k(t)}{\lambda^*(t)}$. We assume that any model we work with natively outputs a vector of marked intensity functions over the mark space $\mathbb{M}$ evaluated at time $t$. 
% Any MTPP with a defined set of marked intensity functions can be easily sampled from by utilizing a thinning procedure \citep{ogata1981lewis}, if not directly.

Lastly, the likelihood of a given sequence $\hist$ of length $N$ over an observation window $[0,T]$ can be computed in terms of intensity values:
\begin{align}
\prob(\hist[0,T])=\left(\prod_{i=1}^{N}\lambda_{\kappa_i}^*(\tau_i)\right)\exp\left(-\int_0^T \lambda^*(s)ds\right)\hspace{-0.2em}.\label{eq:likelihood}
\end{align}

\paragraph{Sampling} Any well-behaved MTPP can be easily sampled from by using a thinning procedure \citep{ogata1981lewis}, if not directly. This procedure relies on the fact that the superposition of two point processes can be characterized as another point process whose total intensity is the sum of the individual total intensities. As such, one can sample candidate event times from a homogeneous Poisson process with rate $D$ that dominates the total intensity of the MTPP of interest. These candidate times are accepted sequentially with probability $\lambda^*(t) / D$, and the mark of each accepted event is then sampled from $\prob(\kappa_i=k\sep\tau_i=t,\hist(t))$.


\subsection{Mark-Censored Sub-Process}

\paragraph{Problem Statement} %As mentioned previously, 
Assume that we have access to a trained MTPP with intensity functions $\lambda^*_k(t)$ for $k\in\mathbb{M}$. We are interested in performing inference on such a model in the presence of censoring. In particular, we are interested in a type of censoring we term \emph{mark-censoring} in which only events of types $k\in\mathbb{O}\subset\mathbb{M}$ are observed, while all events of types $k\in\mathbb{C}:=\mathbb{M}\setminus\mathbb{O}$ are censored and unobserved. 
Specifically, under mark-censoring we assume that we know (a) the time interval over which censoring occurs and (b) which kinds of marks are missing (e.g., the time intervals and colors of marks in the censoring boxes displayed in \cref{fig:example_censoring}). 
Below we develop the framework for the case when censoring takes place over all of time (i.e., $t\in[0,\infty)$); however, as we will discuss later in this section, the general approach can be directly applied to a range of more complicated censoring schemes (such as those illustrated in \cref{fig:example_censoring}).
% For now we will assume that this censoring takes place over all of time (i.e., $t\in[0,\infty)$); however, later on in this work we will extend the developed framework to handle more complicated censoring schemes.

% Since this adequately models the entire distribution for event sequences as a whole, it then stands to reason that embedded within this model is a well defined sub-process that represents a MTPP that only observes events of types $k\in\mathbb{O} \subset \mathbb{M}$, while events of types $k\in\mathbb{C} := \mathbb{M} \setminus \mathbb{O}$ are censored and unobserved. Without loss of generality, we will assume that this censoring is taking place over the interval $[0,\infty)$.\footnote{More complex scenarios are possible, such as having a finite window or having multiple different censoring windows with different censored sets of marks. Details for accommodating these will be listed later.}

% Note that this is can almost be thought of as possessing an equivalent model to the original one except it was only defined for events that only have marks $k\in\mathbb{O}$; however, one key difference is that we are assuming that the previously described scenario still allows for conditioning on events of types $\mathbb{C}$ even if they are censored moving forward in time. This embedded model that only observed events of types $k\in\mathbb{O}$ will be referred to as the censored sub-process.

\paragraph{On Censoring}
The term ``censoring'' can be quite a loaded concept with regard to statistical models. 
% In our work, we assume the censoring of sequences to refer to partially observing them with known information as to \emph{when} the censoring occurs and to \emph{which} marks. 
In our work we assume the absence of certain marks over a time interval to correspond to \emph{missing completely at random} (MCAR) \citep{heitjan1996distinguishing}, i.e., we assume that the realized sequence $\hist$ (both observed and unobserved portions) is independent of \emph{why} it is censored in the first place. We leave handling of more informative censoring to future work.


\begin{figure*}
    \centering
   % \includegraphics[width=0.74\linewidth]
    \includegraphics[width=0.86\linewidth]{figures/example_intensities_spread_two_col.pdf}
    \caption{Intensity visualizations (lines) alongside conditioned sequences (dots) for a sequence sampled from a self-correcting point process (top), the same process with blue marks censored from time 3 to 7 (middle), and the naive intensity results for the censored sequence (bottom). The middle sequence displays both the observed sequence as opaque dots and the various censored continuations sampled from the importance distribution as transparent dots. Note that the intensity of the censored mark (blue) after the censoring interval (at time 7) does not necessarily equal the intensity before censoring (at time 3).}
    \label{fig:example_intensities}
\end{figure*}


\paragraph{Censored Intensity Function} 
Since we have access to the original MTPP, which models the entire distribution for event sequences as a whole, embedded within this model is a well-defined sub-process that represents an MTPP that only observes events of types $k\in\mathbb{O}$. We refer to this embedded model as a \emph{mark-censored sub-process}. This sub-process can be thought of as the original model with the censored information marginalized out of it. Had this sub-process been our intended model from the beginning, we could have achieved comparable results by censoring the original training data and training a model on what remains. There is one key difference, however, which is that the mark-censored sub-process still allows for conditioning on events of types $\mathbb{C}$ even if they are censored moving forward in time (e.g., in the case that the censoring interval only started at time $t > 0$ instead of at $t=0$---see case 3 in \cref{fig:example_censoring}).

The censored sub-process is a fully-fledged MTPP, and as such it has its own set of marked intensity functions. We will denote these as $\underline{\lambda}^*_k(t)$ for $k \in \mathbb{O}$ (for $k\in\mathbb{C}$ we set $\underline{\lambda}^*_k(t)=0$). Likewise, the total intensity for a censored sub-process is defined as $\underline{\lambda}^*(t):=\underline{\lambda}^*_\mathbb{O}(t)$. These will be referred to as the \emph{censored intensity} from here forward. Note that for any MTPP with well-defined intensity functions $\lambda_k^*$, the existence of the censored intensity $\underline{\lambda}_k^*$ under arbitrary censoring is justified by the superposition property of point processes \citep{daley2003introduction}.



\paragraph{High Level Intuition for Censored Intensity}
Later in this section we will present a formal definition of the censored intensity, as well as a tractable estimator for it that solely relies on the original underlying MTPP with likelihood $p$ and intensity $\lambda^*_k(t)$ functions for $k\in\mathbb{M}$. However, prior to presenting these, we will first give an informal overview to help understand the arguments at a high level.

We start by recognizing that we are interested in obtaining the intensity at time $t$ for a censored point process where we only observe events of types $k\in\mathbb{O}$ and no events of types $k\in\mathbb{C}$. To accomplish this, we would prefer to directly marginalize out all possible sequences of $\hist_\mathbb{C}(t)$; however, for most MTPPs this is unobtainable analytically. Instead, we can approximate the censored intensity $\underline{\lambda}^*_k(t)$ for $k\in\mathbb{O}$ with the original intensity by simply sampling a possible sequence $\tilde{\hist}_\mathbb{C}(t)$ from the original point process:
\begin{align*}
\underline{\lambda}^*_k(t) \approx \lambda_k(t\sep \hist_\mathbb{O}(t), \tilde{\hist}_\mathbb{C}(t)),
\end{align*}
where $\tilde{\hist}_\mathbb{C}(t) \sim \prob(\cdot \sep \hist_\mathbb{O}(t))$. Naturally, we cannot directly perform this sampling, so we will do the next best thing and simply sample from the model as usual except that we will prevent any new event with types $k\in\mathbb{O}$ from occurring (i.e., set $\lambda_k^*(t)=0$ when sampling).



To get a better approximation, this should be done many times with different sampled trajectories: $\tilde{\hist}^{(i)}_\mathbb{C}(t)$ for $i = 1,\dots,n$. One could simply compute a standard average where $\underline{\lambda}_k^*(t)\approx \frac{1}{n} \sum_{i=1}^n \lambda_k(t\sep \hist_\mathbb{O}(t), \tilde{\hist}_\mathbb{C}^{(i)}(t))$; however, since the $\tilde{\hist}^{(i)}_\mathbb{C}(t)$ were sampled from an adjusted version of the model rather than from the desired conditional distribution, we must account for the fact that some samples will be more likely under the original model than others.

As such, we can instead perform a weighted average:
\begin{align*}
\underline{\lambda}_k^*(t) \approx \frac{\sum_{i=1}^n \lambda_k\!\left(t\sep \hist_\mathbb{O}(t), \tilde{\hist}^{(i)}_\mathbb{C}(t)\right)\omega\!\left(\tilde{\hist}_\mathbb{C}^{(i)}(t)\right)}{\sum_{i=1}^n \omega\!\left(\tilde{\hist}_\mathbb{C}^{(i)}(t)\right)}
\end{align*}
where $\omega(\cdot)$ determines the weight of a sampled trajectory. We define this weight to be the probability of the imposed sampling restriction (i.e., no \emph{new} events of types $k\in\mathbb{O}$ allowed) being satisfied under the original model. This can be computed for a given sample and is equal to
\begin{align*}
\omega(\tilde{\hist}_\mathbb{C}(t)) = \exp\left(-\int_0^t \lambda_\mathbb{O}(s \sep \hist_\mathbb{O}(s), \tilde{\hist}_\mathbb{C}(s))ds \right).
\end{align*}



% \begin{figure*}
%     \centering
%    % \includegraphics[width=0.74\linewidth]
%     \includegraphics[width=0.88\linewidth]{figures/example_intensities_spread_two_col.pdf}
%     \caption{Intensity visualizations (lines) alongside conditioned sequences (dots) for a sequence sampled from a self-correcting point process (top), the same process with blue marks censored from time 3 to 7 (middle), and the naive intensity results for the censored sequence (bottom). The middle sequence displays both the observed sequence as opaque dots and the various censored continuations sampled from the importance distribution as transparent dots. Note that the intensity of the censored mark (blue) after the censoring interval (at time 7) does not necessarily equal the intensity before censoring (at time 3).}
%     \label{fig:example_intensities}
% \end{figure*}


As an illustration of this censored intensity $\underline{\lambda}_k^*(t)$,   \cref{fig:example_intensities}  shows the original, censored, and naive intensities for an example sequence sampled from a self-correcting process. After the censoring interval (in gray) ends at $t=7$, the censored intensity tracks the original true intensity (top) much more closely than the naive intensity (bottom) does. In this context, naive intensity is referring to the original intensity being computed while treating the partially observed sequence $\hist_\mathbb{O}$ as if it were the fully observed sequence $\hist$.

% \footnotetext{The intensity function is defined as $\lambda_k(t)=\exp\left(\mu_k t - \sum_{\tau,\kappa \in \mathcal{H}(t)}\alpha_{\kappa,k}\right)$ where $\mu_1=\mu_2=\frac{1}{5}$, $\alpha_{1,1}=\alpha_{1,2}=\alpha_{2,2}=\frac{1}{4}$ and $\alpha_{2,1}=0$.}

The above approximation of $\underline{\lambda}_k^*(t)$ uses a finite number of samples and is a ratio estimator \citep{tin1965comparison}. Since ratio estimators are consistent, taking the limit as $n\rightarrow\infty$ converts each (normalized) summation into an expected value with respect to the proposal distribution. This description matches what will formally be derived below in \cref{eq:censored_intensity_result}. Please refer to the Appendix for an in-depth analysis of the bias and variance of this estimator when using finite samples.


%%%%%%
\paragraph{Formal Definition of $\underline{\lambda}$}
Without loss of generality, we will assume that any prior events being conditioned on have been shifted to end at $t=0$ such that $\hist(0)$ contains all of the previous events. It can be shown that the censored intensity function for the sub-process is just a specific marginalization of the original intensity function:
\begin{align*}
& \underline{\lambda}^*_k(t) := \underline{\lambda}_k(t\sep\hist(0),\hist_\mathbb{O}[0,t)=\emptyset) \text{ for } k \in \mathbb{O} \\
& = \lim_{\Delta \downarrow 0} \frac{1}{\Delta} \Prob(\hit(k)\in [t, t+\Delta) \sep \hist(0),\hist_\mathbb{O}[0,t)=\emptyset) \\
& = \lim_{\Delta \downarrow 0} \frac{1}{\Delta} \E_{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)} \big[ \\
& \quad\quad \Prob(\hit(k)\in [t, t+\Delta) \sep \hist(0),\hist_\mathbb{O}[0,t)=\emptyset, \hist_\mathbb{C}[0,t))\big] \\
& = \lim_{\Delta \downarrow 0} \frac{1}{\Delta} \E_{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)} \big[ \\
& \quad\quad \Prob(\tau_i\in [t, t+\Delta), \kappa_i=k \sep \hist(t))\big], \quad |\hist(t)| = i-1\\
& = \E_{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)} \big[ \\
& \quad\quad \lim_{\Delta \downarrow 0} \frac{1}{\Delta} \Prob(\tau_i\in [t, t+\Delta), \kappa_i=k \sep \hist(t))\big] \text{ by DCT} \\
& = \E_{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)} \left[\lambda^*_k(t)\right] 
\end{align*}
where in this context, $\hit(k)$ refers to the first occurrence time of event $k$, and $\hist(t):=\hist(0)\cup\hist_\mathbb{O}[0,t)\cup\hist_\mathbb{C}[0,t)$. The Dominated Convergence Theorem (DCT) holds true because we assume that there exists some value $D$ that is greater than $\lambda^*_k(t)$ for any given $t$. Note that this assumption is typically made to sample from arbitrary MTPPs.


\paragraph{Tractable Estimation of Censored Intensity}

To approximate the censored intensity function $\underline{\lambda}_k^*(t)$, we need to perform a Monte Carlo estimation on the above derived expected value, $\E_{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)} \left[\lambda^*_k(t)\right]$. The only issue is that we cannot directly sample from $\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)$ due to the autoregressive nature of MTPPs. 

Consider the proposal distribution $\q$, which is an MTPP with intensity function
\begin{align}
\mu^*_k(t) = \begin{cases} 0 & \text{ if } k \in \mathbb{O} \text{ and } t \geq 0 \\
\lambda^*_k(t) & \text{ otherwise.} \label{eq:proposal_intensity}
\end{cases}
\end{align}
This can essentially be thought of as the original MTPP prior to censoring, and then during sampling it only produces sequences of events that cannot be observed. The likelihood for a sequence under this distribution is computed as follows:
\begin{align}
& \q(\hist_\mathbb{C}[0,t)) := \q(\hist_\mathbb{C}[0,t) \sep \hist(0)) \label{eq:proposal_likelihood} \\
& = \left[\prod_{i=1}^N \mu^*_{\kappa_i}(\tau_i)\right]\exp\left(-\int_0^t \mu^*(s)ds \right) \nonumber \\
& = \!\left[\prod_{i=1}^N \lambda^*_{\kappa_i}(\tau_i) \ind(\kappa_i \in \mathbb{C})\right]\!\exp\left(-\int_0^t \lambda^*_\mathbb{C}(s)ds \right) \nonumber 
\end{align}
where $|\hist_\mathbb{C}[0,t)|=N$.
Note that the proposal distribution has the same support as $\prob(\hist_\mathbb{C}[0,t)\sep\hist(0),\hist_\mathbb{O}[0,t)=\emptyset)$.\footnote{It follows that $\E_{\q(\hist_\mathbb{C}[0,t))}\left[\ind(\hist_\mathbb{O}[0,t)=\emptyset)\right]=1$, which becomes useful for subsequent derivations.}

Using importance sampling with this proposal distribution, we can see that the censored intensity becomes tractable:
\begin{align*}
&\underline{\lambda}^*_k(t)  = \E_{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)} \left[\lambda^*_k(t)\right] \\
&\! = \E_{\q(\hist_\mathbb{C}[0,t))}\!\! \left[\lambda^*_k(t)\frac{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0), \hist_\mathbb{O}[0,t)=\emptyset)}{\q(\hist_\mathbb{C}[0,t))}\right] \\
&\! = \E_{\q(\hist_\mathbb{C}[0,t))}\!\! \left[\lambda^*_k(t)\frac{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0))\ind(\hist_\mathbb{O}[0,t)=\emptyset)}{\prob(\hist_\mathbb{O}[0,t)=\emptyset\sep \hist(0))\q(\hist_\mathbb{C}[0,t))}\right] \\
&\! =\! \frac{\E_{\q(\hist_\mathbb{C}[0,t))}\!\! \left[\lambda^*_k(t)\frac{\left[\prod_{i=1}^N \lambda^*_{\kappa_i}(\tau_i) \right]\exp\left(-\int_0^t \lambda^*(s)ds \right)}{\left[\prod_{i=1}^N \lambda^*_{\kappa_i}(\tau_i) \ind(\kappa_i \in \mathbb{C})\right]\exp\left(-\int_0^t \lambda^*_\mathbb{C}(s)ds \right)}\right]}{\prob(\hist_\mathbb{O}[0,t)=\emptyset\sep \hist(0))} \\
&\! = \frac{\E_{\q(\hist_\mathbb{C}[0,t))} \!\! \left[\lambda^*_k(t)\exp\left(-\int_0^t \lambda^*_\mathbb{O}(s)ds \right)\right]}{\prob(\hist_\mathbb{O}[0,t)=\emptyset\sep \hist(0))}.
\end{align*}
Note that in this context $\prob(\hist_\mathbb{C})$ is equivalent to the likelihood of $\hist_\mathbb{C}$ under the original model $p$, \emph{as if it were a fully observed sequence $\hist$}.

Now the expected value can be approximated with easy-to-access Monte Carlo samples. The only immediate problem is evaluating $\prob(\hist_\mathbb{O}[0,t)=\emptyset\sep \hist(0))$ as this does not have a closed form solution; however, as in the recent approach of \citet{boyd2022probabilistic}, we can estimate this statement using importance sampling.
% conveniently this statement falls under a class of estimable probabilistic queries that can take advantage of importance sampling to estimate \citep{boyd2022probabilistic}. 
Interestingly, we can actually utilize the exact same proposal distribution $q$ as specified in \cref{eq:proposal_intensity,eq:proposal_likelihood} to represent $\prob(\hist_\mathbb{O}[0,t)=\emptyset\sep \hist(0))$ as a tractable expected value:
% this probabilistic statement as an expected value:
\begin{align*}
\prob(&\hist_\mathbb{O}[0,t)=\emptyset\sep \hist(0))  \!=\! \E_{\prob(\hist[0,t)\sep \hist(0))}\left[\ind(\hist_\mathbb{O}[0,t)=\emptyset)\right] \\
& = \E_{\q(\hist_\mathbb{C}[0,t))} \left[\ind(\hist_\mathbb{O}[0,t)=\emptyset)\frac{\prob(\hist_\mathbb{C}[0,t)\sep \hist(0))}{\q(\hist_\mathbb{C}[0,t))}\right] \\
& = \E_{\q(\hist_\mathbb{C}[0,t))}\left[\exp\left(-\int_0^t \lambda^*_\mathbb{O}(s)ds\right)\right].
\end{align*}
Thus, the censored intensity can be ultimately represented as a ratio of two expected values:
\begin{align}
\hspace{-0.975em}\implies \underline{\lambda}^*_k(t) & = \frac{\E_{\q(\hist_\mathbb{C}[0,t))}\!\! \left[\lambda^*_k(t)\exp\left(-\int_0^t \lambda^*_\mathbb{O}(s)ds \right)\right]}{\E_{\q(\hist_\mathbb{C}[0,t))} \!\! \left[\exp\left(-\int_0^t \lambda^*_\mathbb{O}(s)ds \right)\right]}.\! \label{eq:censored_intensity_result}
\end{align}
In practice, this censored intensity can be approximated using Monte Carlo (MC) estimates for both the numerator and denominator.

It is worth reiterating that this estimator, which accounts for the censoring of marks $\mathbb{C}$ at inference time, only requires a  trained MTPP along with samples from it. No further training, additional models, or specific architectures are required to properly deal with the censoring. 


\paragraph{More Complex Censoring Regimes}
All of the derivations thus far have been focused on having a static set of marks $\mathbb{C}$ being censored for an indefinite amount of time; however, there are many other types of censoring that can occur for a given MTPP. For example, the censoring could occur over a specific window of time for either some or all marks $\mathbb{M}$. This could occur, for instance, in settings where the connection is briefly lost to some or all sensors in a system. Furthermore, censoring could occur multiple times over different windows, and the marks being censored across each window need not be the same from censoring to censoring. See \cref{fig:example_censoring} for example censoring scenarios.

We can easily extend our previous results to cover the most general case allowing for censoring over arbitrarily many time windows and arbitrarily different censored marks. To do so, first we will define the censoring schedule. The observed and censored marks, $\mathbb{O}$ and $\mathbb{C}$, are no longer static and will potentially change over time. This will be represented via $\mathbb{O}(t), \mathbb{C}(t) \subset \mathbb{M}$ for $t \geq 0$. This results in the proposal distribution $q$ now being characterized by the intensity function $\mu_k^*(t)=\lambda^*_k(t)\ind(k\in\mathbb{C}(t))$. Lastly, the resulting censored intensity estimate also accommodates this dynamic censoring:
\begin{align}
\underline{\lambda}^*_k(t) & = \frac{\E_{\q(\hist[0,t))} \left[\lambda^*_k(t)\exp\left(-\int_0^t \lambda^*_{\mathbb{O}(s)}(s)ds \right)\right]}{\E_{\q(\hist[0,t))} \left[\exp\left(-\int_0^t \lambda^*_{\mathbb{O}(s)}(s)ds \right)\right]}. 
\end{align}
This result is achieved effectively for free as the censored intensity $\underline{\lambda}_k^*(t)$ in the static setting is technically defined individually for any given moment in time $t$, making the swap from $\mathbb{O}$ to $\mathbb{O}(t)$ and $\mathbb{C}$ to $\mathbb{C}(t)$ for each $t$ well defined.  

\paragraph{More Complex Mark Spaces $\mathbb{M}$}
Our setting of interest has the marks being modeled come from some discrete, finite mark space $\mathbb{M}:=\{1,\dots,M\}$; however, that does not have to be the case. We can easily extend our method to apply to more complex mark spaces. Consider an arbitrary mark space $\mathbb{M}$ which could be finite, continuous, high-dimensional, etc. and let $\nu$ be a reference measure for $\mathbb{M}$ (e.g., the Lebesgue measure for $\mathbb{M}\equiv\mathbb{R}$). Assume we have an MTPP model with marked intensity function $\lambda^*(t, m)$ for $m \in \mathbb{M}$, and that under our framework we know the observed and censored portions of the mark space at any given time, $\mathbb{O}(t) \subset \mathbb{M}$ and $\mathbb{C}(t):=\mathbb{M}\setminus\mathbb{O}(t)$ respectively. From this, the censored intensity defined in \cref{eq:censored_intensity_result} can be readily used by letting $\lambda_{\mathbb{O}(t)}^*(t):=\int_{\mathbb{O}(t)}\lambda^*(t,m)d\nu(m)$ which can either be computed analytically or estimated with Monte Carlo samples. The proposal distribution stays the same as previously defined and samples from it can be achieved easily using rejection sampling on top of the typical thinning procedure.

% \begin{align*}
% \cov\left(f(X)g(X), g(X)\right) & = \E\left[f(X)g^2(X)\right]-\mu_{fg}\mu_g \\
% & <  \E\left[f(X)g(X)\right]-\mu_{fg}\mu_g \text{ since } g(X) \in (0,1) \implies g^2(X) < g(X) \\
% & = \mu_{fg} - \mu_{fg}\mu_g
% \end{align*}
% \begin{align*}
% \cov\left(f(X)g(X), g(X)\right) & = \E[f(X)]\cov(g(X), g(X)) + \E[g(X)]\cov(f(X), g(X)) + \E[(f(X)-\mu_f)(g(X)-\mu_g)^2] \\
% & = \mu_f \sigma^2_g + \mu_g\cov(f(X), g(X)) + \E[f(X)g^2(X)] - \mu_f \mu_g^2
% \end{align*}


% \begin{figure}
%     \centering
%     \includegraphics[width=\linewidth]{Figures/censored_visualization.pdf}
%     \caption{Caption}
%     \label{fig:my_label}
% \end{figure}


\section{Experiments}\label{sec:experiments}
We investigate experimentally the impact that mark-censoring has on various MTPP models and the ability of our proposed marginalization method to handle such censoring relative to a baseline. Our investigations are carried out across classical parametric models on synthetic data and neural network-based models on real-world data. We find, as a whole, that in the presence of mark-censoring, the inference ability of a model (i.e., assigning likelihood to observed sequences) suffers significantly in comparison to properly accounting for the missing data via our method. Not surprisingly, we also find that our method yields larger improvements as the information being censored becomes more influential with respect to the information observed.

% We also investigate the effect of  accounting for censoring when performing the next event (time and mark) prediction. In synthetic settings, we observe different systematic differences in performance, depending on the underlying model. In real-world settings with neural models, we found roughly equal performance for time prediction and slight, consistently better, mark predictions for the mark-censored model over the baseline original model. More details, in-depth results, and discussion for the next event prediction can be found in the Appendix.
We also investigate the effect that mark-censoring has on next event (time and mark) prediction. In general, we observe systematic differences in these predictions when using our mark-censored model, with positive improvements in real-world settings. Lastly, we also perform a sensitivity analysis on the effect that both the number of sequences sampled and the resolution used in estimating integrals have on our method. We find that our method is typically fairly robust to these hyperparameters. More details and exact results for both of these experiments can be found in the Appendix.

\paragraph{Censoring}
In each of the experiments, we analyze the performance of models using various sequences $\hist(T)$ of differing lengths $T$.
For the synthetic setting, we utilize sequences that have been drawn from the given models. For the real-world data, we use held-out sequences from the dataset that a given model was trained on.

For every sequence being used, we filter out events according to a particular censoring scheme that is selected for each sequence individually to produce $\hist_\mathbb{O}(T)$. To ensure that the chosen censoring scheme is relevant for a given sequence, we randomly select a non-empty subset $\mathbb{C}(t)$ of the unique marks that actually appear in $\hist(T)$ for $t\in[0,T]$. The proportion of marks to censor, relative to the total number of unique marks in each sequence $\hist(T)$, which we will refer to as $\gamma$, is varied based on the particular sequence for the experiment being conducted.

It is important to note that, since information in $\hist(T)$ informs the censoring scheme $\mathbb{C}$, we technically no longer have data that is MCAR. As we will see, in spite of violating this assumption, the mark-censored model still yields substantial performance gains.

\paragraph{Methods \& Metric of Interest}
For the main set of experiments, we primarily compared two approaches. Both rely on an existing MTPP and are used to calculate the likelihood of a given observed sequence $\hist_\mathbb{O}(T)$.\footnote{Previous works are not compared against in these experiments due to them largely having different goals and setups (such as learning from censored data during training time or imputing missing data), as well as typically not having a proper likelihood.}

The first approach is our proposed mark-censored model, using $\underline{\lambda}_k^*$ for $k \in \mathbb{O}$. Since this is a well-defined MTPP, we can calculate the likelihood of $\hist_\mathbb{O}(T)$ using \cref{eq:likelihood} in conjunction with the censored intensity. Results for this method will be labeled as ``Censored.'' 
Synthetic and real-world experiments use 128 and 64 MC samples to estimate the censored intensity respectively;
% All synthetic experiments use 128 MC samples to estimate the censored intensity, whereas the real-world experiments all use 64 MC samples; 
both use 1024 integration points for numerically estimating integrals.

The second approach is a baseline method for comparison, based on a slight adaptation to the original model that takes advantage of knowing what marks are being 
censored for a given sequence. This method uses the original intensity $\lambda^*_k(t)$ for $k\in\mathbb{O}$ and sets the intensity to be 0 when $k \in \mathbb{C}$. Results for this method will be labeled as ``Baseline.'' In general, we expect the two methods to be comparable should $\prob(\hist_\mathbb{C}(T)=\emptyset\sep\hist_\mathbb{O}(T)) \approx 1$ (i.e., when it is unlikely that any censored events occurred), as the two methods would then produce similar intensity values.

We do not include results where we evaluate the likelihood of the observed sequence $\hist_\mathbb{O}(T)$ as if it were a fully observed, uncensored sequence under the original model. Since intensity values are always non-negative, likelihood values using this approach will \emph{never} be better than the baseline. Because of this, we only compare against the baseline as it effectively captures the original model's inference capabilities while still managing to leverage information about the mark-specific censoring scheme to some degree. Note also that none of the methods discussed earlier in Related Work are used as baselines since none are applicable to mark-censoring and model-agnostic. % across multiple different types of models.

Results are reported as likelihood ratios between the censored method and the baseline method for individual observed sequences. These ratios directly quantify how much more likely the censored method perceives a sequence to be relative to the baseline. Values above 1 are evidence in favor of the censored method, and below 1 for the baseline. It should be noted that the sequences used in these experiments are censored over the entire observation window $[0,T]$.



\subsection{Experiments on Synthetic Data}

\begin{figure}
    \centering    \includegraphics[width=\linewidth]{figures/vert_synth_box_log_likelihood_plots.pdf}
    \caption{Distributions of likelihood ratios across number of marks censored for the duration of the sequences used for synthetic experiments with self-correcting, Hawkes (dense), and Hawkes (block-diagonal) models. Values greater than 1 indicate higher likelihoods under the mark-censored model.}
    \label{fig:synth_log_likelihood}
\end{figure}

\begin{figure}
    \centering    \includegraphics[width=\linewidth]{figures/vert_int_synth_box_log_likelihood_plots.pdf}
    \caption{Distributions of likelihood ratios for a block-diagonal Hawkes model with varying interaction strengths applied to off-diagonal $\alpha$ terms. Values greater than 1 indicate higher likelihoods under the mark-censored model compared to the baseline.}
    \label{fig:synth_interaction}
\end{figure}

\begin{figure*}
    \centering
    \includegraphics[width=\linewidth]{figures/log_likelihood_plots.pdf}
    \caption{Same setup as \cref{fig:synth_log_likelihood} except with results produced on four real-world datasets with trained neural Hawkes models. Note that we display the results with respect to the absolute number of marks censored rather than the percentage censored as we suspect this has a more direct impact on the likelihood ratios, especially when dealing with sequences that naturally have few unique marks compared to the total mark space $\mathbb{M}$---as is typical in real datasets.}
    \label{fig:log_likelihood}
\end{figure*}


% \begin{figure*}[ht]
%     \centering
%     \includegraphics[width=\linewidth]{figures/synth_box_log_likelihood_plots.pdf}
%     \caption{First three sets of plots on left describe the distributions of likelihood ratios (top) and number of marks censored (bottom) for synthetic experiments with self-correcting, Hawkes (dense), and Hawkes (block-diagonal) models. Rightmost plot describes the distributions of likelihood ratios for a block-diagonal Hawkes model with varying interaction strengths applied to off-diagonal $\alpha$ terms.}
%     \label{fig:synth_log_likelihood}
% \end{figure*}

\paragraph{Models}
% To mitigate the noise from model misspecification, 
We evaluate our method on randomly instantiated parametric MTPPs including Hawkes processes \citep{hawkes1971spectra} and self-correcting processes \citep{isham1979self} (also known as stress release models \citep{zheng1991application}), where the sampled sequences are evaluated on the same model.

For Hawkes processes with exponential kernels, the intensity has the form 
$\lambda_k^*(t) = \mu_k + \sum_{\tau,\kappa\in\hist(t)} \phi_{\kappa,k}(t - \tau)$. 
% $\lambda_k^*(t) = \mu_k + \sum_{\kappa=1}^K \sum_{\tau_{\kappa, i} < t} \phi_{\kappa k}(t - \tau_{\kappa, i})$, 
% where $\tau_{\kappa, i}$ refers to the time for the $i$th event of type $\kappa$. 
The kernel can be expressed as $\phi_{i,j}(z) = \alpha_{ij} \exp(-\beta_{ij} z)$, where parameters $\alpha_{ij}, \beta_{ij} > 0$ for $i,j\in\mathbb{M}$ specify the excitation effects and decay rates respectively that events of type $i$ have on events of type $j$. We consider two different instantiations of this type of model, both with 20 marks. We refer to the first type as ``Hawkes (dense)'' with all parameters drawn from the following distributions: $\alpha_{ij}\stackrel{iid}{\sim}\text{Unif}[0.075, 0.2]$, $\beta_{ij}\stackrel{iid}{\sim}\text{Unif}[0.4, 1.2]$, and $\mu_k\stackrel{iid}{\sim} \text{Unif}[0.1,0.5]$. To better emulate realistic settings in which events correlate strongly with other events of certain types, we also consider a sparsely parameterized version which we refer to as ``Hawkes (block-diagonal)'' \citep{wu2020diagnostics}. This model is instantiated by drawing $\alpha_{ij}\stackrel{iid}{\sim}\text{Unif}[0.3, 0.8]$ when $\lfloor \frac{i-1}{5} \rfloor = \lfloor \frac{j-1}{5} \rfloor$ and $\alpha_{ij}=0$ otherwise.\footnote{Note that different ranges of values were chosen for $\alpha$ between the dense and block-diagonal Hawkes models to normalize the effective rate of events overall. This is done by, in expectation, having the same values for $\sum_{i\in\mathbb{M}} \alpha_{ij}$.} This effectively imposes a block-diagonal structure on the matrix $\{\alpha_{ij}\}$, resulting in four subgroups of correlated marks.
Values for $\mu$ and $\beta$ are drawn similarly to the dense model. 

In contrast, self-correcting processes use  the intensity function $\lambda_k^*(t) = \exp\left(\eta_k t - \sum_{\tau, \kappa \in \hist(t)} \delta_{\kappa k}\right),$
 where $\delta_{\kappa k}>0$ determines the inhibition that past events of type $\kappa$ have on future events of type $k$. The model used for this class also has 20 marks and is instantiated by drawing weights $\delta_{ij} \stackrel{iid}{\sim} \text{Unif}[0.3, 0.8]$.
Values for $\eta$ are drawn similarly to $\mu$.

% We consider dense weights in matrix $\boldsymbol{\alpha}$ for self-correcting processes and Hawkes processes; so all values in $\boldsymbol{\alpha}$ and decay rates $\boldsymbol{\beta}$ are randomly and uniformly sampled. We add additional experiments on Hawkes processes with block-diagonal structure in $\boldsymbol{\alpha}$ by imposing entries that are not block-diagonal to be 0, because there tend to be events that have stronger correlations than some of the others in the real world \citep{wu2020diagnostics}; we observe consistent results in real-world experiments. For Hawkes processes with dense weights, we scale the $\boldsymbol{\alpha}$ by $\frac{1}{4}$ to roughly normalize the total intensity,  which results in an approximately similar number of events in each sampled sequence as the model with block-diagonal weights. The values on the diagonal of $\boldsymbol{\alpha}$ represent the exciting or inhibiting effect of each event on itself, and the off-diagonal values determine the interaction between different types of events. 



\paragraph{Results}
We evaluated the likelihood ratio of 1000 censored sequences on all three models with interaction strength fixed at 0.5 (a scalar that controls the interaction between events of different types) for each value $\gamma\in\{0.2, 0.4, 0.6, 0.8\}$. Each sequence prior to censoring was sampled from each model (self-correcting, Hawkes (dense), and Hawkes (block-diagonal)) over the time window $t\in[0,2]$ and contains at most 200 events. These results are shown in \cref{fig:synth_log_likelihood}, where the likelihood ratio of the censored method compared to the baseline is visualized with respect to the number of marks censored. 
We see a systematic improvement in the estimated likelihood when using the mark-censored model. Furthermore, the improvement increases as more information is censored; however, it is clear that the improvement depends on the relationship between events and the underlying model dynamics (i.e., the form of $\lambda$) as noted by the difference in results between models.
% In all three parametric settings (i.e., Hawkes processes with dense or block-diagonal weights and self-correcting processes with dense weights), we observe systematic improvements in log-likelihood estimations under our framework. Furthermore, the Hawkes processes with block-diagonal weights consistently improve the next event predictions when there is no model misspecification, both in event type and time, which is discussed in the Appendix.
% Figure \ref{fig:synth_log_likelihood} studied the impact of our method with a varying number of censored marks or different interaction strengths (scalar numbers that define the interaction between events of different types). 
% The latter is only feasible in synthetic experiments since real-world datasets completely determine the parameters of the single model to be trained. 
% In each experiment, we randomly initiated a parametric model with 20 marks where $\alpha_{i,j} \sim \operatorname{Unif}([0.3, 0.8]), \forall \alpha_{i,j} \in \boldsymbol{\alpha}$ and $\beta_{i,j} \sim \operatorname{Unif}([0.4, 1.2]), \forall \beta_{i,j} \in \boldsymbol{\beta}$. Then we rescale $\boldsymbol{\alpha}$ or impose block-diagonal weights for dense and block-diagonal Hawkes processes respectively. For block-diagonal Hawkes processes, we consider the simple case that the 20 marks are partitioned into 4 groups with 5 events in each.
% The left three plots in Figure \ref{fig:synth_log_likelihood} are the likelihood ratios between our method and the baseline against the number of marks being censored. For each model with a single fixed interaction strength, 1000 sequences are sampled. Each sequence has at most 200 events. For each sequence, we randomly sample $\gamma = {0.2, 0.4, 0.6, 0.8}$ of the unique marks that appear in the sequence to be completely censored on the time window $T=[0, 2.0]$. 
% The censored intensity is estimated using 128 samples and 1024 integration points. 

To further investigate this, %we performed an ablation study with 
for the block-diagonal Hawkes model we artificially modulate the interaction strength between events of different types. To do this, we performed the same likelihood ratio evaluation on 1000 sequences with $\gamma=0.5$ using the same block-diagonal Hawkes model but with $\alpha^\prime_{ij}:=
c\alpha_{ij}$ if  $i\neq j$ and
$\alpha_{ij}$ otherwise for each value of $c\in\{0.1, 0.2, \dots, 1.0\}$. 
% In contrast, the right-most plot in Figure \ref{fig:synth_log_likelihood} fixes the proportion of censoring marks $\gamma = 0.5$ and varies the interaction strength of the model evenly spaced by 0.1 from 0.1 to 1. 
This results in 10 different models that have the same diagonal values in $\boldsymbol{\alpha}$ but different scales of off-diagonal values. 
The results  in \cref{fig:synth_interaction}  clearly demonstrate that properly 
%censoring a given MTPP to accommodate 
accommodating mark-censored sequences yields the biggest impact when there is high correlation between observed and censored events.
% Similarly, results are obtained on 1000 sequences with at most 200 events, and $T=2.$
% as well as 128 samples and 1024 integration points for importance sampling.

% Figure \ref{fig:synth_log_likelihood} shows systematic improvements in our method compared to the baseline. The gain in our method tends to increase with more censored marks or more interactions between events of different types. However, the improvement depends on the relationship between events and the exciting or inhibiting effect of current events on future events.



\subsection{Experiments on Real-World Data}



\paragraph{Models}
Many real-world datasets involve working with large vocabularies of possible marks, $|\mathbb{M}|=M$. Because of this, it can often be more parameter efficient to train a neural network-based MTPP rather than a classical parametric one. The model architecture of choice for our experiments is the neural Hawkes process, a continuous-time RNN that takes inspiration from the parametric Hawkes process \citep{mei2017neural}. Details on model hyperparameters, optimizer, training regime, etc. can be found in the Appendix.

\paragraph{Datasets}
We evaluate our censoring method on neural Hawkes models that have been trained individually on four different datasets.
The \textbf{Taobao} user behavior dataset \citep{zhu2018learning} contains page-viewing records of different categories ($M=1000$) from users on an e-commerce platform.
The \textbf{Reddit} dataset \citep{baumgartner2020pushshift} contains comments that users have made on various communities ($M=1000$) on the social media website \url{reddit.com}. 
\textbf{MemeTracker} \citep{leskovec2009meme} contains records of what websites ($M=5000$) a common phrase, or meme, was mentioned on over time.
Lastly, the \textbf{Email} \citep{paranjape2017motifs} dataset contains sequences of sender addresses of incoming emails ($M=808$) for each recipient within a research organization. More information on various aspects of these datasets and details of data preprocessing can be found in the Appendix. The following results are achieved using models that have been sufficiently trained on their respective datasets.

\paragraph{Results}
We evaluated the likelihood ratio of 1000 held-out, censored sequences for each dataset for each value $\gamma\in\{0.2, 0.4, 0.5, 0.6, 0.8\}$. The results are shown in \cref{fig:log_likelihood}. Similar to the results in the synthetic experiments, we see a systematic trend towards a large improvement in likelihood over censored sequences across the board. This improvement increases significantly as more marks are censored.

% It is worth noting that in the synthetic, we were afforded certain guarantees on the censoring method due to the sequences originating from the same models being used for inference. For these real-world settings, however, there exists the possibility of model misspecification that could allow the baseline to perform better for a given censored sequence. It stands to reason then that models that have higher likelihoods of held-out, fully observed sequences are better suited for use with censored sequences as well. See the Appendix for a further discussion on model misspecification.




% \begin{figure*}
%     \centering
%     \includegraphics[width=\linewidth]{figures/next_event_plots.pdf}
%     \caption{Caption}
%     \label{fig:next_event}
% \end{figure*}


\section{Conclusions}\label{sec:conclusion}

% {\bf REMINDER: be sure to briefly discuss limitations here or at the end of the Results section, e.g., what happens if we have mark-censoring during training? Could be mixed in with the discussion of future work, e.g., "A limitation of our proposed approach is that it only handles mark-censoring at prediction time; mark-censoring during training is much more challenging due to X and Y and is clearly a useful direction for future work.}

In this work we proposed a novel marginalization technique for inference in the presence of mark-censoring, for any black-box MTPP model trained on complete histories. 
Our method demonstrates systematic improvements in log-likelihood for both synthetic and real-world data settings. 

A limitation of the approach is that it is restricted to prediction time and is not practical for use during training with mark-censored training data. The main hurdle that needs to be overcome to make our method viable in this setting is that current sampling methods for MTPPs are not differentiable. 
% Another limitation is that inference scales linearly in time with respect to the expected number of censored events, which can lead to large computation times when ???. 
In addition, while our approach is guaranteed to have higher likelihood on average for MTPP models with no misspecification, this guarantee does not hold for misspecified models (e.g., on real data sources)---see Appendix for more details.
%censored inference is potentially sensitive to model misspecification when used on real sequences.
% , and our access to estimated censored intensity allows other general applications such as goodness-of-fit tests and comparing models with different vocabularies with censored data at inference time. 
% Our framework can be straightforwardly extended to continuous marks such as spatial location, as long as the underlying models have access to the proposal distributions. 
% However, the improvement of our method such as predictions can be limited by model misspecification, while we focus on the general framework to deal with mark-censoring and leave it as a future direction to develop better models to mitigate this issue.

Aside from directly addressing these limitations, potential future directions of this work include applying this approach to applications such as assessing goodness-of-fit and comparing models with different vocabularies, extending the methodology to the continuous mark setting, incorporating more informative censoring schemes (e.g., assuming data is \emph{not} MCAR), and permanently applying censoring via model distillation with a mark-censored process.
% To discuss:
% mark-censoring during training
% time complexity sensitive to interaction (scale of expected censored events)



%  For Alex: method can be straightforwardly extended to continuous marks too? e.g., spatial location


% underlying models have access to the proposal distributions
% future: continuous marks

% \newpage
% \clearpage

\paragraph{Acknowledgements}
This work was supported by National Science Foundation Graduate
Research Fellowship grant DGE-1839285, by an NSF CAREER Award, by the National Science Foundation under award numbers 1900644, 2003237, and 2007719, by the National Institutes of Health under awards R01-AG065330-02S1 and R01-LM013344, by the Department of Energy under grant DE-SC0022331, by the HPI Research Center in Machine Learning and Data Science at UC Irvine, and by Qualcomm Faculty awards.

\bibliography{references}





% \begin{abstract}
%   This is the abstract for this article.
%   It should give a self-contained single-paragraph summary of the article's contents, including context, results, and conclusions.
%   Avoid citations; but if you do, you must give essentially the whole reference.
%   For example: This whole paper is devoted to praising É. Š. Åland von Vèreweg's most recent book (“Utopia's government formation problems during the last millenium”, Springevier Publishers, 2016).
%   Also, do not put mathematical notation and abbreviations in your abstract; be descriptive.
%   So not “we solve \(x^2+A xy+y^2\), where \(A\) is an RV”, but “we solve quadratic equations in two unknowns in which a single coefficient is a random variable”.
%   The reason is that mathematical notation will not display correctly when the abstract is reused on the proceedings website, for example, and that one should not assume the abstract's reader knows the abbreviation.
%   Of course the same remarks hold for your paper's title.
% \end{abstract}

% \section{Introduction}\label{sec:intro}
% UAI 2023 papers have to be prepared using \LaTeX.
% To start writing your paper, copy \texttt{uai2023-template.tex} and replace title, authorship, and content with your own.

% The UAI 2023 paper style is based on a custom \textsf{uai2023} class.
% The class file sets the page geometry and visual style.\footnote{%
%     The class uses the packages \textsf{adjustbox}, \textsf{environ}, \textsf{letltxmacro}, \textsf{geometry}, \textsf{footmisc}, \textsf{caption}, \textsf{textcase}, \textsf{titlesec}, \textsf{titling}, \textsf{authblk}, \textsf{enumitem}, \textsf{microtype}, \textsf{lastpage}, and \textsf{kvoptions}.
% }
% The class file also loads basic text fonts.\footnote{%
%     Fonts loaded are \textsf{times} (roman), \textsf{helvet} (sanserif), \textsf{courier} (fixed-width), and \textsf{textcomp} (common symbols).
% }
% \emph{You may not modify the geometry or style in any way, for example, to squeeze out a little bit of extra space.}
% (Also do not use \verb|\vspace| for this.)
% Feel free to use convenience functionality of loaded packages such as \textsf{enumitem}.
% The class enables hyperlinking by loading the \textsf{hyperref} package.

% You are free to load any packages available in \TeX{Live}~2020 that are compatible with the UAI class.\footnote{In case this template or your submission does not compile, always first make sure your \TeX\ installation is up-to-date.}
% (Mik\TeX{} and Mac\TeX{} generally contain the same packages.)
% Do not load conflicting packages—you will get an error message—, as this complicates creating the proceedings.
% Please avoid using obsolete commands, such as \verb|\rm|, and obsolete packages, such as \textsf{epsfig}.\footnote{%
%     See \url{https://ctan.org/pkg/l2tabu}.
% }

% \swap[ ]{in the header of your source file.}{Feel free to include your own macros}

% \section{General Formatting Instructions}
% As a general rule: \emph{follow the template}.

% \subsection{Authorship}
% Reviewing is double-blind.
% However, you can already fill in your author names and affiliations in the \verb|\author| block in the preamble following the example of the template because the class will remove it as long as the option \textsf{accepted} is not passed to the class.
% Nevertheless, make sure any other information in the paper does not disclose your identity, for example URLs to supplementary material.

% \subsection{Sectioning}
% Three numbered sectioning commands are provided: \verb|\section|, \verb|\subsection|, and \verb|\subsubsection|.
% Please respect their order, so do not put a \verb|\subsubsection| directly beneath a \verb|\section|.
% One unnumbered sectioning command is provided, \verb|\paragraph|.
% It can be used directly below any numbered section level.
% Do not use any other sectioning commands.

% \subsubsection{Typing the Section Titles}
% The \verb|\section| and \verb|\subsection| titles are uppercased by the class.
% Please type them in title case.
% (This is used in the PDF bookmarks.)
% Please also write the \verb|\subsubsection| titles in title case.

% \paragraph{What is title case?}
% \href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
% \begin{quote}
%     Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
%     When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
% \end{quote}

% \subsection{References, Citations, Footnotes}\label{sec:etc}
% \subsubsection{Cross-Referencing}
% Always use \verb|\label| and \verb|\ref|—or a command with a similar effect—when cross-referencing.
% For example, this subsection is Section~\ref{sec:etc}.

% \subsubsection{Citations}
% Citations should include the author's last name and year.
% They should be part of the sentence.
% An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
% An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
% Do not use a parenthetical citation where a textual one is appropriate.
% An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
% The reference style you use should be compatible with the author-year citations.
% Both the citation style and reference style used should be consistent.

% For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
% For example, writing
% “I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
% Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

% \subsubsection{Footnotes}
% You can include footnotes in your text.\footnote{
%     Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
% }
% The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
%     A footnote is material put at the foot of a page.
% }
% for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

% \section{Math}\label{sec:math}
% The class file does not load any math support package like \textsf{amsmath}\footnote{%
%   See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
% }.
% We advise using the \textsf{mathtools}\footnote{%
%   See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
% }
% package, which extends \textsf{amsmath} with fixes and even more useful commands.
% Feel free to load other support packages for symbols, theorems, etc.

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
% (Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

% The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.

% \subsection{Figures}\label{sec:figures}
% Figures should go in the \texttt{figure} environment and be centered therein.
% The caption should go below the figure.
% Use \verb|\includegraphics| for external graphics files but omit the file extension.
% Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
% Do not use \verb|\epsfig| or \verb|\psfig|.
% If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
% For example, see Figure~\ref{fig:pitt}.

% \begin{figure}[!htb]
%   \centering
%   \includegraphics[width=0.7\linewidth]{pitt}
%   \caption{A View of a Nice City.}\label{fig:pitt}
% \end{figure}

% Do not use \verb|\graphicspath|.
% If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

% \subsection{Tables}\label{sec:tables}
% Tables should go in the \texttt{table} environment and be centered therein.
% The caption should go above the table and be in title caps.
% For an example, see Table~\ref{tab:data}.
% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \subsection{Algorithms}\label{sec:algorithms}
% You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
% Use the environment defined in the package to create a centered float with an algorithm inside.

% \section{Back Matter}
% There are a some final, special sections that come at the back of the paper, in the following order:
% \begin{itemize}
%   \item Author Contributions (optional)
%   \item Acknowledgements (optional)
%   \item References
% \end{itemize}
% They all use an unnumbered \verb|\subsubsection|.

% For the first two special environments are provided.
% (These sections are automatically removed for the anonymous submission version of your paper.)
% The third is the ‘References’ section.
% (See below.)

% (This ‘Back Matter’ section itself should not be included in your paper.)


% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% % References
% \bibliography{uai2023-template}



\end{document}
