% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{float}
\usepackage{graphicx}
\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{mathtools}
\usepackage{caption}
\usepackage{subcaption}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage{xr}
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
\myexternaldocument{chen_580-supp}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\yuchen}[1]{{\color{blue}#1}}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{prop}{Proposition}
\newtheorem{assumption}{Assumption}

\allowdisplaybreaks

% \title{ Short-term Temporal Dependency Detection under Heterogeneous Event Dynamic with Hawkes Processes }
\title{Detection of Short-Term Temporal Dependencies in Hawkes Processes with Heterogeneous Background Dynamics }

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,*]{\href{mailto:<albertchenyu@gmail.com>?Subject=Your UAI 2023 paper}{Yu Chen}{}}
\author[1,*]{Fengpei Li}
\author[1]{Anderson Schneider}
\author[1]{Yuriy Nevmyvaka}
\author[2]{Asohan Amarasingham}
\author[3]{Henry Lam}

% Add affiliations after the authors
\affil[1]{%
    Machine Learning Research, Morgan Stanley, New York, NY
}
\affil[2]{%
    Department of Mathematics and Biology, City College and The Graduate Center, City University of New York, New York, NY
}
\affil[3]{%
    Department of Industrial Engineering \& Operations Research, Columbia University, New York, NY
}
\affil[*]{%
    Authors have equal contribution
}

\begin{document}
\raggedbottom

\maketitle

\begin{abstract}
Many kinds of simultaneously-observed \textit{event sequences} exhibit mutually exciting or inhibiting patterns. Reliable detection of such temporal dependencies is crucial for scientific investigation. A common model is the Multivariate Hawkes Process (MHP), whose impact function naturally encodes a causal structure in Granger causality. 
However, the vast majority of existing methods use a transformed  \textit{standard} MHP intensity with a constant baseline, which may be inconsistent with real-world data. 
On the other hand, modeling irregular and unknown background dynamics directly is a challenge, as one struggles to distinguish the effect of mutual interaction from that of fluctuations in background dynamics.
In this paper, we address the short-term temporal dependency detection issue. We show that maximum likelihood estimation (MLE) for cross-impact from MHP has an error that can not be eliminated, but may be reduced by an order of magnitude using a heterogeneous intensity not for the target HP but for the interacting HP. Then we propose a robust and computationally-efficient modification of MLE that does not rely on the prior estimation of the heterogeneous intensity and is thus applicable in a data-limited regime (e.g., few-shot, unrepeated observations). Extensive experiments on various datasets show that our method outperforms existing ones by notable margins, with highlighted novel applications in neuroscience.
\end{abstract}






% {\color{blue}
% Final TODO: \textbf{hard code} any cross-reference to the supplementary material.
% }

\section{Introduction}\label{sec:intro}
% The
% XXX prompt us to build a flexible, extendable, robust, and computationally efficient tool to quantify the weak coupling effect.
% Our proposed point process regression model can be modularized into two basic components, 
% %
% XX not aiming at accurate estimation of the latent background factor but to reduce its .
% We verified the model empirically and theoretically.
% The new method enables us to discover the network between distant but interacting neurons on fine timescale in mouse visual system.
% , as the occurrence of one event can trigger or suppress that of another. 

A substantial amount of timestamp data manifest as a sequence of apparently irregular and asynchronous events. These are recorded in continuous time and observed in domains such as computational biology (e.g., neuronal spike trains \citep{kass2001spike,pillow2008spatio}, genomic events \citep{reynaud2010adaptive}), quantitative finance (e.g., limit order book modeling for high-frequency trading \citep{bacry2015hawkes,bowsher2007modelling}, credit risk modeling \citep{errais2010affine}), social media user activity \citep{farajtabar2015coevolve,zhou2013learning}, e-healthcare \citep{wang2018supervised} and seismology (e.g., earthquake aftershock \citep{ogata1988statistical}). Besides asynchronicity, such sequence data often exhibit mutual interaction patterns in which the occurrence of one event can excite or inhibit the likelihood of another. For example, news-driven trading in behavioral finance studies the mutual excitation between investor-sentiment shocks and negative price jumps \citep{yang2018applications}, while in cortical networks inhibitory connectivity in firing-rate between neurons and synapses may underlie memory maintenance \citep{mongillo2018inhibitory}. Such an interaction pattern has been variously called a \textit{temporal dependency} \citep{zuo2020transformer}, \textit{cross-correlation} \citep{zhang2020self}, a \textit{coupling effect} \citep{pillow2008spatio} or \textit{Granger causality} \citep{xu2016learning}. As \citet{eichler2017graphical} note, although stand-alone notions of Granger causality cannot establish cause-effect links, the detection of temporal dependencies remains useful for both prediction and scientific investigation.

Temporal point processes (TPP)  \citep{cox1980point} are a powerful tool for modeling event sequences. Multivariate Hawkes processes (MHP) \citep{hawkes1971point}, as a special type of TPP, have been widely used as the \textit{de facto} tool for capturing temporal dependencies among event processes (see above, e.g.,\citep{bacry2015hawkes,farajtabar2015coevolve,wang2022hawkes,zuo2020transformer}). An MHP models occurrence probability using a history-dependent conditional \textit{intensity} and its \textit{impact function} (also called \textit{coupling filter}, \textit{trigger kernel}, \textit{influence function}, see \citep{pillow2008spatio, zhouj2021efficient, zhou2013learning}) is particularly well-suited to detect mutual excitatory effects. 
Inhibitory effects can also be incorporated, but some nonlinear link function is required to map the MHP intensity into $\mathbb R^+$ (e.g., notably a clipping function $x^+=\max(x,0)$ in \citep{10.3150/13-BEJ562} or sigmoid function in \citep{zhouj2021efficient}).


Despite the expressiveness of impact functions, the background component in MHP intensity is assumed to be time-invariant. Possibly due to the extra modeling difficulty entailed, virtually all existing studies on MHP use, implicitly or explicitly, a nonlinear transform of the standard MHP intensity with a constant baseline, including modern DL-based methods (e.g., Transformer HP \citep{zuo2020transformer}, HP in infinite relational model or Dirichlet mixture model \citep{blundell2012modelling,xu2017dirichlet}, sigmoid nonlinear MHP with P\'olya-Gamma variable augmentation \citep{zhouj2021efficient}, self-attentive HP and recurrent neural network \citep{zhang2020self}). Notable exceptions which incorporate temporal heterogeneity include \citet{mei2017neural}, a neurally self-modulating HP with LSTM; \citet{zhou2021nonlinear}, where a state-switching latent process is proposed (yet still assuming constant background within each state); and \citet{hawkes2018hawkes}, where the heterogeneous background is briefly discussed as a generalization of MHP to represent ``exogenous economic activity''.

However, real-world event dynamics are often decisively \textit{temporally heterogeneous}. 
For example, Twitter has information bursts spurred by exogenous events (e.g., breaking news or sports games) \citep{wang2022hawkes}, the firing of neurons is commonly driven by varying visual stimuli \citep{siegle2021survey}, and trading activity has a diurnal variation (e.g., more trades occur around market open/close than around noon \citep{bowsher2007modelling}). Under \textit{unknown} heterogeneous dynamics, temporal dependency detection is difficult as one struggles to distinguish the effect of mutual interaction from that of background intensity fluctuation (e.g., did the arrival of orders for stock A stimulate that for stock B, or did they both simply experience a nonlocalized spike in trading activity?).

In this paper, we show that the maximum likelihood estimation (MLE) of short-term temporal dependency detection for standard MHPs has non-negligible errors in the presence of heterogeneous background dynamics. However, this error decreases by an order of magnitude (in terms of impact window or kernel width) if the heterogeneous background between the \textit{target} HP (recipients of the impact) and \textit{source} HP (initiators of the impact) is \textit{uncorrelated} (or \textit{orthogonal} in the Hilbert space sense, $L_2[0,T]$ or $C[0,T]$, where $T$ is observation horizon). Thus, loosely speaking, MHP can still estimate short-term cross-impact reasonably well, unless the heterogeneous intensity between the target HP and source HP shares common/correlated background dynamics. Building on this insight, we propose a robust and computationally-efficient modification of MLE, which utilizes a nonparametric estimate of heterogeneous intensity -- not of the target HP, but of the source HP. By focusing on the background intensity of the source, we reduce the inference difficulty, and the error, due to the coupling between the target HP background and impact function, by regressing the commonly-varying background out of the target HP intensity. 

The contribution of this paper can be summarized as:
\begin{itemize}
    \item To the best of our knowledge, our work is the first to formally report and analyze the error of MLE of short-term temporal dependencies in MHPs due to heterogeneous background dynamics. We investigate the relation between estimation error and background-correlation among interacting HPs, which motivates a novel method to reduce the error.

    \item Through extensive numerical experiments, we show that our method exhibits superior performance and is robust, cost-efficient, applicable in a data-limited regime (e.g., when lacking repeated observations), and suitable for inference.
    
    \item Finally, we apply our method to mouse visual cortex data and discover distant interactions between neurons on a fine timescale in both top-down and bottom-up pathways, showcasing the method's direct applicability in neuroscience.

\end{itemize}


\section{Related Work}
\textbf{Hawkes process.} Many efforts have been devoted to detecting temporal dependency among point processes, e.g., \citep{chwialkowski2014kernel,gunawardana2011model}. Among point processes, Hawkes processes stand out as the most commonly used tool for modeling complex temporal dependencies in event sequences. The paper \citep{eichler2017graphical} established the link between Granger causality and impact functions in MHP and many methods are proposed to learn the temporal dependency in MHP, via group sparsity \citep{xu2016learning}, nonparametric learning using Euler-Lagrange equation \citep{zhou2013learningtrigger}, isotonic nonlinear link function \citep{wang2016isotonic}, online learning \citep{yang2017online} and modern DL-based methods (see intro, \citep{zuo2020transformer,blundell2012modelling,xu2017dirichlet,zhang2020self}). However, these methods use direct or nonlinear transform of \textit{standard} MHP time-invariant base intensity, overlooking the heterogeneity in event dynamics. Notably, \citet{mei2017neural} implicitly allows for heterogeneity. Latent variable augmentation is proposed in \citep{zhouj2021efficient,zhou2021nonlinear,zhou2022efficient,zhou2020efficient} to incorporate the time-varying background, but the modeling of heterogeneity typically relies on piecewise constants. Moreover, most methods are data-intensive (e.g., as reported in \citep{yang2017online}, methods such as \citep{zhou2013learningtrigger} require more than $10^5 d$ arrival data to obtain good results on $d\leq 5$ event streams) and computationally intensive (e.g., MCMC, EM algorithm or complex neural architecture), which makes them unsuitable for inference in data-limited regimes. Indeed, often in practice, only short/unrepeated sequences are available \citep{salehi2019learning}, which not only amplifies the
risk of overfitting but also makes estimation of heterogeneous background infeasible.


\begin{figure}[ht]
\centering
\begin{subfigure}{0.47\textwidth}
\centering
\includegraphics[width=0.99\linewidth]{jitter_fig/jitter_demo.pdf}
\end{subfigure}
\caption{Construction of Monte Carlo samples from the conditional null hypothesis in the conditional-inference based CCG technique. Blue dots are timestamps.
}
\label{fig:jitter_demo}
\end{figure}

\textbf{Conditional inference based cross-correlogram (CCG).} 
Heterogeneous dynamics are ubiquitous in neuroscience \citep{farajtabar2015coevolve}. Due to the limitations of TPP and MHP in this regime, a popular method in neuroscience for detecting temporal dependencies is conditional inference based on cross-correlograms (CCG). Conditional hypothesis testing with a carefully designed null hypothesis can bypass the background heterogeneity issue. Particularly, given realizations of two point processes, CCG assesses temporal dependence between events by testing hypotheses about conditional distributions of CCG-statistics, conditioning on coarse timescale-statistics which reflect background dynamics. As shown in \citep{amarasingham2012conditional}, the method relies on conditional inference, where the samples from the null are generated by shifting the timestamps within each \textit{jitter window} (reflecting prior knowledge on the timescale of interactions) by a random amount, which is small enough to preserve coarse-timescale statistics, but large enough to break the finely-timed interaction pattern (see Figure \ref{fig:jitter_demo}). However, this method requires prior knowledge of timescales and assumes that the timescale of background activity (parameterized as the jitter window width) is larger than that of the interaction effect (see Figure \ref{fig:sim_demo} and the discussion below). Additional details are given in Appendices A.3 and C.6.
Also, the outcome of the hypothesis test alone does not measure the strength of the coupling effect directly.






%However, \textit{standard}/\textit{vanilla}  Hawkes model typically treat the background as a constant.
%When the coupling effect is estimated by the typical model while processes are driven by underlying correlated background activities, it will have non-negligible error due to misspecification.
%For dynamic background activities without repeated observations, it is almost impossible to have a good estimation of the non-constant background component.
%Our proposed point process regression model can overcome these challenges and achieve small error, higher sensitivity, less assumptions about timescale, and small computational complexity.


% Our method offers an alternative idea to the current conditional hypothesis testing framework, such as jitter-based method \citep{amarasingham2012conditional}, in detecting point-to-point coupling effects, and it outperforms the current tool in terms of higher sensitivity, less assumption about timescale, better model selection, and easier computation. 





\section{Analysis and Methods }
\subsection{Basic Concepts}
A temporal point process is a stochastic process whose realization consists of a list of discrete event timestamps $\{t_n\}_{n\in\mathbb N}\subseteq \mathbb R^+$, which can be equivalently represented by a counting process $\{N(t), 0\leq t\leq T\}$ \citep{daley2008introduction}. Formally, given a probability triple $(\Omega, \{\mathcal H_t\}_{0\leq t\leq T},\mathbb P)$, $N(t):=N((0,t],\omega) $ is a realization (i.e., $\omega\in\Omega$) of counting measure $N$ for the number of points in $(0,t]$ and $\mathcal H_t$ is the $\sigma$-algebra generated from $N(B)$ for Borel subsets $B\subseteq(0,t]$ ( or $(-\infty,t]$, we do not distinguish them here). The intensity of the point process is
$\lambda(t):=\lim_{\delta\to 0}\frac{1}{\delta} \mathbb P(N(t+\delta)-N(t)>0|\mathcal H_t)$. It can be shown (see \citep{ogata1978asymptotic}) for $\mathcal H_t$-progressively measurable $\lambda(t),f(t)$ with left continuous (thus predictable) $f(t)$ that $\mathbb E[dN(t)|\mathcal H_t]=\lambda(t)dt$ and
\begin{align}\label{ogte}
    \mathbb E\int_0^T f(t)dN(t) = &\mathbb E\int_0^T f(t)\mathbb E[dN(t)|\mathcal H_t] \nonumber \\
   =&\mathbb E\int_0^T f(t)\lambda(t)dt.
\end{align}
%
For the multivariate Hawkes process, the intensity has the form
\begin{equation}\label{standardmhp}
    \lambda_j(t) = \alpha_j +\sum_{i=1}^d \int_0^t h_{i\to j}(t-s)dN_i(s)
\end{equation}
for $1\leq i,j\leq d$, where $d$ is the dimension (number of event streams), $\alpha_j$ is the \textit{baseline} intensity for process $N_j$ and $h_{i\to j}$ is the impact function from $N_i$ to $N_j$. Standard MHP models mutual excitatory behavior and requires $h_{i\to j}\geq 0$ to avoid negative intensity which is meaningless. However, one can simply set $\lambda \leftarrow \max(\lambda,0)$ \citep{10.3150/13-BEJ562} to extend MHP for modeling mutual inhibitory behavior.

\subsection{Heterogeneous event dynamics}

The standard MHP assumes the baseline intensity $\alpha$ to be a constant in \eqref{standardmhp}, which is incongruous with the heterogeneous event dynamics frequently observed in real-world scenarios. To accommodate heterogeneity, instead of using \eqref{standardmhp} as building blocks to construct a complex structure, we directly propose a generalized MHP intensity for $1\leq i,j \leq d$:
\begin{equation}\label{gMHP}
    \lambda_j(t) = \alpha_j +f_j(t)+\sum_{i=1}^d \int_0^t h_{i\to j}(t-s)dN_i(s)
\end{equation}
where $f_j(t)$ is the fluctuation in the background intensity. For now, we do not restrict whether $f_j$ is stochastic or deterministic, but simply assume it is $\mathcal H_t$-adapted. For identifiability between $\alpha$ and $f$, we assume $\int_0^T\mathbb E[f(t)]dt=0$ (or more generally $\int_0^P f(t)dt=0$ if it is deterministic and periodic with period $P$, or $\mathbb E[f]=0$ if $f(t)$ is stationary).

The main approaches for learning MHP fall under two directions: maximum likelihood-based (MLE) approaches \citep{ogata1978asymptotic,zhou2013learning,yang2017online} and moment-matching flavored approaches based on higher-order statistics \citep{da2014hawkes}. Due to the unknown statistical property of $f$, the moment-based methods are not applicable for \eqref{gMHP}. To investigate the applicability of the MLE approach for \eqref{gMHP}, we study a representative model for subsequent discussion. However, we emphasize that our proposed method applies generally to models from \eqref{gMHP}.


\subsection{Representative Model} \label{sec:representatie_model}
Consider two point processes $N_i$, $N_j$ as shown in Figure \ref{fig:couple_process_diagram}. 
The intensity functions are,
\begin{equation}
\begin{aligned}
\lambda_{j}(t)
&= \alpha_j
+ f_{j}(t) 
+ \int_0^t h_{i\to j}(t-s) dN_{i}(s) \\
%
% \lambda_{i}(t | \mathcal{H}_{t}) 
% &= \alpha_i
% + f_{i}(t)
% + \int_0^t h_{j\to i}(t-\tau) N_{j}(\mathrm{d} \tau) 
\lambda_{i}(t) 
&= \alpha_i
+ f_{i}(t)
\end{aligned}
\label{eq:true_model}
\end{equation}





\label{subsec:jitter_model}
\begin{figure}[ht]
\centering
\begin{subfigure}{0.48\textwidth}
\centering
\includegraphics[width=0.6\linewidth]{jitter_fig/jitter_regression_model_one_dir.pdf}
\end{subfigure}
\caption{ Illustrative MHP with a heterogeneous background. Two event streams $N_i, N_j$ with intensities $\lambda_i(t), \lambda_j(t)$, baselines $\alpha_i+f_{i}(t), \alpha_j+f_{j}(t)$ and the one-way impact function $h_{i\to j}$. }
\label{fig:couple_process_diagram}
\end{figure}


where $h_{i\to j}$ is the impact function and $f_{i}(t), f_{j}(t)$ are unknown fluctuations. There are various methods of learning 
the form $h_{i\to j}$ with data-driven and nonparametric techniques \citep{zhou2013learningtrigger,xu2016learning,yang2017online}. To facilitate the discussion of MLE, we assume the form of impact has been learned within a 1-D parametric family $h_{i\to j}(\cdot)\in\{\theta\cdot \mathbf{1}_{[0,\sigma_h]}(\cdot)\}_{\theta\in\Theta}$ which is widely applied in neuroscience (here, $\mathbf{1}_{[0,\sigma_h]}(t) = 1$ if $0\leq t \leq \sigma_h$ and 0 otherwise). We set the ground truth impact to be $h_{i\to j}=c\cdot \mathbf{1}_{[0,\sigma_h]}$ for a given $c>0$. Moreover, we focus on the recovery of the impact function (i.e., estimation of $c$) and treat other parameters as \textit{nuisance} parameters, as in \textit{profile likelihood} \citep{murphy2000profile}.

In MHP \eqref{standardmhp}, not considering $f_j$, one  parameterizes $\lambda_j$ as
\begin{equation}\label{like}
    \lambda_{\boldsymbol\theta }(t) = \theta_1 + \theta_2\int_0^t\mathbf{1}_{[0,\sigma_h]}(t-s)dN_i(s),
\end{equation}
which is misspecified, and minimizes the negative log-likelihood:
\begin{align*}
   \hat{\boldsymbol{\theta}}=&\operatorname*{argmin}_{\boldsymbol\theta }\ell({\boldsymbol\theta }; \mathcal H_T),\\
   \ell({\boldsymbol\theta }; \mathcal H_T):=& \int_0^T \lambda_{\boldsymbol\theta }(t)dt-\int_0^T \log\lambda_{\boldsymbol\theta }(t)dN_j(t),
\end{align*}
see, e.g., \citep{ogata1978asymptotic}. In the misspecified model, one would expect $\hat{\boldsymbol{\theta}}$ converges to $\boldsymbol\theta_{KL} $, the minimizer in KL-divergence information criterion \citep{white1982maximum}:
\begin{equation*}
    \boldsymbol\theta_{KL} =\operatorname*{argmin}_{\boldsymbol\theta }\Lambda(\boldsymbol\theta):=\mathbb E\ell({\boldsymbol\theta }),
\end{equation*}
under suitable regularity conditions, including $\mu$-strong convexity and $L$-Lipschitz gradient of $\Lambda$. We want to quantify the error between $[\boldsymbol\theta_{KL}]_2$ and $c$. We list technical conditions in Appendix B, along with proofs for the following results.
\begin{prop}\label{prop1}
    Under regularity conditions specified in Appendix B, for deterministic $f_i$ and $f_j$ in \eqref{gMHP}, the error satisfies
    \begin{equation*}
        |[\boldsymbol\theta_{KL}]_2-c|=\Theta\bigg(\bigg|\int_0^T\frac{ f_i(t)f_j(t)}{\alpha_j+c} dt\cdot \sigma_h+ o(\sigma_h)\bigg|\bigg).
    \end{equation*}
\end{prop}
\begin{prop}\label{prop2}
    Under the same condition as in Proposition \ref{prop1}, if $f_i$ and $f_j$ are stationary, the error satisfies 
    \begin{equation*}
        |[\boldsymbol\theta_{KL}]_2-c|=\Theta\bigg(\bigg|\frac{\text{Cov}(f_i,f_j)}{ \alpha_j +c}\sigma_h + o(\sigma_h)\bigg|\bigg).
    \end{equation*}
\end{prop}
where the big-$\Theta$ notation stands for a growth function with the same rate in upper and lower bound, i.e., $f(x)=\Theta(g(x))$ if there exist constants $0< m\leq M$ such that $mg(x)\leq f(x)\leq Mg(x), \forall x$.

Propositions \ref{prop1} and \ref{prop2} suggest that, under heterogeneous event dynamics, the error in estimating the impact function scales linearly with $\sigma_h$, with the coefficient determined by the ``inner product'' between $f_i$ and $f_j$. In fact, if we define $\langle f_i,f_j\rangle = \mathbb E\int_0^T f_i(t)f_j(t)dt$, then we can unify (and generalize to a non-stationary case) the results in Propositions \ref{prop1} and \ref{prop2}. We see that, for short-term temporal dependency detection ($\sigma_h \to 0$), the ratio between estimation error and interaction timescale $\sigma_h$ is non-vanishing and non-negligible unless the two HPs have \textit{uncorrelated} background ($\langle f_i,f_j\rangle =0$).


How could one reduce the order of this error term? The most natural way is to observe or estimate $f_j$ directly. Indeed, given access to $f_j$, MLE is no longer misspecified. However, as discussed in \citep{zhou2020efficient}, the ``exogenous'' component (the baseline intensity) and the ``endogenous'' component (the impact function) are ``coupled'' in the likelihood, which hampers inference. In \citep{zhou2020efficient}, a \textit{branching} structure is used to decouple these two components in HP, which does not apply to MLE because when the same, typically limited data are used to estimate both $f_j$ and $h_{i\to j}$, the results are generally unreliable (indeed, a naive use of MLE for fitting both would result in delta measures around the event timestamps for $N_j$). However, since the correlation between $f_i$ and $f_j$ results in a large error, one may ask whether estimation of $f_i$, or entities highly correlated with $f_i$, could help regress the commonly varying intensity out of $f_j$. Indeed, we have the following:
\begin{prop}\label{prop3}
    Under the same condition as in Proposition \ref{prop2}, let $r:=\max\{\|f_i-\mathbb Ef_i\|_\infty,\|f_j-\mathbb Ef_j\|_\infty\}$. If we have access to $g = \frac{ f_i-\mathbb E[f_i]}{\sqrt{\text{Var}(f_i)}}$ (i.e., a normalized basis for $f_i$) in the likelihood \eqref{like}, so that one parameterizes
    \begin{equation}\label{like2}
    \lambda_{\boldsymbol\theta }(t) = \theta_0 + \theta_1 g+\theta_2\int_0^t\mathbf{1}_{[0,\sigma_h]}(t-s)dN_i(s),
\end{equation}
then 
\begin{align*}
    [\boldsymbol\theta_{KL}]_1  =& \mathbb E[ (f_j-\mathbb E[f_j])g] +o(r^2+\sigma_h), \\
    [\boldsymbol\theta_{KL}]_2  = & c + o(r^2+\sigma_h).
\end{align*}
\end{prop}

Although we cannot directly observe $f_i$, Proposition \ref{prop3} suggests that using $f_i$ as a basis may reduce the error. Moreover, the form of $[\boldsymbol\theta_{KL}]_1 \approx \langle f_j,g\rangle$ also suggests modifying the MLE by projecting $f_j$ onto a basis built from $f_i$.


\subsection {Proposed Method}
Inspired by the analysis above, we now propose our modification for estimating impact. In particular, we minimize the following expression modified from the likelihood function $\tilde{\ell}$:
\begin{gather}
\min_{h_{i\to j}, \beta_j, \beta_w, \sigma_w } 
\left\{
-\sum_{s \in N_j } \log \tilde{\lambda}_{j}(s)
+ \int_0^T \tilde{\lambda}_{j}(s) \mathrm{d} s  \right\} 
\label{eq:target_likelihood} \\
%
\tilde{\lambda}_{j}(t) := 
\Big( \beta_j
+ \beta_w \; \overline{\textbf{s}_{i}}(t) 
+ \int_0^t h_{i\to j}(t-s) dN_{i}(s) 
\Big)_+
\label{eq:regression_intensity} \\
\overline{\textbf{s}_{i}}(t)
= \int_0^T W(t-s;\sigma_w) dN_i(s)
% = \sum_{t_m \in N_i} W(t - t_m)
\label{eq:mean_coarsen_regressor}
\end{gather}
where $\overline{\textbf{s}_{i}}$ can be regarded as the coarsened point process smoothed by a Gaussian kernel 
$W(\tau;\sigma_w)=\frac{1}{\sqrt{2\pi\sigma_w^2}} \exp(-\frac{\tau^2}{2\sigma_w^2})$ with scale $\sigma_w$, serving as a substitute basis for $f_i$. We also specify an algorithm that can be implemented in continuous time, which does not require one to discretize the time points \citep{eden2008continuous, foufoula1986continuous}, so that the memory requirement is proportional to the number of time points instead of the number of time bins. The optimization algorithm is detailed in Appendix A. Empirical and theoretical analysis of the estimator will be discussed in section \ref{subsec:simulation_study}.
% also see \cite[example 8.5(a)]{daley2003introduction}. 


\subsection{Other use cases of the method} \label{subsec:use_cases}
Before experiments, we present some generality in the application of the method, with details left to Appendix D.

\paragraph{Hypothesis testing } see Appendix D.2. 
We compare our model with conditional inference via CCG and standard MHP, in hypothesis testing. Both our model and CCG have proper uniform p-value distribution under the null of no interaction \citep[Theorem 10.14]{wasserman2004all}, where the standard MHP fails. Moreover, our method is also more powerful/sensitive at detecting weak signals with small sample sizes, see Figure \ref{fig:sim_demo}. Figure \ref{fig:sim_demo} shows a simulation example of fine timescale interaction between two point processes.
Synthetic data is generated by HP with one process inhibiting the other and a common fluctuating background in Figure \ref{fig:sim_demo}A.
Figure \ref{fig:sim_demo}B is the result of the conditional inference via cross-correlogram (CCG).
The curve is mostly in the negative region indicating some inhibitory influence, yet the majority part of the curve stays within the acceptance band (i.e., not statistically significant).
Figures \ref{fig:sim_demo}C and \ref{fig:sim_demo}D show the results of the standard MHP and our method, respectively, where the impact function is represented as a function of the lag period. As shown, our method accurately detects the inhibitory relation and the estimated impact function is close to the true function (red curve), improving on CCG in statistical power and on the
standard MHP in terms of error.
A similar observation in real data will be shown in Figure \ref{fig:neural_demo}.
A more detailed comparison between these models is in Appendix D.2.

\begin{figure}[ht]
\centering
\begin{subfigure}{0.48\textwidth}
\centering
\includegraphics[width=0.9\linewidth]{jitter_fig/sim_demo.pdf}
\end{subfigure}
\caption{Impact function estimation with background fluctuation in simulation.
\textbf{A}: Shared background intensity.
\textbf{B}: CCG-based conditional inference. The 95\% acceptance band is constructed using Monte Carlo samples from the null distribution.
\textbf{C}: Standard MHP. The red curve is the ground truth.
\textbf{D}: Our model.
The band in C and D is also 95\% pointwise CI.
}
\label{fig:sim_demo}
\end{figure}




\paragraph{Non-parametric fitting for the impact function} see Appendix D.1.
Our method does not have constraints on modeling the impact function, which can be easily extended to non-parametric fitting.
One option is the general additive model using splines \citep[ch. 5]{pillow2008spatio, hastie2009elements}.
By leveraging the integral trick (Appendix A), time points do not need to be discretized and computational cost is small.


\paragraph{Bayesian inference} See Appendix D.3.
The method can be adapted for Bayesian inference where the uncertainty of the smoothing kernel width $\sigma_w$ is evaluated using a sampling-based inference algorithm.
The simulation shows that incorporating the uncertainty of  $\sigma_w$ does not affect the estimation of the temporal dependency significantly.

% As will be shown in section \ref{subsec:background_kernel_smoothing},
% the estimated $\sigma_w$ reflects the timescale of the background activity.


% and the nuisance variable $\overline{\textbf{s}_{i}}(t)$ in \eqref{eq:mean_coarsen_regressor} can be interpreted as an approximation of the background activity.



\section{Experiments }
In this section, we empirically verify the method through multiple simulation studies,
then apply the new tool to the neuroscience dataset where we discover a network of interacting neurons on a fine timescale.
For simulations, continuous-time point processes are generated using Lewis' thinning algorithm \citep{lewis1979simulation, ogata1981lewis}.
The gradient descent-based optimization algorithm is in Appendix A.
Our code is available at
\url{https://github.com/AlbertYuChen/point_process_coupling_public}.


\subsection{Simulation study } \label{subsec:simulation_study}
% \yuchen{Take the comparison between ours, typical Hawkes, and conditional inference from Appendix? }


\subsubsection{Toy Example with background fluctuation } \label{subsec:background_artifacts}
In this synthetic dataset, the dynamic baselines have a known form so that their correlation, or the ``inner product'' between the source and target processes, as discussed in Section \ref{sec:representatie_model}, can be calculated in closed form.
The background activities are
$f_i(t) = A \sin(2\pi (t - \phi_{\mathrm{rnd}})), 
f_j(t) = A \sin(2\pi (t -  \phi_{\mathrm{rnd}} - \phi_{\mathrm{lag}}))$,
where $A$ is the amplitude,
$T$ is the length of the trial.
We sample $\phi_{\mathrm{rnd}}\sim \mathrm{Unif}(0, 1)$ and let it vary from trial to trial so that the same background is never repeatedly observed. Here
$\phi_{\mathrm{lag}}$ controls the correlation between $f_i,f_j$, which we quantify using the \textit{normalized} dot product
$\langle f_i, f_j \rangle
:= \frac{1}{T A^2} \int_0^T f_i(s) f_j(s) \mathrm{d}s$.
When $\phi_{\mathrm{lag}}= 0$ and $0.5$, the dot product achieves the largest positive and negative value respectively;
when $\phi_{\mathrm{lag}}= 0.25$, the dot product is zero.

For the problem we are considering, short-term temporal dependency detection with dynamic background, there really is no ``state-of-the-art'' model, as we are not mainly interested in predicting future observations; rather, we aim at gaining insight into the relationship between features and responses for scientific discovery, which is a more challenging task \citep{fan2020statistical}.
Although many recent point process models, such as those of \citet{mei2017neural, zhang2020self, zuo2020transformer}, are designed for the prediction task,
one popular representative deep learning-based model by \citet{mei2017neural} using recurrent neural networks is included as the baseline model.
The performance of three models is compared: standard MHP, our model, and Neural Hawkes \citep{mei2017neural}.
Some other deep learning models are not considered due to the convoluted black-box structure.
For example in \citep{zhang2020self}, the intensity function is
\begin{equation*}
\begin{aligned}
\lambda_i(t)=&\text{softplus}(\mu_{u,i+1}+ \\ 
& (\eta_{u,i+1}-\mu_{u,i+1})\exp(-\gamma_{u,i+1}(t-t_i))),
\end{aligned}
\end{equation*}
where the variables $\mu,\eta,\gamma$ are all functions of latent variables obtained
through attention network. Another example is \citep{zuo2020transformer}, where the intensity function is
\begin{equation*}
    \lambda_k(t)=f_k(\alpha_k\frac{t-t_j}{t_j}+\boldsymbol w_k^T \boldsymbol h(t_j)+b_k),
\end{equation*}
where $t_j$ is the last event (not necessarily of type $k$) and $\boldsymbol h$ is the latent variable that carries more history information extracted from transformers.
Just by observing the intensity forms above, one realizes that it is very difficult to draw inference on the coupling effect from these models, which are designed for event sequence prediction.
The method in \citep{mei2017neural} is the simplest framework we found where one can split out the coupling effect with minimum modification of the model.


The impact function is the square window impact function with a given width, so only the amplitude needs to be estimated.
Neural Hawkes takes intervals of the superimposed point processes one by one in sequence. The impact function from source to target is modeled as
\begin{gather}
\boldsymbol{c}(t) = 
\bar{\boldsymbol{c}}_{i+1} + 
(\boldsymbol{c}_{i+1} - \bar{\boldsymbol{c}}_{i+1}) 
\mathbb{I}_{[0,\sigma_h]}(t-t_i^{\mathrm{source}}), \\
%
\boldsymbol{h}(t) = \boldsymbol{o}_i \odot \mathrm{tanh}(\boldsymbol{c}(t)), \\
\lambda_{\mathrm{target}}
= \left(\boldsymbol{W}_{\mathrm{target}}^T \boldsymbol{h} \right)_+,
\end{gather}
which is slightly modified for the context (original kernel in \citep{mei2017neural} is exponential).
The impact function is extracted from the model (the original model does not directly offer an estimated parameter) as
$h_{\mathrm{source}\to \mathrm{target}}(t)
= \boldsymbol{W}_{\mathrm{target}}^T
\left[ \boldsymbol{o}_i \odot \mathrm{tanh}\left(
(\boldsymbol{c}_{i+1} - \bar{\boldsymbol{c}}_{i+1})
\mathbb{I}_{[0,\sigma_h]}(t) \right) \right]
$, which could capture a time point's impact on the intensity.
Instead of modeling multiple points in the history at once as in the standard MHP, Neural Hawkes uses a non-linear mapping that only receives the last interval, while the history effect is carried over by $\boldsymbol{c}_{i+1}$, $\bar{\boldsymbol{c}}_{i+1}$, and $\boldsymbol{o}_{i}$ through a recurrent neural network. The results are shown in Figure \ref{fig:bias_comparison}, and
details are left to Appendix C.1.

\begin{figure}[ht]
\centering
\includegraphics[width=0.6\linewidth]{jitter_fig/bias_vs_dotprod_3model.pdf}
\caption{ A comparison of impact function estimation between standard MHP, Neural Hawkes, and our model under dynamic background.
The confidence band is created from 100 simulations.
}
\label{fig:bias_comparison}
\end{figure}

As shown, the bias of standard MHP is nearly linearly correlated with the dot product, as suggested by theoretical analysis. The error of Neural Hawkes is less susceptible to this correlation, which corroborates the ability of a recurrent structure to capture the interaction effect despite dynamic background. However, the error and variance of the impact estimation from Neural Hawkes are visibly non-negligible. This is likely due to the fact that neural network models typically need large datasets for training.
In contrast, our model performs satisfactorily in this example.


\subsubsection{Background kernel smoothing } \label{subsec:background_kernel_smoothing}
The kernel-smoothed basis in \eqref{eq:mean_coarsen_regressor} plays a key role in our method.
This section studies the relationship between the kernel width and the error of the estimator. In special cases, we are able to approximate the behavior of the estimator with an analytical formula.
Following the model framework in \eqref{eq:true_model}, we assume the background activity is generated similarly to the \textit{linear Cox process} \citep{diggle1985kernel} or the \textit{cluster process} \citep[Definition 6.3.I.]{daley2003introduction}:
\begin{equation}
f_{i}=f_{j}:= 
\sum_i \phi_{\sigma_I}(t - t^c_i)
\label{eq:linear_cox_main}
\end{equation}
where $\phi_{\sigma_I}(\cdot)$ is some positive and even function, i.e., $\phi_{\sigma_I}(\cdot) >0$ and $ \phi_{\sigma_I}(\tau) = \phi_{\sigma_I}(-\tau)$. Here
$t^c_i$ are the centers of the windows generated by a Poisson process with intensity $\rho$.
$f_{i}$ is second-order stationary with a \textit{reduced covariance density} defined as follows (also see Appendix E).
\begin{equation}
\begin{aligned}
\breve{c}_{\Lambda}(u) :=& 
\mathbb{E}[f_{i}(x) f_{i}(x + u)] 
- \mathbb{E}[f_{i}(x)] \mathbb{E}[f_{i}(x + u)] \\
=& \rho [\phi_{\sigma_I} \ast \phi_{\sigma_I}](u) \\
%%
%%
\breve{c}_{N}(u) :=& 
\mathbb{E}\left[ 
\frac{d N_{i}( x) d N_{i}(x + u) }{(\mathrm{d}t)^2} \right]
 - \mathbb{E}\left[\frac{d N_{i}(x)}{\mathrm{d}t}\right] 
 \mathbb{E}\left[\frac{d N_{i}(x+u)}{\mathrm{d}t}\right]  \\
=& \rho \cdot [\phi_{\sigma_I} \ast \phi_{\sigma_I}](u)
+ (\rho+\alpha_i) \delta(u)
%%
%%
% \breve{c}_{N\Lambda,2}(u) :=& 
% \mathbb{E}\left[ 
% \frac{N_{i}(\mathrm{d} x) f_{i}(x + u) }{\mathrm{d}t} \right] \\
% & - \mathbb{E}\left[\frac{N_{i}(\mathrm{d}x)}{\mathrm{d}t}\right] 
%  \mathbb{E}\left[ f_{i}(x+u) \right]  \\
% =& \rho [\phi_{\sigma_I} \ast \phi_{\sigma_I}](u) 
% = \breve{c}_{\Lambda,2}(u)
\end{aligned}
\end{equation}
which describes the smoothness of background activity, and
$\alpha_i$ is the constant in \eqref{eq:true_model}.
If adjacent points with lag $u$ have larger covariance $\breve{c}_{\Lambda}(u)$, the background would be smoother.
The impact functions are
% $h_{i\to j}(\tau) = \alpha_{i\to j} \mathbb{I}_{[0,\sigma_h]}(\tau)$,
$h_{i\to j}(t) = \alpha_{i\to j} h(t)$, with amplitude to be fitted, for example
$h(t) = \mathbb{I}_{[0,\sigma_h]}(t)$.
Then the error in model \eqref{eq:target_likelihood} may be approximated as,
\begin{equation}
\mathrm{error}(\hat\alpha_{i\to j}) \approx 
\frac{ 
\langle W, W \rangle_{\breve{c}_{N}} 
    \langle h, \mathbf{1} \rangle_{\breve{c}_{\Lambda}} 
- \langle h, W \rangle_{\breve{c}_{N}} 
    \langle W, \mathbf{1} \rangle_{\breve{c}_{\Lambda}}
}{
\langle W, W \rangle_{\breve{c}_{N}} 
\langle h, h^- \rangle_{\breve{c}_{N}}
- \langle W, \mathbf{1} \rangle_{\breve{c}_{\Lambda}}^2
}
\label{eq:bias_formula}
\end{equation}
% \yuchen{Add little o notation to this equation.}
where $\mathbf{1}$ is the constant function and
$h^-(\tau)=h(-\tau)$.
The special inner products here are defined as
$\langle g_1, g_2 \rangle_{\breve{c}} 
:= \int [g_1 \ast g_2](s) \breve{c}(s) \mathrm{d}s$ with $\ast$ denoting the convolution.
The derivation of the analytical formula is in Appendix E. Simulation and analytical results are presented in Figure \ref{fig:kernel_smoothing_eg}.
%
\begin{figure}[ht]
\centering
\includegraphics[width=0.8\linewidth]{jitter_fig/estimator_bias_likelihood.pdf}
\caption{ Error and likelihood of the estimator as functions of background smoothing kernel width $\sigma_w$ in \eqref{eq:mean_coarsen_regressor}. Numerical and theoretical results as in \eqref{eq:bias_formula} are shown in blue and dark respectively. The error of standard MHP is the blue dot on the right.
}
\label{fig:kernel_smoothing_eg}
\end{figure}
%
The error and log-likelihood are plotted as functions of the smoothing kernel width $\sigma_w$ in \eqref{eq:mean_coarsen_regressor}.
The MLE, indicated by the vertical line in Figure \ref{fig:kernel_smoothing_eg}, achieves a small error that agrees with the example in section \ref{subsec:background_artifacts}.
Interestingly, when the kernel width is too small or too large, including the theoretical limits by taking $\sigma_w \to 0$ or $\sigma_w \to\infty$, the model fails under heterogeneity. In this case, the error is close to that of standard MHP.
Details are in Appendix C.2.


\begin{figure}[ht]
\centering
\includegraphics[width=0.8\linewidth]{jitter_fig/bias_tune_sigma_I_sigma_h.pdf}
\caption{Error function with different background timescales $\sigma_I$ (top) or coupling effect timescales $\sigma_h$ (bottom).
Numerical results match the theoretical results well, so only theoretical results are presented according to \eqref{eq:bias_formula}.
}
\label{fig:bias_tuning}
\end{figure}

In Figure \ref{fig:kernel_smoothing_eg},
when $\sigma_w$ is between 20 ms and 120 ms, the error can be negative.
The error as a function of the background smoothing kernel has two roots. The roots are related to the timescale of the coupling effect $\sigma_h$ and the timescale of the background $\sigma_I$ as in \eqref{eq:linear_cox_main}.
In Figure \ref{fig:bias_tuning},
if $\sigma_I$ increases, the root on the right, corresponding to the MLE, will move toward the right, as the background smoothing kernel $W$ captures the fluctuation of the background.
If $\sigma_h$ increases, the root on the left will move toward the right.
This can be intuitively interpreted by \eqref{eq:bias_formula}.
Let $W_h$ be the kernel with $\sigma_w \approx \sigma_h$, then
$\langle W_h, W_h \rangle_{\breve{c}_{N}}
    \langle h, \mathbf{1} \rangle_{\breve{c}_{\Lambda}}
\approx \langle h, W_h \rangle_{\breve{c}_{N}}
    \langle W_h, \mathbf{1} \rangle_{\breve{c}_{\Lambda}}$.
So $\sigma_w = \sigma_h$ is close to the root of \eqref{eq:bias_formula}.
Changing the amplitude of the impact function $\alpha_{i\to j}$ in a certain range does not influence the bias curve.
More details are in Appendix C.3.


\subsubsection{Two-way cross connections and self-connections }
The model in Figure \ref{fig:couple_process_diagram} only shows one cross-connection $i\to j$.
This simulation scenario includes the most general MHP setting, with two-way cross-connections between processes ($i\to j$ and $j\to i$) and self-connections ($i\to i$ and $j\to j$). The comparison between our model and standard MHP is in Table \ref{tab:full_model_results}. Details of the experiment are in Appendix C.4.
Our model considerably outperforms the standard MHP in estimating cross-impact connections.
However, both models perform poorly on self-connection estimation, as the self-connections are treated as nuisance parameters in our method.

% \begin{table}[H]
% \centering
% % {\scriptsize
% \begin{tabular}{cccclcc}
% \cline{1-3} \cline{5-7}
%  & \multicolumn{2}{c}{Our model} &   & \multicolumn{2}{c}{Standard Hawkes} \\ \cline{1-3} \cline{5-7}
%  & $i$ & $j$ &  &  & $i$ & $j$ \\ \cline{1-3} \cline{5-7} 
% $i$ & $1.70(0.18)$ & $\textbf{0.21}(0.14)$  & 
%     & $i$ & $2.39(0.18)$ & $2.39(0.19)$  \\ \cline{1-3} \cline{5-7} 
% $j$ & $\textbf{0.22}(0.15)$ & $1.66(0.18)$ &  
%     & $j$ & $2.40(0.19)$ & $2.39(0.18)$ \\ \cline{1-3} \cline{5-7} 
% \end{tabular}
% % }
% \caption{Comparison between our model and standard MHP model in full connection task.
% Rows are source nodes, columns are target nodes.
% Each cell shows the mean absolute error with standard deviation.
% Unit in spikes/sec.
% }
% \label{tab:full_model_results}
% \end{table}

\begin{table}[H]
\centering
% \resizebox{0.49\textwidth}{!}{%
% \begin{tabular}{cccclcc}
% \cline{1-3} \cline{5-7}
%  & \multicolumn{2}{c}{Our model} &  &  & \multicolumn{2}{c}{Standard Hawkes} \\ \cline{1-3} \cline{5-7}
%  & $i$ & $j$ &  &  & $i$ & $j$ \\ \cline{1-3} \cline{5-7} 
% $i$ & $1.70(0.18)$ & $\textbf{0.21}(0.14)$  & 
%     & $i$ & $2.39(0.18)$ & $2.39(0.19)$  \\ \cline{1-3} \cline{5-7} 
% $j$ & $\textbf{0.22}(0.15)$ & $1.66(0.18)$ &  
%     & $j$ & $2.40(0.19)$ & $2.39(0.18)$ \\ \cline{1-3} \cline{5-7} 
% \end{tabular}
% }
\begin{tabular}{ccccc}
\hline
 & \multicolumn{2}{c}{Our model} & \multicolumn{2}{c}{Standard Hawkes} \\ \hline
 & $i$ & $j$ & $i$ & $j$ \\ \hline
$i$ & $1.70(0.18)$ & $\textbf{0.21}(0.14)$ & $2.39(0.18)$ & $2.39(0.19)$ \\ \hline
$j$ & $\textbf{0.22}(0.15)$ & $1.66(0.18)$ & $2.40(0.19)$ & $2.39(0.18)$ \\ \hline
\end{tabular}%
\caption{Comparison between our model and standard MHP model in full connection task.
Rows are source nodes, columns are target nodes.
Each cell shows the mean absolute error with standard deviation.
Unit in spikes/sec.
}
\label{tab:full_model_results}
\end{table}



\subsubsection{Multivariate Hawkes model}
It is natural to extend our bivariate regression-type method to a multivariate regression-type model.
The coupling effect in multivariate processes can be regarded as a form of graph structure recovery in graphical models, where each point process is considered a node.
From this perspective, e.g., \citep[sec. 19.4.4]{meinshausen2006high, murphy2012machine}, multivariate regression extends the bivariate case by studying \textit{pairwise}
conditional relations for all possible pairs.
More specifically, given a pair of random variables $X,Y$, let $Z$ represent the totality of all other random variables excluding $X,Y$. The multivariate regression infers if a \textit{bivariate} relation $X\perp Y|Z$ holds, also known as the \textit{global Markov property} \citep{koller2009probabilistic}.
A similar concept in standard MHP can be found in \citep{eichler2017graphical}.
In our MHP setting, this is equivalent to estimating the impact functions between $N_i$ and $N_j$ given the observations of all other processes, so that their effect enters as the dynamic background. Notice that the standard MHP cannot model this extension because even if the baseline intensity of each point process is constant, the totality of random effect from all other nodes excluding two nodes will not necessarily give a constant baseline to the nodes under consideration.
Consider the intensity function in the multivariate point process,
\begin{equation}
\begin{aligned}
\lambda_{j}(t)
&= \alpha_j
+ \int_0^t h_{i\to j}(t-s) dN_{i}(s) \\
&\underbrace{+ f_{j}(t) 
+ \sum_{r\neq i, j} \int_0^t h_{r\to j}(t-s) dN_{r}(s) 
}_{\tilde{f}_j(t) }
\end{aligned}
\label{eq:multivariate}
\end{equation}
where $f_j$ together with input from other processes are treated as a new background $\tilde{f}_j(t)$. This perspective exactly reduces the MHP to model \eqref{eq:regression_intensity}.

The performance of the model is evaluated using a simulated dataset involving 6 processes, all of which are driven by fluctuating backgrounds. The coupling effects between nodes can be positive, negative or zero.
Table \ref{tab:multivariate} shows that our method outperforms the standard Hawkes model in the multivariate-process scenario.
Details of the experiment are in Appendix C.9.
%
%
\begin{table}[H]
\centering
\begin{tabular}{ccc}
\hline
 & Bias (std)  & RMSE (std) \\ \hline
Hawkes & 1.52 (0.040)  & 1.54 (0.41) \\
Ours & \textbf{0.028} (0.040)  & \textbf{0.25} (0.33) \\ \hline
\end{tabular}
\caption{Comparison between the standard Hawkes model and our model. The unit is [spikes/sec]. }
\label{tab:multivariate}
\end{table}


\subsubsection{Other simulation scenarios} \label{subsec:other_simulations}
Other properties of the model and empirical verifications are briefly summarized in this section due to the page limit.

\textbf{Varying-timescale background.} See Appendix C.5.
We violate the settings in section \ref{subsec:background_kernel_smoothing} by relaxing the fixed background timescale $\sigma_I$ in \eqref{eq:linear_cox_main} to randomly changing timescale to test the robustness of the model. 

\textbf{Fast-changing background.} See Appendix C.6.
In extreme cases, the background activity $f_{i}$ can have fast-changing activities.
In this situation, the conditional inference-based method will be limited by its formalization of the null hypothesis, which implicitly assumes the timescale of the coupling effect is smaller than that of the background. Our model is still able to accurately estimate the cross-impact effect while the conditional inference-based method fails. 

\textbf{Asymptotic Normality.} See Appendix C.7.
% \citep{ogata1978asymptotic} proved that if there is no model misspecification, the MLE of the point process is asymptotically normal.
Similar to profile likelihood, the approximate normality of the estimator is observed in simulations. This property may be convenient for model inference; details are also in Appendix D.2.

\textbf{Selection of impact function length.} See Appendix C.8. In practice, the timescale of the interaction effect is typically unknown. When users are not confident with the prior knowledge of the timescale of the coupling effect, our method can be adapted to use a shorter impact function or non-parametric fitting first, as shown in Appendix D.1.


\subsection{Neuropixels data }

Spiking neural activities likely come with non-stationary background signals due to external stimuli or inter-area interactions.
With recent advances in high-density electrophysiological recording technologies, such as Neuropixels, hundreds of neurons from multiple brain regions can be recorded simultaneously. This offers opportunities to further investigate the interactions between brain areas \citep{siegle2021survey, chen2022population}.
However, point-to-point coupling effects on fine timescales across regions are not yet well studied.
Here, we apply our method to the hierarchical mouse visual system across 5 brain areas: V1, LM, RL, AL, and AM in ascending order with V1 as the primary visual cortex, thought to process simple visual features, and AM as the higher-order cortex thought to handle sophisticated signals \citep{harris2019hierarchical, siegle2021survey} (Figure \ref{fig:neuropixels}).
We aim to fit the coupling effect across brain regions and discover the excitatory or inhibitory interactions on a fine timescale.
Details are in Appendix F.

\begin{figure}[H]
\centering
\begin{subfigure}{0.48\textwidth}
\centering
\includegraphics[width=0.9\linewidth]{jitter_fig/neural_demo.pdf}
\end{subfigure}
\caption{Neuropixels data. 
\textbf{A}: Activities of three brain areas showing correlated backgrounds.
\textbf{B}, \textbf{C}, and \textbf{D} are results for a pair of neurons.
\textbf{B}: CCG.
\textbf{C}: standard MHP.
\textbf{D}: Our method.
}
\label{fig:neural_demo}
\end{figure}
%
%
Figure \ref{fig:neural_demo}A demonstrates the averaged activities of 3 brain regions with large correlations providing a clue for the background artifact. Results are very similar to the simulation in Figure \ref{fig:sim_demo};  CCG in Figure \ref{fig:neural_demo}B shows some negative but not statistically significant effects.
Our method is more sensitive in detecting the effect between 0 and 50 ms lag.
In contrast, due to the background artifacts, the standard Hawkes model detects non-significant or slightly positive coupling effects.

Figure \ref{fig:neuropixels} shows the discovered neuronal network of 190 neurons.
Multiple significant impact functions are selected with Bonferroni correction at level 0.01. 
A total of 766 directed edges are split into bottom-up connections and top-down connections \citep{siegle2021survey,harris2019hierarchical}.
The impact function is fitted using a 50 ms square window determined by exploring CCG and non-parametric fitting (see examples in Appendix F).
Our main findings using the MHP extension are: (a) most edges concentrate at a few neurons, and (b)
the active senders or receivers are consistent across top-down and bottom-up networks. The real data has no ground truth so we cannot directly evaluate the performance of this multivariate extension. However, the findings directly corroborate previous neuroscience studies \citep{harris2019hierarchical,glickfeld2017higher} based on anatomical analysis, whereas our findings are entirely data-driven. The findings are also complementary to \citep{jia2020multi,siegle2021survey} using the traditional CCG method (in section 2, our method outperforms CCG in both computation and performance).
\begin{figure}[H]
\centering
\includegraphics[width=0.95\linewidth]{jitter_fig/neuropixels_graph.pdf}
\caption{Network of coupling neurons in mouse visual system.
Excitatory (positive) and inhibitory (negative) impact functions are shown in red and blue edges.
20\% randomly selected edges are shown.
The coupling filter connecting a lower-order region to a higher-order region, for example from V1 to RL, is categorized into the bottom-up graph on the left; the graph on the right shows the top-down connections \citep{siegle2021survey,harris2019hierarchical}.
The small graphs at the corner count the total number of edges between areas.
}
\label{fig:neuropixels}
\end{figure}
%
%
Figure \ref{fig:h_histogram} compares the histograms of the impact function amplitudes between the standard Hawkes model and our model.
% Because of the background artifacts shown in Figure \ref{fig:neural_demo}, 
It is suspected that the standard Hawkes model may falsely detect more positive relations.
Goodness-of-fit analysis and more details of the experiment can be found in Appendix F. As shown, the above discoveries are greatly facilitated by our method.
%
%
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{jitter_fig/h_hist.pdf}
\caption{Histograms of estimated impact function amplitudes.
}
\label{fig:h_histogram}
\end{figure}

\section{Conclusion }
% discussion related to causality.
We report and analyze the error of MLE from MHP in short-term temporal dependency detection due to heterogeneous background, which we believe is common but largely overlooked. We developed a flexible, robust, and computationally efficient model to address this problem in an attempt to generalize the use case for MHP in practice.
Finally, we applied the new tool to a neuroscience dataset and discovered the structure of a patterned neuronal network across visual cortices in the mouse visual system.

% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    Yu Chen was supported by NIMH grant R01 MH064537 when the work was done at Carnegie Mellon University.
    % \emph{All} acknowledgements go in this section.
\end{acknowledgements}


% References
\bibliography{chen_580}


\end{document}
