\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
\usepackage{subfig}
\usepackage{comment}
\usepackage{amsmath,amssymb,amsfonts,amsthm}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{wrapfig}
\usepackage{dsfont}
\usepackage{multirow}
%\usepackage{todonotes}
\usepackage{tabularx}
%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newtheorem{theorem}{{\bf Theorem}}
\newtheorem{lemma}{{\bf Lemma}}
\newtheorem{proposition}{{\bf Proposition}}
\newtheorem{remark}{{\bf Remark}}
\newtheorem{corollary}{{\bf Corollary}}
\newtheorem{definition}{{\bf Definition}}
\newtheorem{assumption}{Assumption}
\title{Inference and Sampling of Point Processes from Diffusion Excursions \\(Supplementary Material)}

\author[1,2]{\href{mailto:<ali.hasan@duke.edu>?Subject=Your UAI 2023 paper}{Ali~Hasan}{}}
\author[2]{Yu~Chen}
\author[1]{Yuting~Ng}
\author[3]{Mohamed Abdelghani}
\author[2]{Anderson~Schneider}
\author[1]{Vahid~Tarokh}
% Add affiliations after the authors
\affil[1]{%
    Department of Electrical and Computer Engineering\\
    Duke University\\
    Durham, North Carolina, USA
}
\affil[2]{%
    Machine Learning Research\\
    Morgan Stanley
}

\affil[3]{%
Department of Mathematics\\
    University of Alberta\\
    Edmonton, Alberta, Canada
}

\usepackage{xr}
\makeatletter
\makeatother % restore the category code of @ before the document body

\begin{document}
\onecolumn
\appendix
\maketitle

We provide some technical results that supplement the main results in the text, 
 including the calculation of the excursion length density, discussion about the connections between diffusion-based and intensity-based frameworks, and Lamperti transformation for formulating a constant variance diffusion process.
For all analysis, we will continue to work with the SDE given by
 \begin{equation}
    \label{eq:sde}
\mathrm{d} Z_t = \mu(Z_t, t) \mathrm{d}t + \sigma(t) \mathrm{d}W_t.
\end{equation}

\section{Proofs}
\label{sec:proofs}

\subsection{Proof of Excursion Length Density via Change of Measure}
\label{sec:proof_com}
We restate the main proposition here:
\begin{proposition}[Diffusion Excursion Density]
Let $Z_t$ satisfy an SDE with drift $\mu$ such that $Z_t$ is recurrent at zero.
Then the density of the excursion lengths $\tau$ of $Z_t$ is given by:
\begin{equation}
p_{Z}(\tau) = p_{e}(\tau; \delta)\mathbb{E}_{\mathbb{Q}_\updownarrow^\delta}\left[\exp\left(\int_0^\tau \mu(e_t,t;\theta)\mathrm{d}e_t - \frac12 \int_0^\tau \mu^2(e_t,t;\theta) \mathrm{d}t \right) \right].
\end{equation}

%\yuchen{Add subscript $\mathbb{Q}$ to $\mathbb{E}$. }
\end{proposition}

\begin{proof}
The proof follows a straightforward change of measure argument.
Performing the change of measure, we can write the Radon--Nikodym derivative as
$\frac{ \mathrm{d} \mathbb{P}^{\mu}}{\mathrm{d} \mathbb{Q}_\updownarrow}$ where $\mathbb{Q}_\updownarrow$ is the excursion length density;
$\mathbb{Q}_\updownarrow^{\delta}$ is the corresponding length density of $\delta$-excursions. 
Under Novikov's condition, we can write the exponential martingale as
\begin{equation}
\frac{ \mathrm{d} \mathbb{P}^{\mu}}{\mathrm{d} \mathbb{Q}_\updownarrow^{\delta}} = \exp \left ( \int_0^{\tau\wedge T} \mu(e_t,t) \mathrm{d}e_t - \frac12 \int_0^{\tau\wedge T} \mu(e_t, t)^2 \mathrm{d} t \right ).
\label{eq:radon}
\end{equation}
%\vv{I would remind that Novikov's holds to make it clear}
\noindent Now we have that 
\begin{align}
\nonumber \mathbb{P}^{\mu, \delta}(\tau \in \mathrm{d} t; \delta) &= \mathbb{E}_{\mathbb{Q}_\updownarrow^\delta}\left[ \frac{\mathrm{d} \mathbb{P}^\mu}{\mathrm{d} \mathbb{Q}_\updownarrow^{\delta}}  \mathds{1}(\tau \in \mathrm{d} t)\right] \\
\nonumber &= \mathbb{E}_{\mathbb{Q}_\updownarrow^{\delta}}\left[\exp \left ( \int_0^{\tau\wedge T} \mu(e_t,t) \mathrm{d}e_t - \frac12 \int_0^{\tau\wedge T} \mu(e_t,t)^2 \mathrm{d} t \right ) \mathds{1}(\tau \in \mathrm{d} t)\right] \\
&= \mathbb{E}_{\mathbb{Q}_\updownarrow^{\delta}}\left[\exp \left ( \int_0^{\tau\wedge T} \mu(e_t,t) \mathrm{d}e_t - \frac12 \int_0^{\tau\wedge T} \mu(e_t,t)^2 \mathrm{d} t \right ) \right] p_e(\tau; \delta)
\label{eq:com}
\end{align}
with the  $e_t$ being excursions of length $\tau$.
This results in the final PDF of $\tau$ as
\begin{equation}
p_{Z}(\tau; \delta) = p_{e}(\tau; \delta)\mathbb{E}_{\mathbb{Q}_\updownarrow^\delta}\left[\exp\left(\int_0^\tau \mu(e_t,t;\theta)\mathrm{d}e_t - \frac12 \int_0^\tau \mu^2(e_t,t;\theta) \mathrm{d}t \right) \right].
\end{equation}

%\yuchen{Add subscript $\mathbb{Q}$ to $\mathbb{E}$. }
\end{proof}

\subsection{Proof of Approximation Distance}
\label{sec:proof_approx}
\begin{proof}
First, by~\citet[Theorem 4]{gibbs2002choosing} note that the $1$-Wasserstein distance between measures with support on $[0, T]$ is bounded by the TV distance according to
$$
W_{1}(p_\star, p_0) \leq T \frac12 \| p_\star - p_0 \|.
$$
Now using Pinsker's inequality, we can bound the TV distance in terms of the log of the Radon-Nikodym derivative.
$$
T \frac12 \| p_\star - p_0 \| \leq T \sqrt{\frac12 \int d p_0 \left[ \int_0^T \mu dX_t - \frac12 \int_0^T \mu^2 dt \right]}.
$$
Taking the supremum over all functions within the function space that satisfies the positive recurrence condition leads to the result. 
\end{proof}

\section{Details of Excursion Length Density Given by Laplace Transform}
\label{sec:laplace}
State-dependent excursion densities can be calculated by inverting the Laplace transform of the solution of a differential equation.
In practice, this is difficult since the inverse Laplace transform is numerically unstable and intractable in general.
We do not follow this approach in the paper but state the details of the approach here for completeness. We define the procedure in the following proposition based on~\cite{pitman1999laplace}:
\begin{proposition}[\cite{pitman1999laplace}]
\label{prop:laplace}
Let $Z_t$ be the solution to the SDE in~\eqref{eq:sde}. 
Then define the scale and speed densities
\begin{equation}
s'(x) = \exp \left( -\int_0^x 2 \mu(u)/\sigma^2(u) \mathrm{d}u \right), \quad m(x) = \frac{2}{\sigma^2(x) s'(x)}.
\label{eq:speed_scale}
\end{equation}
Additionally, define the operator $\mathcal{A}$ as
$$
\mathcal{A} := \frac{1}{m(x)}\frac{\mathrm{d}}{\mathrm{d}x}\frac{1}{s'(x)}\frac{\mathrm{d}}{\mathrm{d}x}.
$$
Then the Laplace transform of the hitting time distribution is given by solving the eigenvalue problem of 
$$
\mathcal{A} \phi(x) = \lambda \phi(x)
$$
for the function $\phi(x)$ with Wiener-Hopf factorization given by $\phi_\pm(x)$: 
\begin{equation}
\mathbb{E}_{x_0}\left [e^{-\lambda \tau\left(Z_t\right)} \right] = \left \{ \begin{array}{cc}
    \phi_{\lambda,-}(x_0)/ \phi_{\lambda,-}(x_1) & x_0 < x_1\\
     \phi_{\lambda,+}(x_0)/ \phi_{\lambda,+}(x_1) & x_0 > x_1
\end{array} \right .
\label{eq:laplace_FHT}
\end{equation}
Then, using the strong Markov property, the total duration of the excursion from $x_0$ to $x_1$ and $x_1$ to $x_0$ (assuming $x_1 > x_0$) is given by

\begin{equation}
\mathbb{E}\left [e^{-\lambda \tau\left(Z_t\right)}\right] = \frac{\phi_{\lambda, -}(x_0)}{\phi_{\lambda, -}(x_1)}\frac{\phi_{\lambda, +}(x_1)}{\phi_{\lambda, +}(x_0)}.
\label{eq:laplace_ex}
\end{equation}
\end{proposition}
Notably, the inversion of the Laplace transform numerically is unstable, leading to difficulties in recovering the density from an optimized $\mu$ using this technique.

Because the drift and variance terms in the construction above depend only on the state, it is not straightforward to represent the general It\^o process in~\eqref{eq:sde}.
For the forward problem, a remedy is to first derive the first-hitting-time properties of a simple SDE with only state-dependent drift and variance, such as Brownian motion, Brownian motion with linear drift, the Bessel process, or the OU process~\citep[Part II]{borodin1997handbook}, for which the eigenvalues and eigenfunctions are available in closed form, and then apply Girsanov's theorem to extend these properties to more general SDEs.
For the backward problem, however, the Laplace transform of the hitting time density usually has no closed form. Only in a few cases can the Laplace transform of the first-hitting-time density be calculated in closed form and decomposed into increasing and decreasing eigenfunctions as in~\eqref{eq:laplace_FHT}; solving for the SDE drift or variance is thus not straightforward.

Here we present one example of solving the drift function through the Laplace transform for the first-hitting-time problem (not yet the excursion problem).
The example illustrates the difficulty of formulating the diffusion process via the Laplace transform from the modeling perspective as opposed to the proposed likelihood strategy in the main text.
We use the OU process for the forward problem, which is already well-known. 
Next, we focus on the backward problem, i.e. recovering the drift of an SDE given the first-hitting-time density.
Let the process start from $x_0$ and the hitting boundary be a constant $\alpha$.
The target first-hitting-time density is
\begin{equation}
p(t) = \frac{2|\alpha-x_0|}{\sqrt{2\pi}} (e^{2t}-1)^{-\frac{3}{2}}
    \exp\left\{ 2t - \frac{(\alpha-x_0)^2}{2 e^{2t} - 2} \right\}.
\end{equation}
Next, we solve for the corresponding SDE by fixing a constant variance coefficient $a(x) = 1$ and recovering the unknown drift, denoted $b(x)$.
The Laplace transform of the density is,
\begin{equation}
\mathbb{E}_{x_0}[\exp(-\lambda H_\alpha) ]
= \begin{cases}
\frac{\exp(x_0^2/2) D_{-\lambda}(- \sqrt{2}x_0) }
    { \exp(\alpha^2/2) D_{-\lambda}(- \sqrt{2} \alpha) }
= \frac{\Phi_{-,\lambda}(x_0 )}{\Phi_{-,\lambda}(\alpha )}, 
    \quad \alpha > x_0 \\
\frac{\exp(x_0^2/2) D_{-\lambda}(\sqrt{2}x_0) }
    { \exp(\alpha^2/2) D_{-\lambda}(\sqrt{2} \alpha) }
= \frac{\Phi_{+,\lambda}( x_0 )}{\Phi_{+,\lambda}( \alpha )}, 
    \quad x_0 > \alpha \\
\end{cases}
\label{eq:OU_fpt_laplace}
\end{equation}
where
\begin{align*}
D_{-\lambda}(x) :=& \frac{1}{\Gamma(\lambda)} e^{-\frac{x^2}{4}}
    \int_0^\infty t^{\lambda-1} e^{-x t - \frac{t^2}{2}} \mathrm{d}t
\end{align*}
which has the following properties
\begin{align*}
\frac{\mathrm{d} D_{-\lambda}(x) }{\mathrm{d} x}
=& -\frac{x}{2} D_{-\lambda}(x) - \lambda D_{-\lambda-1}(x)
    = \frac{x}{2} D_{-\lambda}(x) - D_{-\lambda+1}(x) \\
%%
\lambda D_{-\lambda-1}(x) 
=& - x D_{-\lambda}(x) + D_{-\lambda+1}(x) \\
%%
\frac{\mathrm{d}}{\mathrm{d} x} e^{x^2/2} D_{-\lambda}( \sqrt{2} x)
=& x e^{x^2/2} D_{-\lambda}( \sqrt{2} x) 
    + \sqrt{2} e^{x^2/2} D_{-\lambda}'( \sqrt{2} x) \\
=& x e^{x^2/2} D_{-\lambda}( \sqrt{2} x) 
    + \sqrt{2} e^{x^2/2} \left(- \frac{\sqrt{2} x}{2} D_{-\lambda}(\sqrt{2}x) 
    - \lambda D_{-\lambda-1}(\sqrt{2}x)  \right) \\
=& - \sqrt{2} \lambda e^{x^2/2}  D_{-\lambda-1}(\sqrt{2}x) \\
%%
\frac{\mathrm{d}^2}{\mathrm{d} x^2} e^{x^2/2} D_{-\lambda}( \sqrt{2} x)
=& - \sqrt{2} \lambda \frac{\mathrm{d}}{\mathrm{d} x} e^{x^2/2}  D_{-\lambda-1}(\sqrt{2}x)
= 2 \lambda (\lambda + 1) e^{x^2/2} D_{-\lambda-2}(\sqrt{2}x)
\end{align*}

According to Eqs.~\eqref{eq:laplace_FHT} and~\eqref{eq:OU_fpt_laplace}, the corresponding function can be defined as,
\begin{equation}
\Phi_{+,\lambda}(x) := e^{x^2/2} D_{-\lambda}( \sqrt{2} x).
\end{equation}
As defined above, $D_{-\lambda}(x) > 0$, and the derivative of $\Phi_{+,\lambda}(x)$ is negative, so the function is decreasing, satisfying the Laplace transform decomposition.
Substituting this into the eigenvalue problem for the infinitesimal generator built from~\eqref{eq:speed_scale}, which for unit variance reads $\frac12 \phi''(x) + b(x) \phi'(x) = \lambda \phi(x)$, we have the equality below for $\lambda \neq 0$
\begin{gather*}
\lambda (\lambda + 1) e^{x^2/2} D_{-\lambda-2}(\sqrt{2}x) 
- b(x) \sqrt{2} \lambda e^{x^2/2}  D_{-\lambda-1}(\sqrt{2}x)
= \lambda e^{x^2/2} D_{-\lambda}( \sqrt{2} x)  \\
\iff \quad 
- \sqrt{2}x D_{-\lambda-1}(\sqrt{2}x) + D_{-\lambda}(\sqrt{2}x)
- b(x) \sqrt{2} D_{-\lambda-1}(\sqrt{2}x)
=  D_{-\lambda}( \sqrt{2} x).
\end{gather*}
So the solution is $b(x) = -x$, and the SDE satisfies,
\begin{equation}
d Z_t = -Z_t d t + d W_t
\end{equation}
which is a standard OU process. This simple example shows a case where the Laplace transform can be derived in closed form and the resulting solution is attainable and only state-dependent. In general, such convenience is not guaranteed, and a state-dependent solution may not exist.

\begin{comment}
\section{A Poisson Process Perspective}
We will now illustrate It\^o's excursion theory in terms of the original point process on the space of excursions. 
This largely follows the explanation in~\citet[Section 4]{ananova2020excursion} but we connect it to the proposed methods for self-containment.
We define the excursion measure as $\mathcal{E}$, the space of all continuous functions $f : [0, \infty) \to \mathbb{R}$ such that $f(0) = 0$ and $f(t) = 0, \:\: t > \tau$ where $\tau = \inf \{ t > 0 : f(t) = 0 \}$.
It\^o's excursion theory constructs a measure $\nu$ over $\mathcal{E}$ and considers the counting process $N_\lambda(\Gamma) = \sum_{t \leq \lambda} \mathds{1}_{\{e_t \in \Gamma\}}$ where $\mathbb{E}(N_\lambda(\Gamma)) = \nu(\Gamma)$.

We can think of $\nu$ as being parameterized by the drift $\mu$.
In practice, very few $\mu$ 
\end{comment}

\section{Experimental Details}
\label{sec:experiment_details}
\subsection{Drift Recovery}
The list of drifts $\mu$ for the multi-dimensional drift recovery experiments is provided in Table~\ref{tab:mult_d_exp}.
We first simulate the $d$-dimensional diffusion process then compute the times when each dimension has an excursion from the corresponding axis. 
The parameter $\delta$ is chosen to be zero, but due to discretization in simulation, an infinite number of excursions is not observed. 
We use $\Delta_t = 0.01$ and simulate until terminal time $T=10$.
The time points where the diffusion crosses the axis are recorded. 
For the excursion estimator, we consider both positive and negative excursions which we simulate using a Bernoulli random variable with a probability of $0.5$.
We compare this to the bridge estimator that interpolates between zeros using Brownian bridges. 
The hyperparameters for both models are equal to ensure a fair comparison.
We train for 200 epochs, with a learning rate of $1 \times 10^{-3}$ using the Adam optimizer.
The architecture is a 64 width, 6 depth multi-layer perceptron with \texttt{Softplus} activation function.

\paragraph{Kernel Coefficient Recovery for Exogenous and History Dependent Drift}
For these experiments, we consider an OU process influenced by either an exogenous process or the history by defining the drift as $ \mu = -x + w\varphi(\,\cdot\,)$ with $\varphi(z) = \exp( - z / \sigma)$. 
We choose $\sigma = 1$ for the exogenous dependence and $\sigma = 2$ for the history dependence.
For both experiments, we simulate using Euler--Maruyama with $\Delta t=0.05$ up to $T = 50$ and observe the zero times.
The exogenous signal is generated according to 100 samples from a uniform random variable as $\mathcal{U}(0, 40)$ and sorted by the value.
For these experiments, we only optimize over the estimated parameter $\hat{w}$ due to the lack of identifiability if both components are unknown. 
We use the same parameters and only compare the standard SDE regression using Brownian bridges and the excursion based approach.
The models are trained using the \texttt{AdamW} optimizer with a learning rate of $1.0 \times 10^{-2}$.
Figure~\ref{fig:history_path} illustrates a sample path with dependence on the times when it reaches zero.


\begin{table}[]
    \centering
    \begin{tabular}{lll}
    \toprule
    Experiment & $\mu(X_t) $ & State space \\ \midrule
       Cubic  &  $ = -X_t^3$ & $ X_t \in \mathbb{R}^{10}$ \\
       Circle & $ = \begin{cases}
       -X_{1,t} - X_{2,t} \\
       -X_{2,t} + 5 X_{1,t}
       \end{cases}$  & $ X_t \in \mathbb{R}^2$ \\
       Tanh & $= -\tanh(X_t) $ & $  X_t \in \mathbb{R}^{10}$ \\
       OU & $ = -X_t $ & $  X_t \in \mathbb{R}^5$ \\
    \bottomrule
    \end{tabular}
    \caption{Table of drifts for multi-dimensional experiments. }
    \label{tab:mult_d_exp}
\end{table}

\subsection{Renewal Processes}
\begin{figure*}[h]
    \centering
    \includegraphics[width=0.23\textwidth]{figs/sampling/qq-exp.pdf}
    \includegraphics[width=0.23\textwidth]{figs/sampling/qq-weibull.pdf}
    \includegraphics[width=0.23\textwidth]{figs/sampling/qq-gamma.pdf}
    \includegraphics[width=0.23\textwidth]{figs/sampling/qq-lognormal.pdf}\hfill
    
    \includegraphics[width=0.23\textwidth]{figs/sampling/baseline_qq_exp_nc.pdf}
    \includegraphics[width=0.23\textwidth]{figs/sampling/baseline_qq_weibull_nc.pdf}
    \includegraphics[width=0.23\textwidth]{figs/sampling/baseline_qq_gamma_nc.pdf}
    \includegraphics[width=0.23\textwidth]{figs/sampling/baseline_qq_log-normal_nc.pdf}
    \caption{Comparison of QQ plots for Poisson process, Weibull, Gamma, and log-normal renewal processes. The top row shows results from our excursion-based estimator. The bottom row shows results from an intensity-based estimator. All are fit with the same architecture, optimization parameters, and 200 samples from the specified renewal process.}
    \label{fig:qq_baseline}
\end{figure*}
We consider four canonical renewal processes with parameters described in Table~\ref{tab:renewal}.
The architecture is a basic MLP with width 16 and 6 layers and the \texttt{Softplus} activation function.
They were trained for 2000 epochs using the \texttt{AdamW} optimizer with learning rate of $1 \times 10^{-3}$.
The learning rate for the $\delta$ parameter was $1 \times 10^{-2}$.
For reference on how the proposed estimator performs compared to a standard deep learning-based point process estimator, we provide~\autoref{fig:qq_baseline}, which compares the performance of an intensity-based model~\citep{shchur2019intensity} to our excursion-based model.

\begin{table}[]
    \centering
    \begin{tabular}{ll}
    \toprule
    Distribution & Parameters \\ \midrule
        Exponential & $\lambda =1$ \\
        Gamma & $\alpha=9, \, \beta =1$ \\
        Log-Normal & $\mu=0, \,\sigma=1$ \\
        Weibull & $\lambda=1, \, k=1.5$ \\
    \bottomrule
    \end{tabular}
    \caption{Renewal distributions with the corresponding parameters for the experiments in main text.}
    \label{tab:renewal}
\end{table}

\subsection{Real Data}  \label{subsec:real_data}

For the real data experiment, we consider a modification of the neural network that includes a positional encoding layer. 
This allows the network to learn higher frequency functions~\citep{tancik2020fourier}.
We use a \texttt{LeakyReLU} activation function. 

\begin{figure}
    \centering
\includegraphics[width=0.48\textwidth]{figs/learned_stim_1.pdf}
\includegraphics[width=0.48\textwidth]{figs/learned_stim_2.pdf} 
\includegraphics[width=0.48\textwidth]{figs/learned_stim_3.pdf}
\includegraphics[width=0.48\textwidth]{figs/learned_stim_4.pdf}
    \caption{Scaled learned stimulus for all time points.}
    \label{fig:learned_stim_all}
\end{figure}

\paragraph{Dataset Description}
The neuroscience dataset is the same as in~\citet{tripathy2013intermediate} and is composed of \textit{in vitro} whole-cell patch clamp recordings of mitral cells from mouse olfactory bulb slices.
Spikes were recorded over 100 trials of 2 seconds each by stimulating the neuron with a repeated frozen-noise current, which is generated by convolving white noise with an alpha function with $\tau = 3$ ms.



\paragraph{Transforming the Learned Stimulus}

Since the proposed method models the continuous path as excursions, we invoke a transformation such that the zeros are peaks. 
In particular, we compute $\tilde{Z_t} = a(\mathbb{E}[\log Z_t]) + b$ where $a, b$ are found using least squares. 
Note that due to the $\log$, this will result in $\tilde{Z_t}$ being either positive or negative.
In that sense, we cannot recover the sign of the excursion unless we're provided additional information. 
However, the main components we want to match are the peaks of the true stimulus and the learned stimulus. 
Figure~\ref{fig:learned_stim_all} shows that this is effectively accomplished. 

\section{Further Modeling Considerations}

\subsection{Incorporating History} \label{subsec:history_dependent}
\begin{figure}[h]
\centering
\includegraphics[width=0.4\textwidth]{figs/history_dependency_path.pdf}
\caption{Example of a sample path affected by its history.}
\label{fig:history_path}
\end{figure}

In practice, many point processes depend on the distribution of the history for a certain window.
We can include this in the model by making the drift dependent on the history by, for example, considering the structure of the drift to be a recurrent neural network.
Then the likelihood becomes a function of the history with the change of measure being the same as in~\eqref{eq:com}.
Specifically, we can change the drift function from $\mu(Z_t, t)$ in \eqref{eq:sde} to
$\mu(Z_t, t, \mathcal{H}_t)$, where $\mathcal{H}_t$ represents the history of the process up to time $t$. $\mathcal{H}_t$ can be the history of the whole path of $Z_t$, or it can be the history of discrete events 
$\{s \;|\; d L^{f}(s) = 1, s < t \}$, or subset of the history events.
The challenge of this model extension arises from the more complicated modeling of the drift function. We leave such an extension as future work.

To illustrate the point, here we consider a special example of history-dependent SDE, which mimics the intensity function of the Hawkes process,
\begin{align}
d Z_t =& \mu(t, \mathcal{H}_t) d t + d W_t \\
\mu(t, \mathcal{H}_t) =& \mu_0 t + h(t - t_{h,1} ) + h(t - t_{h,2})
\end{align}
where $\mu_0 t$ is the baseline drift with a constant slope, and the drift function depends on the last event $t_{h,1}$ and the second-to-last event $t_{h,2}$, so the diffusion process is history-dependent. $h(\cdot)$ is the kernel function describing how past events affect the future drift. In the example, the process only depends on the past two events before $t$. For example, if $h(t) = e^{-t} \cdot \mathbb{I}(t \geq 0)$, the past events will have a positive influence on the drift, and such influence decays as the events recede further into the past.
As shown in the examples in Appendix \ref{appendix:connection_intensity}, the linear Hawkes process is not equivalent to this problem, even though both of them consider the additive history effects.
An example of a sample path exhibiting this behavior is given in Figure~\ref{fig:history_path}.


\subsection{Connection to More General Point Processes } \label{subsec:connection_point_process}
In this section, we illustrate the modeling connection between the diffusion-based framework and the intensity-based framework.
The bridge between the two groups of methods is the conditional density of the next event given the history up to the current time.
For the intensity-based model, the conditional density of the next event is~\citep{daley2003introduction},
\begin{align*}
g\left(t \mid t_1, \ldots , t_{n-1} \right)& = \lambda\left(t \mid t_1, \ldots, t_{n-1}\right) \times 
\exp \left \{ - \int_{t_{n-1}}^t \lambda \left(u \mid t_1, \ldots , t_{n-1} \right) \mathrm{d}u \right \}
\end{align*}
for $t \geq t_{n-1}$ and where $\lambda$ is the conditional intensity function, 
$t_i$ are timestamps,
$g$ is the density of the next event.
Conversely, given the density $g$, the intensity function can also be derived; see Appendix~\ref{appendix:connection_intensity}.
For the diffusion-based model, suppose the point process is generated by an underlying SDE, where the events are a sequence of first-hitting times that reset the process to the initial level after every hit. 
Let $g(\tau), \tau \in \mathbb{R}_+$ be the density of the next event, assuming the current event occurs at $t=0$ ($g(\tau)$ can be different for every hitting time).
For the next event, there exists an SDE with a time-varying drift function $\mu(t)$ such that its first-hitting-time density is $g$ (again, $\mu(t)$ can be different for every hitting time).
The equivalent model is the driftless diffusion, such as Brownian motion, but with a time-varying boundary (the boundary can be different for every hitting time).
If the drift function or the boundary depends on the history, the model will include history dependency.
We leave more detailed discussions and numerical examples, such as the renewal process and Hawkes process, in Appendix \ref{appendix:connection_intensity}.
\subsection{Interpreting the Learned Drift}
A byproduct of the proposed method is a parameterization of a diffusion that generates the data. 
In particular, the drift that is recovered can be subsequently analyzed using traditional methods from stochastic calculus.
In general, the method should be used in cases where such interpretation is useful, such as in health care or in finance. 
The drift may provide clinically useful insight into the distribution of action potentials for a diseased versus healthy patient.
The drift provides an understanding at a multiscale level, leading to potential therapies that correspond to modulating the drift function.

In a similar example as described in the introduction, bursty transcription could be investigated again at a finer, molecular level.
The drift can correspond to some production rate or movement of the underlying molecules which can again lead to potential pathways for developing medications.

Finally, in a financial setting, arrivals of ask orders in a market may be related to excursions of the drawdown process of the perceived fair price. 
As the fair price reaches the running maximum, the drawdown process reaches zero, signaling an appropriate time to sell the financial instrument.
Developing and analyzing the drift of the latent fair price could lead to better risk management for market makers or more effective trading strategies. 

\section{Example Applications}
\label{sec:examples}
To motivate the proposed model, we include a few examples where the method may be appropriate and an intuitive interpretation is present.
We study some of these experiments in greater detail by considering the representational capabilities of the method in describing the data.

\paragraph{Fair Pricing from Bids and Asks of Illiquid Assets.}
Suppose we observe sets of point processes for a given asset on an exchange. 
Denote the set of samples associated to bids as $\mathbb{B} = \{t_i^{(b)}, b_i\}_{i=1}^N$ and the process for asks as $\mathbb{A} = \{t_i^{(a)}, a_i\}_{i=1}^M$. 
We assume the following properties of observations based on a latent fair price, $Z_t$:
\begin{enumerate}
    \item Bids are generated when $Z_t < \mathbb{E}\left[Z_t \, \bigg | \, Z_{t_i^{(b)}} = b_i\right] - \delta$ for $t_i^{(b)} < t$. 
    That is, the fair price should not exceed the expected fair price following the last bid.
    \item The fair price does not cross the expectation of the diffusion bridge between any two arrivals.
\end{enumerate}
Assuming $Z_t$ satisfies an SDE with unknown drift $\mu(Z_t, t)$, we consider the excursions above and below the curves:
\\
\begin{tabularx}{\textwidth}{XX}
{\begin{align*}
    f_\text{bid} &= \mathbb{E}[Z_t \mid Z_{t_i^{(b)}} = b_i] \\
    &= \int_{t_i^{(b)}}^t\mu(z, s) \mathrm{d}s + b_i 
\end{align*} 
}&{
\begin{align*}
        f_\text{ask} &= \mathbb{E}[Z_t \mid Z_{t_i^{(a)}} = a_i] \\
    &= \int_{t_i^{(a)}}^t\mu(z, s) \mathrm{d}s + a_i
\end{align*}
}
\end{tabularx}

Specifically, since $a_i \geq b_i$, $ f_\text{ask}(t) \geq f_\text{bid}(t)$ for all $t >0$. 
Qualitatively, the model suggests that a \emph{new bid} occurs when the fair price exceeds the expected value conditioned on starting at the \emph{last bid} price, with the fair price $Z_t$ being at least $\delta$ above $f_\text{bid}(t)$ within that interval; the opposite characterization holds for the process involving asks.
This is characterized by a $\delta$-excursion above $f_\text{bid}$ or below $f_\text{ask}$, leading to the interarrival time being the excursion length of the fair price above or below the curve. 
We illustrate this behavior and the generated point process in Figure~\ref{fig:bidask}.

\begin{figure}[h]
\vspace{-10pt}
\centering
\includegraphics[width=0.4\textwidth]{figs/run_path.pdf}
\caption{Examples of bids and asks generated according to the intersection of the fair price (blue) with the expected price from the last bid (green) and ask (orange).
}
\label{fig:bidask}
\end{figure}

\paragraph{Heart Rate Variability}
In~\citet{barbieri2005point}, the authors consider a renewal process for modeling heart rate variability based on a drifted Brownian motion.
We consider a similar model but based on excursions reaching a minimum height. 
This allows us to make an interpretation of the latent path being a continuous path instead of discontinuous as in the first hitting time case. 
The continuous path should resemble the ensemble of cardiac action potentials that generate the electrical signal whose peaks are the arrival times we observe. 
Analyzing the drift that governs the excursions can then be used for qualitative reasoning about the distributions of action potentials. 

\paragraph{EDA Data}
On another front concerning electrophysiology, \citet{subramanian2020point} describe a point process structure that captures the spiking properties in electrodermal activity (EDA) data. 
The authors posit the existence of a latent Brownian motion with first crossing times that are distributed according to the observed data. 
The Brownian motion is seen to provide a description of the underlying electrophysiology related to the release of sweat that in turn generates the points in the point process. 
We generalize this argument by considering a series of excursions above a threshold limit. 
This provides a continuous description relating to the release of sweat that generates the arrival of the point process.
The data are in the form of the arrival time of the pulse times derived from EDA data for the control group of patients (\textsc{EDA}), and for the patients under the influence of the drug Propofol (\textsc{EDA-P}). 

\section{Connection Between Diffusion-Based Model and Intensity-Based Model} \label{appendix:connection_intensity}

As introduced in Section~\ref{subsec:connection_point_process}, the conditional density of the next event relates the two groups of modeling methods: the diffusion-based method and the intensity-based method.
Recall the relationship between the joint arrival time distribution and the intensity function is given by
$$
\lambda(t_n | t_1, \ldots, t_{n-1}) = \frac{p(t_n | t_1, \ldots, t_{n-1})}{1 - \int_{t_{n-1}}^{t_n}p(s | t_1, \ldots, t_{n-1})ds}.
$$
Additionally, we have that $p(t_n | t_{i < n}) = p(t_n, \ldots, t_1)/p(t_{n-1}, \ldots, t_1)$.
Numerically, when considering the proposed estimator, we must compute the ratio of two expectations given our observations. 
The variance of this ratio estimator can be high, but we can use common random numbers to reduce it. 
That is, we compute the same set of bridges for both expectations.
This leads to only computing the expectation of the martingale portion multiplied by the original measure.
Specifically, 
$$
\lambda(t_n | t_1, \ldots, t_{n-1}) =\lambda(t_n | t_{n-1}) = \frac{p(t_n - t_{n-1} | t_{n-1})}{1 - \int_{t_{n-1}}^{t_n}p(s - t_{n-1} | t_{n-1})\mathrm{d}s}.
$$
Of course, if $\mu$ does not depend on $t$ then the conditional term is not needed.
We then approximate the integral in the denominator by a Riemann sum. 

On the other hand, the density of the next event can be derived from the intensity function as
\begin{equation} \label{eq:intensity_to_fpt}
p_n(t | t_1, \ldots, t_{n-1}) = \lambda(t | t_1, \ldots, t_{n-1}) 
\exp\left\{ - \int_{t_{n-1}}^t \lambda(u | t_1, \ldots, t_{n-1}) \mathrm{d}u \right\}.
\end{equation}


Next, we provide a few examples.
Let $p(x, t | x_{t_{n-1}}, 0)$ be the transition density of the dynamics $X_t$ starting from the origin, and let
$g(u | x_{t_{n-1}}, 0)$ be the density of the first hitting time of the process. 
Consider the random walk from $(x_{t_{n-1}}, 0)$ to $(x_t, t)$ through the first hitting point $(S(u), u)$.
By marginalizing out the first hitting time $(S(u), u)$ (Chapman–Kolmogorov equation), we have
\begin{equation}
p(x, t | x_{t_{n-1}}, 0) = \int_0^t p(x, t | S(u), u) \cdot g(u | x_{t_{n-1}}, 0) \mathrm{d}u.
\label{eq:fortet}
\end{equation}
The above is called the Fortet equation, which can be used to solve for the density of the first hitting time \citep{sacerdote2013stochastic}. The density $g(u | x_{t_{n-1}}, 0)$ depends on the design of $S(u)$.
Under general conditions, such a boundary exists \citep{potiron2021existence}; this is proved using a piecewise linear representation, which is also the technique used by \citet{sacerdote2003threshold}. The approximation accuracy can be arbitrarily high (the error is $O(h^2)$, where $h$ is the knot distance) \citep{zucca2009inverse}.


Substituting the target density $g$ into Eq.~\eqref{eq:fortet}, we can solve for the boundary function $S$.
If $g$ includes the history of the process, such as the Hawkes process, the solution $S$ becomes a history-dependent and time-varying boundary function.
The densities $p(x, t | x_{t_{n-1}}, 0)$ and $p(x, t | S(u), u)$ can be easily obtained using basic properties of Brownian motion.
\citet{sacerdote2003threshold} provide an approximate solution to the problem.

\section{Lamperti Transform} \label{sec:lamperti}
In this section, we show that if $\sigma$ is not a constant, the process can be transformed into an equivalent process with constant $\sigma$. Therefore, the model in the main text assumes a constant diffusion coefficient $\sigma = 1$ without loss of generality.
The transformation is described in \citet{ait2002maximum}.
Let $X_t$ represent the original process satisfying \eqref{eq:sde}.
Let $Y_t := \gamma(X_t,t) = \int^{X_t} \frac{\mathrm{d}x}{\sigma(x, t)}$. 
By It\^o's lemma, the transformed process satisfies
\begin{align*}
    dY_t &= \mu_Y(Y_t, t)dt + dW_t, \\ 
    \mu_Y(Y_t, t) &= \frac{\partial \gamma}{\partial t} \left(\gamma^{-1}\left(Y_t, t\right), t\right) + \frac{\mu\left(\gamma^{-1}(Y_t, t), t\right)}{\sigma\left(\gamma^{-1}\left(Y_t, t\right), t\right)} - \frac12 \frac{\partial \sigma}{\partial x} \left( \gamma^{-1}\left(Y_t, t \right), t \right),
\end{align*}
and therefore has unit diffusion. 
To overcome identifiability issues, we assume that this transformation implicitly occurs, and we only wish to recover the unit variance process $Y_t$. 

% \subsection{Multivariate Lamperti Transform}
% \yuchen{(You can't have only one subsection in a section, so I embed the multivariate case in to the text)}
The multivariate Lamperti transform is described  by \citet{ait2008closed}.
Let $\sigma(y)$ be a symmetric positive definite matrix for all $y \in \mathbb{R}^d$. 
Additionally, let $\sigma$ be differentiable everywhere and satisfy, for all $y$ and all indices $j, k$,
$$
\frac{\partial \sigma(y)}{\partial y_k}\sigma(y)^{-1}e_j = \frac{\partial \sigma(y)}{\partial y_j}\sigma(y)^{-1}e_k
$$
where $e_j$ is the $j^{\text{th}}$ canonical basis vector of $\mathbb{R}^d$.
Then the SDE given by Lipschitz $\mu, \sigma$ is reducible to an SDE with unit diffusion. 

\bibliography{refs}

\end{document}
