\section{Problem Formulation} \label{sec:problem}
The aim of this work is state uncertainty propagation with quantified error bounds for continuous-time
and continuous-space stochastic processes using deep neural networks. We specifically focus on
(possibly nonlinear) Stochastic Differential Equations (SDEs) described by
\begin{equation}\label{eq:sde_general}
    d\bm{x}(t) = f(\bm{x}(t),t) dt + g(\bm{x}(t),t) d\bm{w}(t),
\end{equation}
where 
$t \in T \subseteq \mathbb{R}_{\geq 0}$ is time,
$\bm{x}(t) \in X \subseteq \mathbb{R}^n$ is the system state at time $t$, and $\bm{w}(t) \in \mathbb{R}^m$ is a standard Brownian motion. 
For $\Omega= X\times T$, function $f: \Omega \rightarrow \mathbb{R}^n$ represents the deterministic evolution of the system, and function $g: \Omega \rightarrow \mathbb{R}^{n \times m}$ is a term that defines the coupling of the noise.
We assume that $f(x,t)$ and $g(x,t)$ satisfy the usual regularity conditions (e.g., see \citet[Ch.~7.1]{evans2022partial}), and
denote the $i$-th component of $f$ and the $(j,k)$-th element of $g$ by $f_i$ and $g_{jk}$, respectively.
The initial state $\bm{x}(0)$ is a random variable distributed according to a given probability density function (PDF) $p_0:X\rightarrow \mathbb{R}_{\geq 0}$, i.e., $\bm{x}(0) \sim p_0$. We assume that $p_0$ is bounded and smooth.

The solution to the SDE in Eq.~\eqref{eq:sde_general} is a stochastic process $\bm{x}$ 
with a corresponding PDF $p:\Omega \rightarrow \mathbb{R}_{\geq 0}$  
over space and time, i.e., $\bm{x}(t) \sim p(\cdot, t)$ \citep{oksendal2003stochastic}.
The PDF $p$
is governed by the Fokker-Planck partial differential equation (FP-PDE):
% \begin{equation}
\begin{multline}\label{eq:fp_pde}
    \frac{\partial p(x,t)}{\partial t} + \sum_{i=1}^n \frac{\partial}{\partial x_i} [f_i p(x,t)] - \\
    \frac{1}{2} \! \sum_{i,j=1}^n \frac{\partial ^2}{\partial x_i \partial x_j} \left[ \sum_{k=1}^m g_{ik}g_{jk}p(x,t) \right] = 0,
\end{multline}
and must satisfy the initial condition
\begin{equation}
    p(x,0) = p_0(x) \qquad \forall x \in X.
    \label{eq:fp_pde_init_cond}
\end{equation}
% To simplify notation, we denote by $\mathcal{D}[\cdot]$ the differential operator associated with the FP-PDE:
% $$\mathcal{D}[\cdot]:= \frac{\partial}{\partial t}[\cdot] + \sum_{i=1}^n \frac{\partial}{\partial x_i} [f_i \cdot] - \frac{1}{2} \sum_{i=1,j=1}^n \frac{\partial ^2}{\partial x_i \partial x_j} \left[ \sum_{k=1}^m g_{ik}g_{jk} \cdot \right].$$ Then,~\eqref{eq:fp_pde} and \eqref{eq:fp_pde_init_cond} can be rewritten in a compact form  as
% \begin{equation}
%     \mathcal{D}[p(x,t)] = 0, \quad \text{ subject to } \quad p(x,0) = p_0(x).
%     \label{eq:fp_pde_compact}
% \end{equation}
To simplify notation, we denote by $\mathcal{D}[\cdot]$ the differential operator associated with the FP-PDE, i.e.,
\[
    \mathcal{D}[\cdot]:= \frac{\partial}{\partial t}[\cdot] + \sum_{i=1}^n \frac{\partial}{\partial x_i} [f_i \cdot] - \frac{1}{2} \! \sum_{i,j=1}^n \frac{\partial ^2}{\partial x_i \partial x_j} \left[ \sum_{k=1}^m g_{ik}g_{jk} \cdot \right].
\]
Then, Eqs.~\eqref{eq:fp_pde} and \eqref{eq:fp_pde_init_cond} can be rewritten in a compact form as
\begin{equation}
    \mathcal{D}[p(x,t)] = 0 \quad \text{ subject to } \quad p(x,0) = p_0(x).
    \label{eq:fp_pde_compact}
\end{equation}
Note that, since $f$ and $g$ are assumed to be regular, the PDE in Eq.~\eqref{eq:fp_pde_compact} is well-posed, i.e., there exists a smooth and unique solution.

Obtaining the solution $p(x,t)$ to Eq.~\eqref{eq:fp_pde_compact} in closed form is generally not possible, and even numerical approaches are limited to simple SDEs \citep{spencer1993numerical,drozdov1996solution,masud2005application,chakravorty2006homotopic,pichler2013numerical,qian2019conservative,urena2020non}.
In this work, we focus on using PINNs to approximate $p$, and crucially, we aim to formally bound the resulting approximation error.
\begin{problem}\label{prob:1}
    Given the FP-PDE in Eq.~\eqref{eq:fp_pde_compact}, a bounded subset $X' \subset X$, and a bounded time interval $T' \subset T$,
    train a neural network $\hat{p}(x,t)$ that approximates the solution $p(x,t)$, and construct an error bound $B:T' \rightarrow \mathbb{R}_{> 0}$ such that, for every $t \in T'$,
    \begin{equation}
        \sup_{x\in X'} |p(x,t)-\hat{p}(x,t)| \leq B(t).
    \end{equation}
\end{problem}

% \add{Note that we assume explicit knowledge of the system (i.e., the functions $f$ and $g$ in Eq.~\eqref{eq:fp_pde} are known).
% For partially known systems, inverse PINNs can be applied as in \citep{lu2021deepxde} to simultaneously identify system parameters and learn solution with training data; however, this setting is outside the scope of this paper.}
% which derives from a presumption that a sufficiently good system identification is done a-priori.
% To have a well-posed Problem~\ref{prob:1}, we require bounded initial distribution (e.g. $p_0(x)$ cannot be a delta function). 
% \add{We further assume smooth $p_0(x)$ to simplify the problem.}
% \add{Problem~\ref{prob:1} is challenging because the true PDF $p(x,t)$ is unknown except for the initial time.} 

% \ml{remove this!}
% \rev{In Problem~\ref{prob:1}, $X'$ is a bounded subset of the state‐space of interest (e.g., regions to avoid). Meanwhile, $T'$ is a bounded time interval.}
Note that no data on $p$ is assumed to be available.
Instead, our approach leverages the governing 
Eq.~\eqref{eq:fp_pde_compact} for both training $\hat{p}$ and quantifying its error. 
Specifically, we first 
show that existing PINN training methods for PDE solutions can be adapted to approximate $p$ effectively if the training loss is sufficiently small. 
Then, 
we show that the resulting approximation error can be written as a series of approximate error functions, each of which satisfies a PDE similar to Eq.~\eqref{eq:fp_pde_compact}. This implies that each error function itself can be approximated using a PINN. Finally, we derive conditions under which only a finite number of such PINNs is needed to obtain a guaranteed error bound $B(t)$.

\begin{remark}
    While we focus on $\hat{p}$ being a neural network, our method of deriving the error bound $B(t)$ is not limited to neural networks and generalizes to any smooth function $\hat{p}$ that approximates the true solution $p$.
\end{remark}