\section{PDF Approximation and Error Characterization via PINNs
% Learn Solutions and Recursive Errors
}\label{sec:learn}

Here, we first describe a method for training a neural network to approximate the PDF $p(x,t)$ for Problem~\ref{prob:1}, and then derive a recursive error-learning approach to estimate the approximation error.

\paragraph{Learning PDF $p$}\label{sec:learn_pdf}
% Given the PDE in Eq.~\eqref{eq:fp_pde_compact}, as common in physics-informed deep learning, 
We approximate $p(x,t)$ by learning a neural network $\hat{p}(x,t;\theta)$, where $\theta$ represents the parameters. 
During training, 
spatio-temporal data points $\{(x_j,0)\}_{j=1}^{N_0}, \{(x_j,t_j)\}_{j=1}^{N_r} \subset \Omega$,
for some $N_0,N_r \in \mathbb{N}$, are sampled, and the loss function is derived from the governing physics in Eq.~\eqref{eq:fp_pde_compact} as 
\begin{align}\label{eq:pinn_loss}
    \mathcal{L} = w_{0}\mathcal{L}_{0} + w_{r}\mathcal{L}_{r},
\end{align}
where  $w_0,w_r \in \mathbb{R}^+$ are the weights, and
\begin{subequations}\label{eq:pinn_sub_loss}
    \begin{align} 
        \mathcal{L}_{0} &= \frac{1}{N_0}\sum_{j=1}^{N_0}\big( p_0(x_j) - \hat{p}(x_j,0;\theta) \big)^2, \\
        \mathcal{L}_{r} &= \frac{1}{N_r}\sum_{j=1}^{N_r}\big(\mathcal{D}[\hat{p}(x_j,t_j;\theta)]\big)^2.
    \end{align}
\end{subequations}
The loss function in Eq.~\eqref{eq:pinn_loss} quantifies the deviation between the true and approximate solutions in terms of the initial condition ($\mathcal{L}_0$) and the infinitesimal variation over space and time ($\mathcal{L}_r$).
The parameters of $\hat{p}(x,t;\theta)$ are learned by minimizing the loss function, i.e., $\theta^{*} = \arg\min_{\theta} \mathcal{L}.$

We note that overfitting can arise if the training samples in Eqs.~\eqref{eq:pinn_sub_loss} do not sufficiently cover the domain $\Omega$. 
To address this, we rely on common sampling practices in PINNs (e.g., uniform sampling \citep{sirignano2018dgm}) along with the adaptive sampling method of \citet{lu2021deepxde} to improve sampling efficiency (see Appendix~\ref{appendix:train_and_results} for details).

\begin{assumption}\label{assumption:phat}
    $\hat{p}:\Omega \rightarrow \mathbb{R}$ is assumed to be
    smooth. 
\end{assumption}
Assumption~\ref{assumption:phat} is present because $\hat{p}$ is trained by the physics-informed loss in Eq.~\eqref{eq:pinn_loss}, in which the second term $\mathcal{L}_r$ requires the computation of the first and second derivatives with respect to time and space, respectively.
To satisfy Assumption~\ref{assumption:phat}, smooth activation functions (e.g., $\mathrm{Tanh}$ and $\mathrm{Softplus}$) can be used in the architecture of $\hat{p}(x,t;\theta)$.
While \(\hat{p}\) here is real-valued, one can further ensure \(\hat{p}\ge 0\) by using a non-negative activation function (e.g., exponential or square) for the last layer.

% To train a neural network $\hat{p}$ using the physics-informed loss in Eq.~\eqref{eq:pinn_loss}, it has to be at least twice continuously differentiable with respect to $x$ and continuously differentiable with respect to $t$. For instance, smooth activation functions (e.g., $\mathrm{Tanh}$ and $\mathrm{Sigmoid}$) can be used in the architecture of $\hat{p}$ (see \citep{sirignano2018dgm} for detailed description of PINNs).
    
% The weights $w_{0}$ and $w_r$ in Eq.~\ref{eq:pinn_loss} determine the relative importance of the system's initial condition and evolution, which are chosen manually in standard PINNs research (e.g. $w_0=w_r=1$).
% Although the weight association may affect convergence rate, \citeauthor{shin2020convergence,mishra2023estimates} justify the theoretical convergence of PINNs output to the true solution as the loss is minimized.
The weights \(w_0\) and \(w_r\) in Eq.~\eqref{eq:pinn_loss} balance the initial-condition and PDE-residual terms. Although tuning these weights can affect the speed of convergence in practice, the training convergence does not rely on finding optimal weights \citep{shin2020convergence,mishra2023estimates}.


% We highlight that 
% % the true solution $p(x,t)$ is unknown for PINNs training. In fact, 
% the above training only requires the initial distribution $p_0(x)$ and the explicit functional form of the differential operator $\mathcal{D}$.
% Then, the losses in Eq.~\eqref{eq:pinn_sub_loss} can be evaluated on as many spatial-temporal samples as deried.
% This is a key distinction of PINNs from data-driven learning where a limited data regarding the true solution is often provided for training.

We emphasize that this method of training requires only the initial PDF $p_0(x)$ and the differential operator $\mathcal{D}$, allowing loss evaluation on unlimited space-time samples. This key distinction sets PINNs apart from data-driven learning, which relies on (limited) data from the (unavailable) true solution $p$.

\paragraph{Recursive Learning of Approximation Error}\label{sec:learn_issue}

% To quantify a PINNs output $\hat{p}(x,t)$ approximates the unknown true solution $p(x,t)$, 
Given a trained $\hat{p}$, we show that its approximation error can be characterized as a series of approximate solutions to PDEs. Specifically, we define the error as
\begin{align}
    \label{eq:err_def}
    e_1(x,t) := p(x,t) - \hat{p}(x,t).
\end{align}
% Here, we show that $e_1(x, t)$ can be characterized as a series of approximate solutions to PDEs.
% For simplicity, we remove the independent variable $(x,t)$ if the context is clear.
Note that the FP-PDE operator $\mathcal{D}$ is a linear operator; hence, by applying it to $e_1$, we obtain:
\begin{equation*}
    \mathcal{D}[e_1] = \mathcal{D}[p-\hat{p}] = \mathcal{D}[p] - \mathcal{D}[\hat{p}].
\end{equation*}
% because FP-PDE is a linear differential operator, and $\mathcal{D}[p]=0$ by Eq.~\eqref{eq:fp_pde_compact}.
As $\mathcal{D}[p]=0$, we can see that the error is essentially related to the residual $\mathcal{D}[\hat{p}]$. Then,
we can define the governing PDE of $e_1(x,t)$ as
% From Eq.~\eqref{eq:apply_fppde_on_error}, we observe that the dynamics of the approximation error $e_1$ is essentially related to $\mathcal{D}[\hat{p}]$, the residual of training $\hat{p}$. 
% We can thus form another PDE of $e_1(x,t)$ as 
\begin{equation}
    \mathcal{D}[e_1] + \mathcal{D}[\hat{p}] = 0
    \; \text{ s.t. } \;
    e_1(x,0) = p_0(x) - \hat{p}(x,0).
    \label{eq:error_1_e1_func_pde}
\end{equation}
Hence, using a similar approach as in Eqs.~\eqref{eq:pinn_loss} and~\eqref{eq:pinn_sub_loss}, a PINN can be trained to approximate $e_1(x,t)$ using its governing physics in Eq.~\eqref{eq:error_1_e1_func_pde}.
Based on this, we can define the $i$-th error and its associated approximation in a recursive manner.
\begin{definition}[$i$-th error and approximation]\label{def:1}
    Let $e_0 := p$ and $\hat{e}_0 := \hat{p}$. For $i \geq 1$, we define the $i$-th error \[e_i(x,t):=e_{i-1}(x,t)-\hat{e}_{i-1}(x,t),\] where each $\hat{e}_i$ is a smooth and bounded function constructed by a PINN to approximate $e_i$. Each $e_i$ is the solution to the recursive PDE
    \begin{multline*}
        \mathcal{D}[e_i(x,t)] +  \sum_{j=1}^{i} \mathcal{D}[\hat{e}_{j-1}(x,t)] = 0 \quad \text{s.t.} \\
        e_i(x,0) = e_{i-1}(x,0) - \hat{e}_{i-1}(x,0).
    \end{multline*}
\end{definition}
By the construction in Definition~\ref{def:1}, the approximation error $e_1(x,t)$, for every choice of $n \geq 0$, is given by
\begin{align}
    e_1(x,t) &= p(x,t)-\hat{p}(x,t) \nonumber \\
    &= \sum_{i=1}^{n} \hat{e}_i(x,t) + e_{n+1}(x,t).
    \label{eq:error_series}
\end{align}
% Although this provides a recursive procedure to estimate the unknown approximation error $e_1$, it does not directly generate a worst-case error bound. The reason is that no matter how many $\hat{e}_i$, $i=1,2,\dots,n$ we construct, there always remains an unquantified error term $e_{n+1}$. To address this issue, we present our error bound theory below.
% Below, we derive an upper bound for this $e_1$.
Although this recursive procedure estimates the unknown approximation error $e_1$, it does not directly provide a worst-case error bound. This is because, regardless of how many $\hat{e}_i$, $i=1,2,\dots,n$, are constructed, an unquantified error term $e_{n+1}$ always remains. To address this, we present our error bound theory in the next section.

