\section{First-order error bound}\label{sec:tight_error_bound}


% Here, we present a first order error bound by learning just one PINN, addressing the practical challenge of obtaining $B_2$. We also show an implicit formula that is able to check the training termination. We discuss the feasibility of this approach, present our training scheme, and illustrate relevant extensions. See Appendix~\ref{proof:coro1} and~\ref{proof:prop1} for complete proofs.

Here, we introduce a first-order error bound using a single PINN, overcoming the challenge of obtaining $B_2$. We also provide an implicit formula to determine training termination. Additionally, we discuss the feasibility of this approach, outline our training scheme, and explore relevant extensions. Complete proofs can be found in Appendices~\ref{proof:coro1} and~\ref{proof:prop1}.


% \paragraph{First-Order Error Bound}
By considering Eq.~\eqref{eq:error_series} with $n=2$ and using Lemmas~\ref{lemma:bouding series of ratios} and~\ref{lemma:bounds_on_ratios}, we can derive the first-order error bound, as stated in the following corollary.
\begin{corollary}[First-order error bound]\label{corollary:special_error_bound}
    Consider Problem~\ref{prob:1}, and let $\hat{e}_1$ be trained such that $\alpha_1(t)<1$ for all $t \in T'$. Then
    \begin{equation}
        |p(x,t)-\hat{p}(x,t)| < B_1(t) = 2\hat{e}_1^*(t).
        \label{eq:tight_error_bound}
    \end{equation}
\end{corollary}
% Note that, while the first-order error bound $B_1(t)$ is at most twice larger than the second-order error bound $B_2(t)$ in Theorem~\ref{theorem:temporal_error_bound}, it has significant practical uses.  Firstly, it only requires training of one PINN, i.e., $\hat{e}_1$.  Secondly, the condition $\alpha_1(t) < 1$ can be checked during training of $\hat{e}_1$ using properties of the FP-PDE as detailed below.
Note that the first-order error bound $B_1(t)$ can be at most twice as large as the arbitrarily tight second-order bound $B_2(t)$ in Theorem~\ref{theorem:temporal_error_bound}, but it offers significant practical advantages.
Firstly, the second-order bound $B_2(t)$ requires training a second PINN \(\hat{e}_2\) after training \(\hat{e}_1\). However, achieving the required accuracy for \(\hat{e}_2\) in practice is quite challenging. In contrast, the first-order bound in Corollary~\ref{corollary:special_error_bound} relies solely on the approximation provided by a single PINN, \(\hat{e}_1\). 
Secondly, the condition $\alpha_1(t) < 1$ can be verified during the training of $\hat{e}_1$ using properties of the FP-PDE, as detailed below.

\paragraph{Checking $\alpha_1 < 1$ Condition}
From the definition of $\alpha_1(t)$ in Eq.~\eqref{eq:alpha_def}, it suffices to bound the unknown term $|e_1(x,t)-\hat{e}_1(x,t)|$ for all $(x,t) \in \Omega$ to check for $\alpha_1(t)<1$.
We do this by using three constants: the first two constants are related to PDE stability and quadrature rules \citep{mishra2023estimates}, and the third constant comes from the Sobolev embedding theorem
\citep[Theorem 12.71]{hunter2001applied}\citep{mizuguchi2017estimation}.

The first constant $C_{pde}$ is related to the \emph{stability} of the first error PDE, which is defined as
\begin{equation*}
    \|e_1 - \hat{e}_1\|_Z \leq C_{pde} \| (\mathcal{D}[e_1] + \mathcal{D}[\hat{p}]) - (\mathcal{D}[\hat{e}_1]+\mathcal{D}[\hat{p}]) \|_Y,
\end{equation*}
where $\|\cdot\|_Z$ is the $W^{k,q}$ norm, $\|\cdot\|_Y$ is the $L^s$ norm, $1\leq s,q < \infty$, and $k\geq 0$.
Note that since $e_1$, $\hat{e}_1$, and $(\mathcal{D}[e_1] + \mathcal{D}[\hat{p}]) - (\mathcal{D}[\hat{e}_1]+\mathcal{D}[\hat{p}]) = 0-(\mathcal{D}[\hat{e}_1]+\mathcal{D}[\hat{p}])$
are bounded,
% \footnote{$\hat{p},\hat{e}_1$ are approximate functions with bounded derivatives}
such a constant $C_{pde}$ exists.

The second constant $C_{quad} > 0$ is related to the deviation between an integral and its approximation with finitely many samples. Let $\mathcal{I} = \int_{\Omega} \Big(\mathcal{D}[\hat{e}_1(x,t)] + \mathcal{D}[\hat{p}(x,t)]\Big) \, dx \, dt$ be the integral of interest, and $\bar{\mathcal{I}} = \sum_{j=1}^N w_j \Big(\mathcal{D}[\hat{e}_1(x_j,t_j)] +  \mathcal{D}[\hat{p}(x_j,t_j)] \Big)$ be its associated approximation, where $\{(x_j,t_j)\}_{j=1}^N \subset \Omega$ is a set of $N$ quadrature points, and $w_j \in \mathbb{R}_{>0}$ are the weights given by the quadrature rule. Then $C_{quad}$ is defined such that, for some $\beta > 0$,
$
    \left|\mathcal{I} - \bar{\mathcal{I}} \right| \leq C_{quad}N^{-\beta}.
$
% The procedure of deriving these constants for general PDEs is shown in \citep{mishra2023estimates}.

The third constant $C_{embed}$, from the Sobolev embedding theorem, is defined as
\begin{equation*}
    \|e_1(x,t)-\hat{e}_1(x,t)\|_{\infty} \leq C_{embed} \|e_1(x,t)-\hat{e}_1(x,t)\|_{W^{1,q}}.
\end{equation*}
Constant $C_{embed}$ exists because $e_1(x,t)$ and $\hat{e}_1(x,t)$ are bounded (per Definition~\ref{def:1}), and the first derivatives of $e_1(x,t)$ and $\hat{e}_1(x,t)$ are also bounded over the considered domain of Problem~\ref{prob:1}.
With these constants, we propose an implicit checking formula for $\alpha_1(t)<1$.
\begin{proposition}[Checking $\alpha_1(t)<1$]    \label{prop:checking_alpha1}
    Let $\{(x_j,t_j)\}_{j=1}^N \subset \Omega$ be $N$ space-time samples based on quadrature rules, $\hat{e}_1(x,t)$ be the first error approximation, and $\mathcal{L}^{(1)}$ be the physics-informed loss of $\hat{e}_1(x,t)$ evaluated on the set $\{(x_j,t_j)\}_{j=1}^N$. Then for some $q\geq 2$ and $\beta > 0$, $\alpha_1(t) < 1$ for all $t \in T'$ if
    \begin{align}
        C_{embed}C_{pde}\Big( \mathcal{L}^{(1)} + C_{quad}^{\frac{1}{q}}N^{\frac{-\beta}{q}} \Big) < \min_t \hat{e}_1^*(t).
        \label{eq:checking_alpha1}
    \end{align}
\end{proposition}
By Proposition~\ref{prop:checking_alpha1}, it is clear that as the training loss decreases ($\mathcal{L}^{(1)} \rightarrow 0$) with a sufficiently large number of samples ($N \rightarrow \infty$), the left-hand side of Eq.~\eqref{eq:checking_alpha1} approaches zero. Hence, the condition $\alpha_1 < 1$ can be satisfied,
as validated in our numerical evaluations.

% \add{Below we further analyze the effect of each constant in  Proposition~\ref{prop:checking_alpha1}.}

% \paragraph{Effect of Each Constant in Proposition~\ref{prop:checking_alpha1}}
% \add{
% We acknowledge that Proposition~\ref{prop:checking_alpha1} is an \emph{implicit} checking formula because the explicit values of the constants $C_{pde},C_{quad}$ and $C_{embed}$ may not be easily computed. 
% In \citep{mishra2023estimates}, Cauchy-Schwarz and Grönwall inequalities are used to (over-)estimate the first constant $C_{pde}$, which typically grows exponentially with time. Their approach is applied to several PDEs, including nonlinear ones. The second constant $C_{quad}$ in Eq.~\eqref{eq:checking_alpha1} is multiplied by $N^{-\beta}$, where $N$ is the number of quadrature points and $\beta > 0$ is the convergence rate. Thus, a sufficiently large $N$ can ensure $C_{quad}N^{-\beta} \ll 1$. For instance, $C_{quad}N^{-\beta}$ is proportional to $\frac{1}{2N!}$ by standard Gauss quadrature rules \citep{stoer1980introduction}, and $C_{quad}N^{-\beta}$ converges at a rate of $(\log(N))^n N^{-1}$ with low-discrepancy sequences \citep{mishra2023estimates}. The third constant $C_{embed}$ depends on the domain geometry and generally grows with domain size as in \citep{mizuguchi2017estimation}. For example, $C_{embed}=5.61$ for unit square domain and $C_{embed}=22.62$ for unit cube.
% In summary, the values of $C_{pde}$ and $C_{embed}$ increase with the state dimension and the size of the domain, making the condition in Proposition 1 more challenging to satisfy.
% Since this work focus on theoretical foundation, finding the explicit values of these constants is left for future work.
% }

% Note that values of the constants $C_{pde},C_{quad}$ and $C_{embed}$ may not be easily computed. 

Note that it suffices to have upper bounds on the constants in Proposition~\ref{prop:checking_alpha1}.
Specifically, \citet{mishra2023estimates} present a method for overestimating $C_{pde}$.
% Further, note that $C_{quad}$ in Eq.~\eqref{eq:checking_alpha1} is multiplied by $N^{-\beta}$, where $N$ is the number of quadrature points and $\beta > 0$ is the convergence rate. Thus, a sufficiently large $N$ can ensure $C_{quad}N^{-\beta} \ll 1$. For instance, $C_{quad}N^{-\beta}$ is proportional to $\frac{1}{2N!}$ by standard Gauss quadrature rules \citep{stoer1980introduction}, and $C_{quad}N^{-\beta}$ converges at a rate of $(\log(N))^n N^{-1}$ with low-discrepancy sequences \citep{mishra2023estimates}. The third 
Constant $C_{embed}$ depends on the domain geometry \citep{mizuguchi2017estimation}.  
Also note that a sufficiently large $N$ can ensure $C_{quad}N^{-\beta} \ll 1$. 

% For the constants in Proposition~\ref{prop:checking_alpha1}, it is sufficient to obtain upper bounds. Specifically, \citep{mishra2023estimates} provides a method for overestimating $C_{pde}$. Additionally, $C_{quad}$ in Eq.~\eqref{eq:checking_alpha1} is scaled by $N^{-\beta}$, where $N$ is the number of quadrature points and $\beta > 0$ is the convergence rate. Thus, choosing a sufficiently large $N$ ensures $C_{quad}N^{-\beta} \ll 1$. 
% For example, by standard Gauss quadrature rules \citep{stoer1980introduction}, \( C_{quad}N^{-\beta} \) is proportional to \( \frac{1}{2N!} \), while with low-discrepancy sequences, it converges at a rate of \( (\log N)^n N^{-1} \) \citep{mishra2023estimates}. 
% The third constant, $C_{embed}$, depends on the domain geometry and generally increases with domain size, as shown in \citep{mizuguchi2017estimation}.
% Specifically, our show that as loss of $\hat{e}_1$ decreases, $\alpha_1 <1$ is achieved.

% For example, $C_{embed}=5.61$ for unit square domain and $C_{embed}=22.62$ for unit cube. In summary, the values of $C_{pde}$ and $C_{embed}$ increase with the state dimension and the size of the domain, making the condition in Proposition 1 more challenging to satisfy.
% Since this work focus on theoretical foundation, finding the explicit values of these constants is left for future work.
% Specifically, our evaluations show that as loss of $\hat{e}_1$ decreases, $\alpha_1 <1$ is achieved.


% \begin{figure*}[t] % Use figure* for wide figures spanning both columns
%     \centering
%     \begin{subfigure}[b]{0.30\textwidth} % First row, first column
%         \includegraphics[width=\textwidth]{figs/1dou_compare_error_bounds.pdf} % Replace with your image
%         \caption{$\max_x|e_1(x,t)|< B_2(t) < B_1(t)$.}
%         \label{fig:sub1a}
%     \end{subfigure}
%     \hspace{0em}
%     \begin{subfigure}[b]{0.36\textwidth} % First row, second column
%         \includegraphics[width=\textwidth]{figs/1dou_conditions.pdf} % Replace with your image
%         \caption{Satisfy Conditions~\ref{eq:alpha_conditions}.}
%         \label{fig:sub1b}
%     \end{subfigure}
%     \caption{Example of arbitrary first order error bound $B_2(t)$, illustrating its tightness and training conditions.}
%     \label{fig:1}
% \end{figure*}

\begin{figure*}[t]
    \centering
    % Minipage for Figure 1 (1x2 subfigures)
    \begin{minipage}{0.59\textwidth}
        \centering
        \begin{subfigure}{0.40\textwidth}
            \centering
            \includegraphics[width=\linewidth]{figs/1dou_compare_error_bounds.pdf}
            \caption{$e_1^*(t) < B_2(t) < B_1(t)$}
            \label{fig:sub1a}
        \end{subfigure}
        \hspace{0em}
        \begin{subfigure}{0.58\textwidth}
            \centering
            \includegraphics[width=\linewidth]{figs/1dou_conditions.pdf}
            \caption{Conditions~\eqref{eq:alpha_conditions} are satisfied.}
           \label{fig:sub1b}
        \end{subfigure}
        \caption{Second-order error bound $B_2(t)$ on the 1D linear system.}
        \label{fig:1}
    \end{minipage}
    \hspace{0em}
    % Minipage for Figure 2
    \begin{minipage}{0.39\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/meta_plot.pdf}
        \caption{$\alpha_1(t),\;t \in T'$ vs.\ training loss of $\hat{e}_1$, for all first-order error bound experiments.}
        \label{fig:meta}
    \end{minipage}
\end{figure*}

\paragraph{Training Scheme for First-Order Error Bound}\label{sec:train_scheme}
Guided by Proposition~\ref{prop:checking_alpha1}, our goal is to train $\hat{e}_1$ to achieve sufficiently small loss.
By the PINN loss in Eq.~\eqref{eq:pinn_loss} and the PDE of the first error in Eqs.~\eqref{eq:err_def} and~\eqref{eq:error_1_e1_func_pde}, 
% and recursive PDE in Definition~\ref{def:1}, 
the training loss of $\hat{e}_1$
% with weights $w_0,w_r \in \mathbb{R}^+$ 
is:
\begin{subequations}\label{eq:pinn_loss_e1}
    \begin{align} 
    \mathcal{L}^{(1)} & = w_{0}\mathcal{L}_{0}^{(1)} + w_{r}\mathcal{L}_{r}^{(1)}, \\ 
    \label{eq:pinn_loss_0_e1}
    \mathcal{L}_{0}^{(1)} 
    & = \frac{1}{N_0}\sum_{k=1}^{N_0}\big( p_0(x_k)-\hat{p}(x_k,0) - \hat{e}_1(x_k,0) \big)^2, \\
    \label{eq:pinn_loss_r_e1}
    \mathcal{L}_{r}^{(1)} 
    & = \frac{1}{N_r}\sum_{k=1}^{N_r}\big(\mathcal{D}[\hat{e}_1(x_k,t_k)] + \mathcal{D}[\hat{p}(x_k,t_k)]\big)^2.
    \end{align}
\end{subequations}
By Eqs.~\eqref{eq:pinn_loss_0_e1}--\eqref{eq:pinn_loss_r_e1}, we see that training $\hat{e}_1$ requires inputs from the neural network $\hat{p}$ and its derivatives $\mathcal{D}[\hat{p}]$.
This can make training $\hat{e}_1$ difficult if the input $\mathcal{D}[\hat{p}]$ is highly oscillatory, even when $\hat{p}$ is smooth by construction~\citep{zhao2023pinnsformer}.
To address this issue, we implement a regularization loss to prevent rapid changes in $\mathcal{D}[\cdot]$ of the first PINN $\hat{p}$. Specifically, we train $\hat{p}$ by adding the following regularization loss to Eq.~\eqref{eq:pinn_loss}: 
\begin{equation}\label{eq:grad_reg}
\mathcal{L}_{\nabla}=\frac{1}{N_r}\sum_{k=1}^{N_r}\Big\| \nabla \Big( \mathcal{D}[\hat{p}(x_k,t_k)] \Big) \Big\|^2_2,
\end{equation}
where $\nabla$ is the gradient operator and $\|\cdot\|_2$ is the Euclidean ($\ell_2$) norm.
We note that this regularization loss does not violate the paradigm of physics-informed learning because, as the residual $\mathcal{D}[\cdot] \rightarrow 0$ for all $(x,t) \in \Omega$, its gradient $\nabla(\mathcal{D}[\cdot])$ also converges to zero.
In fact, this regularization was investigated by \citet{yu2022gradient} to improve the training stability of PINNs.
Note that the gradient regularization loss in Eq.~\eqref{eq:grad_reg} is not applied to the training of $\hat{e}_1$ in Eq.~\eqref{eq:pinn_loss_e1} because $\mathcal{D}[\hat{e}_1]$ is not used to train subsequent error functions.
% Our error bound theory could provide a partial reason to explain why gradient , because solutions that lead to complex error dynamics are penalized via the regularization loss.
A detailed description of our training scheme is provided in Appendix~\ref{appendix:train_and_results}.


% \paragraph{Extensions to Other PDEs}
% \ada{
% Although we present the error bound theory in the context of the FP-PDE, it is readily applicable for other PDEs with linear differential operator. This is achieved by adding boundary conditions (e.g. Dirichlet and Neumann conditions) in the recursive PDEs in Definition~\ref{def:1}. Accordingly, the recursive error learning in Eq.~\eqref{eq:pinn_loss_general} has extra loss term for satisfying the boundary conditions. We provide a complete derivation and numerical experiment of a heat PDE with Dirichlet boundary condition in Appendix~\ref{proof:1D_Heat}
% }
% \old{original Remark 5}

\begin{remark}\label{remark:3}
    Finally, we note that, while the presented approach focuses on the FP-PDE, training an approximate PDF $\hat{p}$, and bounding its error, the only essential requirement is that the differential operator $\mathcal{D}[\cdot]$ is linear. Therefore, this approach naturally extends to other linear PDEs subject to initial and boundary conditions (e.g., Dirichlet and Neumann conditions). We illustrate this in a case study in Appendices~\ref{proof:1D_Heat},~\ref{appendix:1D-HEAT-sys}, and~\ref{appendix:1D-HEAT-exp}.
\end{remark}