\section{Training Configurations and Additional Results}\label{appendix:train_and_results}
Here we first discuss the general training setting across all experiments. Then we discuss each experiment in detail and present additional results and visualizations.
In terms of training scheme, we use the Adam optimizer for all experiments. For the systems in Section~\ref{sec:exp_tight_error_bound}, we use the regularization technique discussed in Section~\ref{sec:train_scheme} to train the PDF PINN $\hat{p}$. We also employ adaptive sampling to make training of both $\hat{p}$ and the error PINN $\hat{e}_1$ more efficient (see \citet{lu2021deepxde} for a detailed explanation). As for the 1D Linear system in Section~\ref{sec:arb_tight_error_bound_example}, we simply sample random space-time points at every training iteration. For the architecture of the neural network, we use simple fully-connected feed-forward neural networks for both the solution PINN $\hat{p}$ and the error PINN $\hat{e}_1$. Information on the number of hidden layers, neurons, and activation functions for each experiment is provided below.

\subsection{1D Linear Experiment}\label{appendix:1D-L-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_1D-L} and~\ref{tab:nne1hat_1D-L}. For each training iteration, $N_0=N_r=500$ space-time points are uniformly sampled as in Eq.~\ref{eq:pinn_loss_general}, with weights $w_0=1$ and $w_r=|T|=2$.
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:1dl_trainloss}. The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_1$ are shown in Fig.~\ref{fig:1dl_surfaces}, and the constructed (and synthesized) error bounds at some time instances are visualized in Fig.~\ref{fig:1dl_errorboundsresult}.

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 32  & Softplus \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 32  & Softplus \\
        Hidden Layer 2 $\rightarrow$ Output Layer  & Fully Connected & 1  & Softplus \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_1D-L}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 32  & Softplus \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 32  & Softplus \\
        Hidden Layer 2 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_1D-L}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/1dl_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:1dl_trainloss}
\end{figure}

\begin{figure}[htbp!] % Use figure* for wide figures spanning both columns
    \centering
    \begin{subfigure}[b]{0.47\textwidth} % First row, first column
        \includegraphics[width=\textwidth]{figs/1dl_phatsurface.pdf}
        \caption{$\hat{p}$ vs $p$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.47\textwidth} % First row, second column
        \includegraphics[width=\textwidth]{figs/1dl_e1hatsurface.pdf}
        \caption{$\hat{e}_1$ vs $e_1$}
    \end{subfigure}
    \caption{Trained PINNs $\hat{p}(x,t)$ and $\hat{e}_1(x,t)$ vs.\ true PDF $p(x,t)$ and error $e_1(x,t)$ for all $x$ and $t$.}
    \label{fig:1dl_surfaces}
\end{figure}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/1dl_errorboundsresult.pdf}
    \caption{$p$, $\hat{p}$, $B_1$, and $B_2$ at some $t$.}
    \label{fig:1dl_errorboundsresult}
\end{figure}

\clearpage
\subsection{1D Nonlinear Experiment}\label{appendix:1D-NL-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_1D-NL} and~\ref{tab:nne1hat_1D-NL}.
The training starts from $N_0=N_r=1000$ uniformly distributed samples for both $\hat{p}$ and $\hat{e}_1$.
We regularize the training of $\hat{p}$ by setting the weights $w_0=1$ and $w_r=w_{\nabla}=|T|=5$. For $\hat{e}_1$, the weights are $w_0=1,w_r=|T|$, and $w_{\nabla}=0$.
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:1dnl_trainloss}. Note that the periodic spikes are not due to unstable training. Instead, they are due to the adaptive sampling scheme that periodically adds space-time points at which the residual values are large.
The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_1$ are shown in Fig.~\ref{fig:1dnl_surfaces}; the constructed error bounds at some time instances can be seen in Fig.~\ref{fig:representative_results}a.

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 50  & Softplus \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 50  & Softplus \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 50  & Softplus \\
        Hidden Layer 3 $\rightarrow$ Output Layer  & Fully Connected & 1  & Softplus \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_1D-NL}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 50  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 50  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 50  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 50  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 50  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Hidden Layer 6  & Fully Connected & 50  & GeLU \\
        Hidden Layer 6 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_1D-NL}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/1dnl_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:1dnl_trainloss}
\end{figure}

\begin{figure}[htbp!] % Use figure* for wide figures spanning both columns
    \centering
    \begin{subfigure}[b]{0.46\textwidth} % First row, first column
        \includegraphics[width=\textwidth]{figs/1dnl_phatsurface.pdf}
        \caption{$\hat{p}$ vs $p$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.46\textwidth} % First row, second column
        \includegraphics[width=\textwidth]{figs/1dnl_e1hatsurface.pdf}
        \caption{$\hat{e}_1$ vs $e_1$}
    \end{subfigure}
    \caption{Trained PINNs $\hat{p}(x,t)$ and $\hat{e}_1(x,t)$ vs.\ true PDF $p(x,t)$ and error $e_1(x,t)$ for all $x$ and $t$.}
    \label{fig:1dnl_surfaces}
\end{figure}

\clearpage
\subsection{2D Inverted Pendulum Experiment}\label{appendix:2D-PEND-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_2D-PEND} and~\ref{tab:nne1hat_2D-PEND}.
As in Appendix~\ref{appendix:1D-NL-exp}, the training starts from $N_0=N_r=1000$ uniformly distributed samples for both $\hat{p}$ and $\hat{e}_1$.
We regularize the training of $\hat{p}$ by setting the weights $w_0=1$ and $w_r=w_{\nabla}=|T|=5$. For $\hat{e}_1$, the weights are $w_0=1,w_r=|T|$, and $w_{\nabla}=0$.
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:2dpend_trainloss}.
Again, the periodic spikes in training loss are due to the adaptive sampling scheme that periodically adds space-time points, which becomes more effective as the system dimension grows.
The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_1$ are shown in Fig.~\ref{fig:2dpend_results}.
The constructed tight error bounds at some time instances are visualized in Fig.~\ref{fig:2dpend_errorbound}.

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 50  & Softplus \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 50  & Softplus \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 50  & Softplus \\
        Hidden Layer 3 $\rightarrow$ Output Layer  & Fully Connected & 1  & Softplus \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_2D-PEND}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 50  & Softplus \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 50  & Softplus \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 50  & Softplus \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 50  & Softplus \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 50  & Softplus \\
        Hidden Layer 5 $\rightarrow$ Hidden Layer 6  & Fully Connected & 50  & Softplus \\
        Hidden Layer 6 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_2D-PEND}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/2dpend_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:2dpend_trainloss}
\end{figure}

\begin{figure}[htbp!] % Use figure* for wide figures spanning both columns
    \centering
    \begin{subfigure}[b]{0.98\textwidth} % First row, first column
        \includegraphics[width=\textwidth]{figs/2dpend_phatresult.pdf}
        \caption{$\hat{p}$ vs $p$}
    \end{subfigure}
    \vfill
    \begin{subfigure}[b]{0.98\textwidth} % First row, second column
        \includegraphics[width=\textwidth]{figs/2dpend_e1hatresult.pdf}
        \caption{$\hat{e}_1$ vs $e_1$}
    \end{subfigure}
    \caption{Trained PINNs $\hat{p}(\theta,\omega,t)$ and $\hat{e}_1(\theta,\omega,t)$ vs.\ true PDF $p(\theta,\omega,t)$ and error $e_1(\theta,\omega,t)$ for all $\theta,\omega$ at some $t$.}
    \label{fig:2dpend_results}
\end{figure}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=1.0\linewidth]{figs/2dpend_errorbound.pdf}
    \caption{$|e_1|$, $|\hat{e}_1|$, and $B_1$ at some $t$.}
    \label{fig:2dpend_errorbound}
\end{figure}


\clearpage
\subsection{2D Duffing Oscillator Experiment}\label{appendix:2D-DUFF-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_2D-DUFF} and~\ref{tab:nne1hat_2D-DUFF}.
The training starts from $N_0=N_r=1000$ samples for both $\hat{p}$ and $\hat{e}_1$. Half of these samples are drawn uniformly, and the other half follow the normal distribution specified by the initial condition.
We regularize the training of $\hat{p}$ by setting the weights $w_0=1$ and $w_r=w_{\nabla}=|T|=5$. For $\hat{e}_1$, the weights are $w_0=1,w_r=|T|$, and $w_{\nabla}=0$.
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:2dduff_trainloss}.
Again, the periodic spikes in training loss are due to the adaptive sampling scheme that periodically adds space-time points, which becomes more effective as the system dimension grows.
The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_1$ are shown in Fig.~\ref{fig:2dduff_results}.
The constructed tight error bounds at some time instances are visualized in Fig.~\ref{fig:2dduff_errorbound}.

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 60  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 60  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 60  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 60  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 60  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Output Layer  & Fully Connected & 1  & Softplus \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_2D-DUFF}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 100  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 100  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 100  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 100  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 100  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_2D-DUFF}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/2dduff_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:2dduff_trainloss}
\end{figure}

\begin{figure}[htbp!] % Use figure* for wide figures spanning both columns
    \centering
    \begin{subfigure}[b]{0.98\textwidth} % First row, first column
        \includegraphics[width=\textwidth]{figs/2dduff_phatresult.pdf}
        \caption{$\hat{p}$ vs $p$}
    \end{subfigure}
    \vfill
    \begin{subfigure}[b]{0.98\textwidth} % First row, second column
        \includegraphics[width=\textwidth]{figs/2dduff_e1hatresult.pdf}
        \caption{$\hat{e}_1$ vs $e_1$}
    \end{subfigure}
    \caption{Trained PINNs $\hat{p}(x_1,x_2,t)$ and $\hat{e}_1(x_1,x_2,t)$ vs.\ true PDF $p(x_1,x_2,t)$ and error $e_1(x_1,x_2,t)$ for all $x_1,x_2$ at some $t$.}
    \label{fig:2dduff_results}
\end{figure}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=1.0\linewidth]{figs/2dduff_errorbound.pdf}
    \caption{$|e_1|$, $|\hat{e}_1|$, and $B_1$ at some $t$.}
    \label{fig:2dduff_errorbound}
\end{figure}

\clearpage
\subsection{3D Time-Varying OU Experiment}\label{appendix:3D-TVOU-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_3D-TVOU} and~\ref{tab:nne1hat_3D-TVOU}.
For training $\hat{p}$, we begin with $N_0=N_r=2000$ samples. Half of these samples are drawn uniformly, and the other half follow the normal distribution specified by the initial condition. During training, we gradually add samples using adaptive sampling.
For training $\hat{e}_1$, we begin with $N_0=N_r=300$ samples (with the same distributions as for training $\hat{p}$), and gradually add samples using adaptive sampling.
The weights for training both $\hat{p}$ and $\hat{e}_1$ are $w_0=1$, $w_r=|T|=1$, and $w_{\nabla}=0$ (i.e., without regularization).
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:3dtvou_trainloss}.
% Again, the periodic spikes in training loss are due to the adaptive sampling scheme that periodically adds space-time points, which becomes more effective as the system dimension grows.
The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_1$ are shown in Figs.~\ref{fig:3dtv_phatresult} and~\ref{fig:3dtv_e1hatresult}.
% The constructed tight error bounds at some time instances are visualized in Fig.~\ref{fig:2dpend_errorbound}.

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 32  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 32  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 32  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 32  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 32  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Output Layer  & Fully Connected & 1  & Softplus \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_3D-TVOU}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 32  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 32  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 32  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 32  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 32  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_3D-TVOU}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/3dtvou_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:3dtvou_trainloss}
\end{figure}

\begin{figure}[htbp!] % Use figure* for wide figures spanning both columns
    \centering
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/p_t0.2.pdf}
        \caption{$p(x,t=0.2)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/phat_t0.2.pdf}
        \caption{$\hat{p}(x,t=0.2)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/p_t0.6.pdf}
        \caption{$p(x,t=0.6)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/phat_t0.6.pdf}
        \caption{$\hat{p}(x,t=0.6)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/p_t1.0.pdf}
        \caption{$p(x,t=1.0)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/phat_t1.0.pdf}
        \caption{$\hat{p}(x,t=1.0)$}
    \end{subfigure}
    \caption{Trained PINNs $\hat{p}(x,t)$ vs.\ true PDF $p(x,t)$ for all $x$ at some $t$.}
    \label{fig:3dtv_phatresult}
\end{figure}

\begin{figure}[htbp!]
    \centering
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/e_t0.2.pdf}
        \caption{$e_1(x,t=0.2)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/ehat_t0.2.pdf}
        \caption{$\hat{e}_1(x,t=0.2)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/e_t0.6.pdf}
        \caption{$e_1(x,t=0.6)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/ehat_t0.6.pdf}
        \caption{$\hat{e}_1(x,t=0.6)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
    \includegraphics[width=\textwidth]{figs/3dtv/e_t1.0.pdf}
        \caption{$e_1(x,t=1.0)$}
    \end{subfigure}
    \begin{subfigure}[b]{0.40\textwidth} % 
        \includegraphics[width=\textwidth]{figs/3dtv/ehat_t1.0.pdf}
        \caption{$\hat{e}_1(x,t=1.0)$}
    \end{subfigure}
    \caption{Trained PINNs $\hat{e}_1(x,t)$ vs.\ true error $e_1(x,t)$ for all $x$ at some $t$.}
    \label{fig:3dtv_e1hatresult}
\end{figure}


\clearpage
\subsection{7D Time-Varying OU Experiment}\label{appendix:7D-TVOU-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_7D-TVOU} and~\ref{tab:nne1hat_7D-TVOU}.
For training $\hat{p}$, we begin with $N_0=N_r=2000$ samples. Half of these samples are drawn uniformly, and the other half follow the normal distribution specified by the initial condition. During training, we gradually add samples using adaptive sampling.
For training $\hat{e}_1$, we begin with $N_0=N_r=300$ samples (with the same distributions as for training $\hat{p}$), and gradually add samples using adaptive sampling.
The weights for training both $\hat{p}$ and $\hat{e}_1$ are $w_0=1$, $w_r=|T|=1$, and $w_{\nabla}=0$ (i.e., without regularization).
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:7dtvou_trainloss}.
% Again, the periodic spikes in training loss are due to the adaptive sampling scheme that periodically adds space-time points, which becomes more effective as the system dimension grows.
% The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_t$ are shown in Fig.~\ref{fig:2dpend_results}. 

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 32  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 32  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 32  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 32  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 32  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Output Layer  & Fully Connected & 1  & Softplus \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_7D-TVOU}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 32  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 32  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 32  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 32  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 32  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_7D-TVOU}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/7dtvou_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:7dtvou_trainloss}
\end{figure}


\clearpage
\subsection{10D Time-Varying OU Experiment}\label{appendix:10D-TVOU-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_10D-TVOU} and~\ref{tab:nne1hat_10D-TVOU}.
For training $\hat{p}$, we begin with $N_0=N_r=600$ samples. Half of these samples are drawn uniformly, and the other half follow the normal distribution specified by the initial condition. During training, we gradually add samples using adaptive sampling.
For training $\hat{e}_1$, we begin with $N_0=N_r=600$ samples (with the same distributions as for training $\hat{p}$), and gradually add samples using adaptive sampling.
The weights for training both $\hat{p}$ and $\hat{e}_1$ are $w_0=1$, $w_r=|T|=1$, and $w_{\nabla}=0$ (i.e., without regularization).
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:10dtvou_trainloss}.
% Again, the periodic spikes in training loss are due to the adaptive sampling scheme that periodically adds space-time points, which becomes more effective as the system dimension grows.
% The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_t$ are shown in Fig.~\ref{fig:2dpend_results}. 
% The constructed tight error bounds at some time instances are visualized in Fig.~\ref{fig:2dpend_errorbound}.

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 50  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 50  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 50  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 50  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 50  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Hidden Layer 6  & Fully Connected & 50  & GeLU \\
        Hidden Layer 6 $\rightarrow$ Output Layer  & Fully Connected & 1  & Softplus \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_10D-TVOU}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 50  & GeLU \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 50  & GeLU \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 50  & GeLU \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 50  & GeLU \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 50  & GeLU \\
        Hidden Layer 5 $\rightarrow$ Hidden Layer 6  & Fully Connected & 50  & GeLU \\
        Hidden Layer 6 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_10D-TVOU}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/10dtvou_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:10dtvou_trainloss}
\end{figure}


\subsection{1D Heat PDE Experiment}\label{appendix:1D-HEAT-exp}
The neural networks of $\hat{p}$ and $\hat{e}_1$ are summarized in Tables~\ref{tab:nnphat_1D-HEAT} and~\ref{tab:nne1hat_1D-HEAT}.
For each training iteration, $N_0=N_r=500$ space-time points are uniformly sampled as in Eq.~\ref{eq:pinn_loss_general}, with weights $w_0=w_{bc}=1$ and $w_r=|T|=1$, where $w_{bc}$ is the weight of the Dirichlet boundary condition loss described in Appendix~\ref{proof:1D_Heat}.
The training losses of $\hat{p}$ and $\hat{e}_1$ are illustrated in Fig.~\ref{fig:1dheat_trainloss}.
The training results of $\hat{p}$ vs $p$ and $\hat{e}_1$ vs $e_1$ are shown in Fig.~\ref{fig:1dheat_surfaces}, and the constructed error bound at some time instances is visualized in Fig.~\ref{fig:1dheat_errorboundsresult}.
% and the constructed (and synthesized) error bounds at some time instances are visualized in Fig.~\ref{fig:1dl_errorboundsresult}.

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 32  & Tanh \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 32  & Tanh \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 32  & Tanh \\
        Hidden Layer 3 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{p}$}
    \label{tab:nnphat_1D-HEAT}
\end{table}

\begin{table}[htbp]
    \centering
    \begin{tabular}{lccc}
        \toprule
        \textbf{Layer Connection} & \textbf{Type} & \textbf{\# Neurons (Output)} & \textbf{Activation Function} \\
        \midrule
        Input Layer $\rightarrow$ Hidden Layer 1  & Fully Connected & 50  & Tanh \\
        Hidden Layer 1 $\rightarrow$ Hidden Layer 2  & Fully Connected & 50  & Tanh \\
        Hidden Layer 2 $\rightarrow$ Hidden Layer 3  & Fully Connected & 50  & Tanh \\
        Hidden Layer 3 $\rightarrow$ Hidden Layer 4  & Fully Connected & 50  & Tanh \\
        Hidden Layer 4 $\rightarrow$ Hidden Layer 5  & Fully Connected & 50  & Tanh \\
        Hidden Layer 5 $\rightarrow$ Output Layer  & Fully Connected & 1  & N/A \\
        \bottomrule
    \end{tabular}
    \caption{Neural network architecture and hyper-parameters of $\hat{e}_1$}
    \label{tab:nne1hat_1D-HEAT}
\end{table}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/1dheat_trainloss.pdf}
    \caption{Training losses of $\hat{p}$ and $\hat{e}_1$}
    \label{fig:1dheat_trainloss}
\end{figure}

\begin{figure}[htbp!] % Use figure* for wide figures spanning both columns
    \centering
    \begin{subfigure}[b]{0.47\textwidth} % First row, first column
        \includegraphics[width=\textwidth]{figs/1dheat_phatsurface.pdf}
        \caption{$\hat{u}$ vs $u$}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.47\textwidth} % First row, second column
        \includegraphics[width=\textwidth]{figs/1dheat_e1hatsurface.pdf}
        \caption{$\hat{e}_1$ vs $e_1$}
    \end{subfigure}
    \caption{Trained PINNs $\hat{u}(x,t)$ and $\hat{e}_1(x,t)$ vs.\ true solution $u(x,t)$ and error $e_1(x,t)$ for all $x$ and $t$.}
    \label{fig:1dheat_surfaces}
\end{figure}

\begin{figure}[htbp!]
    \centering
    \includegraphics[width=0.7\linewidth]{figs/1dheat_errorbound.pdf}
    \caption{$u$, $\hat{u}$, and the error bound $B_1$ at some $t$.}
    \label{fig:1dheat_errorboundsresult}
\end{figure}


