%\documentclass{uai2023} % for initial submission
 \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


\usepackage[utf8]{inputenc} 
\usepackage[T1]{fontenc}    
\usepackage{hyperref}       
\usepackage{url}            
\usepackage{booktabs}       
\usepackage{amsfonts}       
\usepackage{nicefrac}       
\usepackage{microtype}      
\usepackage{algorithm}
\usepackage[noend]{algorithmic}

\usepackage{graphicx}
\usepackage{amsmath,amssymb}
\usepackage{amsthm}
\usepackage{dsfont}

%\usepackage[pdftex,dvipsnames]{xcolor}  %--> appears later already
\usepackage{wrapfig}
\usepackage{subfigure}
\usepackage{colortbl}
\usepackage{color}
\usepackage{xcolor} %--> appeared before already
\usepackage{multirow}

\usepackage{xargs}                      

%\usepackage[colorinlistoftodos,prependcaption,textsize=tiny,disable]{todonotes}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{reeb_297}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\newcommand{\MMD}{\ensuremath{\mathrm{MMD}}}
\newcommand{\COS}{\ensuremath{\mathrm{COS}}}
\newcommand{\R}{\ensuremath{{\mathbb R}}}
\newcommand*\rot{\rotatebox{90}}
\definecolor{verylightgray}{gray}{.75}
\definecolor{veryverylightgray}{gray}{.85}

\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\theoremstyle{definition}
\newtheorem{remark}{Remark}
\newtheorem{question}[theorem]{Question}


\title{Validation of Composite Systems by Discrepancy Propagation\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
%\author[1]{Harry~Q.~Bovik}
%\author[1,2]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
\author[1]{\href{mailto:david.reeb@de.bosch.com}{David~Reeb}{}}
\author[1]{Kanil~Patel}
\author[1]{Karim~Barsim}
\author[1]{Martin~Schiegg}
\author[1]{Sebastian~Gerwinn}
% Add affiliations after the authors
\affil[1]{%
	Bosch Center for Artificial Intelligence, Robert Bosch GmbH, 71272 Renningen, Germany
}
%\affil[1]{%
%    Computer Science Dept.\\
%    Cranberry University\\
%    Pittsburgh, Pennsylvania, USA
%}
%\affil[2]{%
%    Second Affiliation\\
%    Address\\
%    …
%}
%\affil[3]{%
%    Another Affiliation\\
%    Address\\
%    …
%  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix



\section{Semidefinite Relaxation of the Bound Optimization in Eq.\ (\ref{max-discrepancy-objective-alpha})}\label{app:semidefinite-relaxation}
%\section{SEMIDEFINITE RELAXATION OF THE BOUND OPTIMIZATION IN EQ.\ (\ref{max-discrepancy-objective-alpha})}

Let the discrepancy measures in Eq.\ (\ref{max-discrepancy-objective-alpha}) in Sec.\ \ref{subsec:validation-method} be given by MMD (maximum mean discrepancy) with kernels $k^{c\rightarrow c''}$ and $k^{c'\rightarrow c}$, respectively. 
When representing the distributions via samples, we have (see \citep{gretton2012kernel}):
\newcommand{\kout}[2]{\ensuremath{k^{c\rightarrow c''}\left(#1,#2\right) }}
\newcommand{\kin}[2]{\ensuremath{k^{c'\rightarrow c}\left(#1,#2\right) }}
\newcommand{\Kout}{\ensuremath{{\bf{K}}_{c\rightarrow c''}}}
\newcommand{\Kin}{\ensuremath{{\bf{K}}_{c'\rightarrow c}}}
\newcommand{\en}[1]{\ensuremath{{\bf{e}}_{#1}}}
\begin{align}
D(p_\alpha|_{c\rightarrow c''}, q|_{c\rightarrow c''})^2 & = \alpha ^\top \Kout^{VV}\alpha  - 2\alpha^\top \Kout^{VM}\frac{\en{n_M}}{n_M} + \frac{\en{n_M}^\top}{n_M} \Kout^{MM}\frac{\en{n_M}}{n_M}
\end{align}
with kernel matrices (with indices $v,v'=1,\ldots,V^c$, $n,n'=1,\ldots,n_M$)
\begin{align}
\left(\Kout^{VV}\right)_{v,v'} & := \kout{y_v^c}{ y_{v'}^c},\nonumber\\
\left(\Kout^{VM}\right)_{v,n'} & := \kout{y_v^c}{ y_{n'}^{M^c}},\nonumber\\
\left(\Kout^{MM}\right)_{n,n'} & := \kout{y_{n}^{M^c}}{ y_{n'}^{M^c}},\nonumber
\end{align}
and where $\en{d}:=(1,\dots,1)^\top\in{\mathbb R}^d$ denotes the $d$-dimensional all-1's vector, so that $\frac{\en{d}}{d}$ is the uniform probability vector on $d$ elements. Similarly, at the input of component $c$ we have
\begin{align}
D(p_\alpha|_{c'\rightarrow c}, q|_{c'\rightarrow c})^2 & = \alpha ^\top \Kin^{VV}\alpha  - 2\alpha^\top \Kin^{VM}\frac{\en{n_M}}{n_M} + \frac{\en{n_M}^\top}{n_M} \Kin^{MM}\frac{\en{n_M}}{n_M},
\end{align}
where by a slight abuse of notation we define the input kernel matrices as
\begin{align}
\left(\Kin^{VV}\right)_{v,v'} & := \kin{x_v^c}{ x_{v'}^c},\nonumber\\
\left(\Kin^{VM}\right)_{v,n'} & := \kin{x_v^c}{ x_{n'}^{M^c}},\nonumber\\
\left(\Kin^{MM}\right)_{n,n'} & := \kin{x_{n}^{M^c}}{ x_{n'}^{M^c}}.\nonumber
\end{align}




Taken together, we can write (\ref{max-discrepancy-objective-alpha}) in Sec.\ \ref{subsec:validation-method} -- or rather its square -- as the following quadratic optimization problem:
\begin{align}
&(B^{c\rightarrow c''})^2\\
&=\sup_{\{p_\alpha:D(p_\alpha|_{c'\rightarrow c},q|_{c'\rightarrow c})^2\leq(B^{c'\rightarrow c})^2~~ \forall c'< c\}}D(p_\alpha|_{c\rightarrow c''}, q|_{c\rightarrow c''})^2\\
&=\text{maximize}_\alpha ~\alpha^\top \Kout^{VV}\alpha -2\alpha^\top \Kout^{VM}\frac{\en{n_M}}{n_M}+\frac{\en{n_M}^\top}{n_M} \Kout^{MM}\frac{\en{n_M}}{n_M} \label{eq-no-triang:app-first-term-objective}\\
& \qquad\text{subject to }~\alpha^\top \Kin^{VV}\alpha-2\alpha^\top \Kin^{VM}\frac{\en{n_M}}{n_M}+\frac{\en{n_M}^\top}{n_M} \Kin^{MM}\frac{\en{n_M}}{n_M}\leq(B^{c'\rightarrow c})^2 \quad\forall c'<c\label{eq-no-traing:app-first-term-first-constraint},\\
&\qquad\qquad\qquad~~\en{V^c}^\top\alpha=1,\label{eq-no-triang:equality-constraint}\\
&\qquad\qquad\qquad~~\alpha\geq0,\label{eq-no-traing:app-first-term-last-constraint}
\end{align}
where the vector constraint $\alpha\geq0$ is understood entry-wise.

Unfortunately, the optimization problem (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}) is \emph{not} a convex optimization problem because the objective is to maximize a convex function. While heuristic solvers are available, such as the package \texttt{qcqp} \citep{park2017general}, we avoid those heuristic methods, as they do not guarantee a valid upper bound and can be inefficient in computation.

Instead we follow approaches more tailored to the problem, and relax the original problem (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}) to obtain an efficiently solvable semidefinite program (SDP), a type of convex optimization problem. 
We follow the ``tightened semidefinite relaxations'' from \citet[Secs.\ 3.3, 3.4]{park2017general}.

For this, we introduce the (symmetric) matrix variable $A:=\alpha\alpha^\top$ and rewrite the quadratic terms in (\ref{eq-no-triang:app-first-term-objective}) and (\ref{eq-no-traing:app-first-term-first-constraint}) via the matrix traces $\alpha^\top \Kout^{VV}\alpha=\mathrm{Tr}[\Kout^{VV}A]$ and $\alpha^\top \Kin^{VV}\alpha=\mathrm{Tr}[\Kin^{VV}A]$, so that they are now \emph{linear} in the variable $A$. 
Due to (\ref{eq-no-traing:app-first-term-last-constraint}), $A=\alpha\alpha^\top$ is entry-wise nonnegative, which we write as $A\succcurlyeq0$; furthermore, it holds that $\en{V^c}^\top A\en{V^c}=1$ due to (\ref{eq-no-triang:equality-constraint}). 
Also due to (\ref{eq-no-triang:equality-constraint}), we can even recover $\alpha$ from $A=\alpha\alpha^\top$ via $\alpha=A\en{V^c}$. 
Therefore, we will omit our original variable $\alpha$ in favor of the matrix variable $A$ and simply \emph{define} the expression $\alpha:=A\en{V^c}$. 
Rewriting (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}) in terms of $A$ and additionally adding the constraints $A=\alpha\alpha^\top$, $\en{V^c}^\top A\en{V^c}=1$ and $A\succcurlyeq0$ gives the same optimum as (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}). 
The resulting (rewritten) problem is convex except for the equality constraint $A=\alpha\alpha^\top$. 
We finally relax this constraint to the matrix inequality $A\geq\alpha\alpha^\top$ (where the inequality is understood w.r.t.\ the positive semidefinite order), which \emph{is} convex. 
Writing this matrix inequality in the manifestly convex way (\ref{eq:tightened-SDR-relaxation-A-alpha}) as a semidefinite constraint \citep{park2017general}, we therefore obtain:
\begin{lemma}[Tightened SDP relaxation of (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint})]\label{lem:SDR-relaxation}
	Define the tightened SDP relaxation of (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}) as the following semidefinite optimization problem (SDP) with symmetric matrix variable $A=A^\top\in{\mathbb R}^{V^c\times V^c}$ and the abbreviation $\alpha:=A\en{V^c}$: 
	\begin{align}
	(B^{c\rightarrow c''}_\text{SDR-tightened})^2\quad:=\\
	~\text{maximize}_A~~~~&\mathrm{Tr}[\Kout^{VV}A]-2\alpha^\top \Kout^{VM}\frac{\en{n_M}}{n_M}+ \frac{\en{n_M}^\top}{n_M} \Kout^{MM}\frac{\en{n_M}}{n_M}\label{eq:tightened-SDR-objective}\\
	\text{subject to}~~~~&\mathrm{Tr}[\Kin^{VV}A]-2\alpha^\top \Kin^{VM}\frac{\en{n_M}}{n_M}+\frac{\en{n_M}^\top}{n_M} \Kin^{MM}\frac{\en{n_M}}{n_M}\leq (B^{c'\rightarrow c})^2 ~~~\forall c'<c, \label{eq:tightened-SDR-quadratic-constraint}\\
	&\left(\begin{array}{cc}A&\alpha\\\alpha^\top&1\end{array}\right)\geq0~~\text{(i.e.\ the left-hand-side is a positive semidefinite matrix)},\label{eq:tightened-SDR-relaxation-A-alpha}\\
	&\en{V^c}^\top A\en{V^c}=1,\label{eq:tightened-SDR-matrix-equality-constraint}\\
	&A\succcurlyeq0~~\text{(entry-wise; above the diagonal suffices due to constraint (\ref{eq:tightened-SDR-relaxation-A-alpha}) and $A=A^\top$)}.\label{eq:tightened-SDR-last-tightening-constraint}
	\end{align}
	Then its optimal value satisfies $(B^{c\rightarrow c''}_\text{SDR-tightened})^2\geq(B^{c\rightarrow c''})^2$, i.e.\ $(B^{c\rightarrow c''}_\text{SDR-tightened})^2$ is an upper bound on the optimum of the non-convex problem (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}), which itself is (the square of) an upper bound on the (unknown) MMD discrepancy $D(p|_{c\to c''},q|_{c\to c''})$ (cf.\ Eq.\ (\ref{max-discrepancy-objective-alpha}) in Sec.\ \ref{subsec:validation-method}).
\end{lemma}
To solve the relaxed semidefinite programs from Lemma \ref{lem:SDR-relaxation}, we use the library \texttt{CVXPY} \citep{diamond2016cvxpy}. 
Note that due to the relaxation, the value $(B^{c\rightarrow c''}_\text{SDR-tightened})^2$ of the relaxed optimization from Lemma \ref{lem:SDR-relaxation} is in general different (larger, i.e.\ worse) than the $(B^{c\rightarrow c''})^2$ from the original optimization (\ref{eq-no-triang:app-first-term-objective}); 
this can happen if the found optimum $\widehat{A}$ of the relaxed problem in Lemma\ \ref{lem:SDR-relaxation} cannot be expressed as $\widehat{A}=\widehat{\alpha}\widehat{\alpha}^\top$ (i.e.\ the optimal $\widehat{A}$ is not of rank 1). 
To detect such a \emph{relaxation gap}, one can plug the found vector $\widehat{\alpha} :=\widehat{A}\en{V^c}$ into (\ref{eq-no-triang:app-first-term-objective}) and compare its value $\text{Opt}_{\text{orig}}(\widehat{\alpha})$ to the optimal value $\text{Opt}_{\text{relax}}(\widehat{A})=(B^{c\rightarrow c''}_\text{SDR-tightened})^2$ of (\ref{eq:tightened-SDR-objective}). 
Then, by Lemma \ref{lem:SDR-relaxation} and the maximization (\ref{eq-no-triang:app-first-term-objective}), it holds that the true optimum $(B^{c\rightarrow c''})^2$ of the nonconvex problem (\ref{eq-no-triang:app-first-term-objective}) is sandwiched between two convexly computable quantities:
\begin{align}
\text{Opt}_{\text{orig}}(\widehat{\alpha})~\leq~(B^{c\rightarrow c''})^2~\leq~\text{Opt}_{\text{relax}}(\widehat{A})=(B^{c\rightarrow c''}_\text{SDR-tightened})^2.
\end{align}
When both values agree, $\text{Opt}_{\text{orig}}(\widehat{\alpha})=\text{Opt}_{\text{relax}}(\widehat{A})$, then the relaxation was \emph{tight}, i.e.\ we have certified that $(B^{c\rightarrow c''})^2=(B^{c\rightarrow c''}_\text{SDR-tightened})^2=\text{Opt}_{\text{relax}}(\widehat{A})$ are equal and our relaxation has found the true optimum. 
More generally, we can guarantee a \emph{relaxation gap} $\Delta=(B^{c\rightarrow c''}_\text{SDR-tightened})^2-(B^{c\rightarrow c''})^2$ of at most $\widehat{\Delta}=\text{Opt}_{\text{relax}}(\widehat{A})-\text{Opt}_{\text{orig}}(\widehat{\alpha})$, and an approximation ratio $\gamma=(B^{c\rightarrow c''})^2/(B^{c\rightarrow c''}_\text{SDR-tightened})^2\in[0,1]$ of at least $\widehat{\gamma}=\text{Opt}_{\text{orig}}(\widehat{\alpha})/\text{Opt}_{\text{relax}}(\widehat{A})\in[0,1]$.

Empirically we find in this way that the relaxation in Lemma \ref{lem:SDR-relaxation} is basically tight in most problem instances, i.e.\ it returns (almost) the correct optimum of (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}). See App.\ \ref{sec:empirical-tightness} for an empirical evaluation.

With this relaxation, the number of optimization variables increases from $\text{dim}(\alpha)=V^c$ in (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}) to $\text{dim}(A=A^\top)=V^c(V^c+1)/2\sim(V^c)^2/2$ in (\ref{eq:tightened-SDR-objective}--\ref{eq:tightened-SDR-last-tightening-constraint}), i.e.\ it grows quadratically with the number of validation points $V^c$ for component $c$; the number of optimization constraints also increases like $\sim C+(V^c)^2$. This results in a computational limitation which restricts the number of validation inputs to roughly $V^c\lesssim10^3$ with standard convex solvers on standard computing hardware. 
If one would like to apply our general validation method with more validation points $V^c$, one would have to find another way to efficiently upper-bound the optimization problem (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}), instead of our tightened SDP relaxation (Lemma \ref{lem:SDR-relaxation}).

To run the whole validation method in Algorithm \ref{algorithm:DPBound}, the number of bound optimizations (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}) or (\ref{eq:tightened-SDR-objective}--\ref{eq:tightened-SDR-last-tightening-constraint}) to perform equals the number of connections $c\to c''$ between components of the system (where $1\leq c<c''\leq C+1$, see Sec.\ \ref{sec:setup}), i.e.\ the number of such bound computations lies between $C$ (for the linear chain) and $C(C+1)/2$ (for the ``fully connected'' system). See App.\ \ref{sec:runtime} for actual runtimes of our method.



%\section{DETAILS ON THE FAILURE PROBABILITY OPTIMIZATION IN EQ.\ (\ref{max-failure-objective-alpha})}
\section{Details on the Failure Probability Optimization in Eq.\ (\ref{max-failure-objective-alpha})}\label{app:violation-optimization}
Similar to (\ref{eq-no-triang:app-first-term-objective}--\ref{eq-no-traing:app-first-term-last-constraint}), we can formulate a sample-based optimization for the optimization problem in Eq.\ (\ref{max-failure-objective-alpha}) in Sec.\ \ref{subsec:validation-method} in the case of using MMD as the discrepancy measure, to compute an upper bound $F_\text{max}$ on the system failure probability $p_\text{fail}$. 
More precisely, assuming that this MMD measure on the TPI output $y^C$ has kernel $k^{y}\equiv k^{C\to C+1}$, we can write this problem (optionally with the monotonicity and Lipschitz conditions mentioned below Eq.\ (\ref{max-failure-objective-alpha}) in Sec.\ \ref{subsec:validation-method}) with the optimization variable $\alpha\in{\mathbb R}^V$:

\newcommand{\Kyy}{\ensuremath{{\bf{K}}_y}}
\begin{align}
F_{\text{max}} & = \text{maximize}_\alpha  \sum_{v: g_v>\tau} \alpha_v\label{eq:violation_objective}\\
& \qquad \text{subject to }~ \alpha ^\top \Kyy^{VV}\alpha  - 2\alpha^\top \Kyy^{VM}\frac{\en{n_M}}{n_M} + \frac{\en{n_M}^\top}{n_M} \Kyy^{MM}\frac{\en{n_M}}{n_M} \leq (B^y)^2, \label{eq:violation_mmd_constraint}\\
& \qquad\qquad\qquad~~\alpha\geq0, \qquad\en{V}^\top\alpha =1,\\
&\qquad\qquad\qquad~~\alpha_v \leq \alpha_{v-1}\quad \forall v\text{  with  }g_v\geq \tau'\quad\text{(monotonicity; we take $\tau':=\tau$)},\label{eq:violation_monotonicity_constraint}\\
& \qquad\qquad\qquad~~|\alpha_{v+1}-\alpha_v|\leq\Lambda_\text{max}|g_{v+1}-g_v|\quad \forall v\quad\text{(Lipschitz condition)},\label{eq:violation_lipschitz_constraint}
\end{align}
where we defined kernel matrices on the TPI grid-points $g_v$ and simulation outputs $y^M_n$:
\begin{align}
&\qquad\qquad (\Kyy^{VV})_{v,v'}  = k^y\left(g_v, g_{v'}\right),\nonumber\\
&\qquad\qquad (\Kyy^{VM})_{v,n'}  = k^y\left(g_v, y^M_{n'}\right),\nonumber\\
& \qquad\qquad(\Kyy^{MM})_{n,n'}  = k^y\left(y^M_n, y^M_{n'}\right).\nonumber
\end{align}

The optimization problem (\ref{eq:violation_objective}--\ref{eq:violation_lipschitz_constraint}) has a linear objective and linear constraints except for the quadratic MMD constraint (\ref{eq:violation_mmd_constraint}). 
It is thus a convex optimization problem that can be solved efficiently and exactly, without relaxations (unlike required for the bound optimization (\ref{max-discrepancy-objective-alpha}) in Sec.\ \ref{subsec:validation-method}, see App.\ \ref{app:semidefinite-relaxation}).

The above formulation depends on a set of grid-points $g_v$ and a Lipschitz constant $\Lambda_{\text{max}}$. Ideally, the $\Lambda_\text{max}$ should be a tight upper bound on the Lipschitz constant of the system's TPI output density $p_y$. 
As we do not know this density $p_y$, we use a heuristic estimator of $\Lambda_\text{max}$ computed from histograms of the simulation output distribution $q_y$ as a proxy (see experimental details in App.\ \ref{sec:deta-exper-sect}). 
In order for the solution of (\ref{eq:violation_objective}--\ref{eq:violation_lipschitz_constraint}) to be close to its true value (\ref{max-failure-objective-alpha}) in Sec.\ \ref{subsec:validation-method}, the grid-points need to be sufficiently dense in order to reveal differences as measured by the MMD constraint (\ref{eq:violation_mmd_constraint}) (see also App.~\ref{app:proof-proposition}). 
As the MMD measure $D(p_y,q_y)$ with kernel $k^y$ can also be interpreted as the L2-distance between the corresponding kernel density estimators of the samples from $p_y$ and $q_y$ \citep{gretton2012kernel}, we choose the grid-spacing relative to the lengthscale $\ell$ of the kernel $k^y$; more precisely, in our experiments we require that $g_{v+1}-g_v\leq \frac{\ell}{5}$. 
Additionally, the grid-points should cover the range where the support of both $p_y$ and $q_y$ lies (although $p_y$ is not known). 
For our experiments, we chose the grid range $[g_1,g_V]$ such that it contains all data-points from $q_y$, as well as a significantly large region around the threshold $\tau$ (see App.\ \ref{sec:deta-exper-sect}). 
Even though the system's TPI output distribution $p_y$ is not known, in many applications the plausible (or even the potential) range of TPI values $y^C$ will typically be known from domain knowledge; 
in this case, the grid endpoints should be chosen to cover this range.

Note that the optimization for the failure probability in Eq.\ (\ref{max-failure-objective-alpha}) in Sec.\ \ref{subsec:validation-method} or in Eqs.\ (\ref{eq:violation_objective}--\ref{eq:violation_lipschitz_constraint}) is also possible for higher-dimensional TPI quantities $y^C$. 
In this case, a specification $y^C\in \mathcal{TPI}_\text{fail}$ of the critical region is required (replacing the specification $y^C>\tau$ in (\ref{eq:violation_objective})).  
Even if this specification is non-linear, the optimization objective will remain linear and the constraints quadratic, again by choosing a grid on the TPI space as in the one-dimensional case.

By construction, the final solution $F_{\text{max}}$ (computed via (\ref{max-discrepancy-objective-alpha}),(\ref{max-failure-objective-alpha}) in Sec.\ \ref{subsec:validation-method}, or more concretely via (\ref{eq:tightened-SDR-objective}--\ref{eq:tightened-SDR-last-tightening-constraint}),(\ref{eq:violation_objective}--\ref{eq:violation_lipschitz_constraint})) is an upper bound on the system's true failure probability $F_{\text{max}} \geq p_{\text{fail}}=\int \mathds{1}_{y>\tau}dp_y(y)=\int \mathds{1}_{S(x)>\tau}dS(x)dp_x(x)$  (see Prop.\ \ref{prop:convergence} in Sec.\ \ref{subsec:validation-method} and also App.\ \ref{app:proof-proposition}); 
as a result it can be used for virtual system validation. 
This bound $F_{\text{max}}$ remains valid for any discrepancy measure, choice of kernels or lengthscales. 
For example, when choosing MMD with a very large kernel lengthscale as discrepancy measure, its discriminative power is minimal, resulting in very small discrepancy values and hence small corresponding bounds $B^{c\to c''}$. 
However, for such a measure, it is also difficult to distinguish $p_\alpha$ from the given $q|_{c\to c''}$ in a later bound propagation step, counteracting the small obtained $B^{c\to c''}$ and potentially resulting in a larger final $F_{\text{max}}$. 
We, therefore, use the final $F_{\text{max}}$ as the minimization objective in a Bayesian Optimization scheme \citep{frohlich2020noisy} to select the kernel parameters, see also App.\ \ref{sec:deta-exper-sect}. Another option to select good kernel parameters would be gradient-based minimization of $F_{\text{max}}$ w.r.t.\ the kernel parameters, which is possible in our framework as the convex programs from Apps.\ \ref{app:semidefinite-relaxation} and \ref{app:violation-optimization} can be (automatically) differentiated \citep{agrawal2019differentiable}.





%\section{PROOF AND EXTENSIONS OF PROP.\ \ref{prop:convergence}}
\section{Proof and Extensions of Prop.\ \ref{prop:convergence} (Sec.\ \ref{subsec:validation-method})}\label{app:proof-proposition}
\begin{proof}[Proof of Prop.\ \ref{prop:convergence}]
	Under the assumptions {\it{(i)}} and {\it{(ii)}} that the set of validation inputs $\{x^c_v\}_v$ contains  \emph{all} actually occurring input points into $S^c$ and that $p_\alpha=\sum_v\alpha_v\delta_{x^c_v}S^c(x^c_v)$ is built with the \emph{correct} outputs $S^c(x^c_v)$, this set of distributions $p_\alpha$ over which we optimize in (\ref{max-discrepancy-objective-alpha}) (see Sec.\ \ref{subsec:validation-method}) contains the joint real-world distribution of in- and outputs of $S^c$ (i.e.\ the marginal of the real-world distribution $p$ capturing the joint in- and outputs of $S^c$). Thus, if the input bound values $B^{c'\to c}$ were true upper bounds on the actual $D(p|_{c'\to c},q|_{c'\to c})$, then $B^{c\to c''}$ from (\ref{max-discrepancy-objective-alpha}) in Sec.\ \ref{subsec:validation-method} is also a true upper bound on the actual $D(p|_{c\to c''},q|_{c\to c''})$. 
	By induction on $c=1,2,\ldots,C$ we can thus conclude that $B^y\equiv B^{C\to C+1}$ is a true upper bound on the real-world discrepancy $D(p_y,q_y)\equiv D(p|_{C\to C+1},q|_{C\to C+1})$ if only the initial bound values $B^{0\to c}$ were true upper bounds on the actual initial discrepancies $D(p|_{0\to c},q|_{0\to c})$; 
	we assume this last statement about the initial bound values $B^{0\to c}$ to be true since they are supposed to be given in that way (alternatively, the same statement can be concluded with high confidence $\geq1-\delta$ if the $B^{0\to c}$ were computed via samples from $p_x$ \citep{gretton2012kernel}; see end of Sec.\ \ref{sec:setup}). 
	Finally, by the same reasoning, under the assumption {\it{(iii)}} that the set $\{g_v\}_v$ of grid-points contains \emph{all} occurring real-world TPI values, the optimization (\ref{max-failure-objective-alpha}) from Sec.\ \ref{subsec:validation-method} translates the true upper bound $B^y$ into a true upper bound $F_\text{max}$ on the real-world failure probability $p_\text{fail}$.
\end{proof}

Note that further upper-bounding the optimizations (\ref{max-discrepancy-objective-alpha}),(\ref{max-failure-objective-alpha}) from Sec.\ \ref{subsec:validation-method} by relaxations as in App.\ \ref{app:semidefinite-relaxation} leads to valid upper bounds $F_\text{max}$ as well, by the same reasoning as in the above proof.

The assumptions {\it{(i)}} and {\it{(ii)}} of Prop.\ \ref{prop:convergence} are so strong that one basically knows all system maps $S^c$ \emph{explicitly}, at least on all those input points that occur in the real world. 
If one would, in addition, know the real-world input distribution $p_x$ (e.g.\ in a sample-based way), one could (in theory) simulate the system map $S$ on all those samples and compute (or at least estimate) the real-world TPI distribution $p_y$ by Monte-Carlo sampling, and thus determine the desired $p_\text{fail}$ arbitrarily well. 
However, we do \emph{not} need the input distribution $p_x$ to be known explicitly for our proposed method to be applicable; and we apply our method even when the strong knowledge about the $S^c$ implied by {\it{(i)}} and {\it{(ii)}} is \emph{not} available.

\begin{remark}[Upper bounds in the limit]
	Beyond the strong assumptions of Prop.\ \ref{prop:convergence}, the upper bounds obtained by our method (\ref{max-discrepancy-objective-alpha}),(\ref{max-failure-objective-alpha}) (see Sec.\ \ref{subsec:validation-method}) can be proven to be valid under weaker, more realistic assumptions. 
	This appears possible, for example, in the following scenario, as the numbers $V^c\equiv V$ of available validation data points grow: 
	\emph{(a)} The set of validation inputs $\{x^c_v\}_{v=1}^V$ covers the input space $\mathcal{S}^c_{in}\subset{\mathbb R}^{d^c_{in}}$ of $S^c$ increasingly densely as $V\to \infty$, e.g.\ in the sense that $\max_{x\in{\mathcal S}^c_{in}}\min_{v\in\{1,\ldots,V\}}\|x-x^c_v\|\ \longrightarrow\ 0\ \text{as}\ V\to\infty$; with an analogous condition for the set of grid-points $\{g_v\}_{v=1}^V$ used in (\ref{max-failure-objective-alpha}) from Sec.\ \ref{subsec:validation-method}. 
	\emph{(b)} As discrepancy measures $D^{c'\to c}$ we use MMD distances w.r.t.\ continuous and bounded kernel functions $k^{c'\to c}$. This is satisfied by all kernels used here, such as the squared-exponential and IMQ kernels, even when applied after a data embedding \citep{gretton2012kernel}.
	\emph{(c)} The real-world subsystem maps $S^c:x^c\mapsto S^c(x^c)$ (which are not known explicitly, and whose output is a probability distribution in general) are continuous w.r.t.\ the discrepancy measures $D^{c\to c''}$ at their output. This condition would be implied by continuity w.r.t.\ the Wasserstein or the total variation distances and might in some cases be argued from physical considerations. For deterministic components $S^c$, this requirement simply means that the deterministic mapping is continuous. 
	\emph{(d)} For each validation input $x^c_v$ we have measured a sufficient number $W$ of i.i.d.\ samples $y^c_{v,w}\sim S^c(x^c_v)$ ($w=1,\ldots,W$) from the \emph{true but unknown} output distribution $S^c(x^c_v)$ and we use the sample-based $p_\alpha:=\sum_{v=1}^V\sum_{w=1}^W\frac{\alpha_v}{W}\delta_{x^c_v}\delta_{y^c_{v,w}}$ in the optimization (\ref{max-discrepancy-objective-alpha}) from Sec.\ \ref{subsec:validation-method}. We need $W=W(V)$ to be large enough that the empirical estimate $(1/W)\sum_{w=1}^W\delta_{y^c_{v,w}}$ is sufficiently close to $S^c(x^c_v)$ in the kernel mean embedding \citep{muandet2017KME}, instead of the exact equality required by Prop.\ \ref{prop:convergence}. 
	\emph{(e)} The initial $B^{0\to c}$ must either be true upper bounds on the actual $D(p|_{0\to c},q|_{0\to c})$, or must be computed empirically from an increasing number of i.i.d.\ samples $\{x_v\}_{v=1}^V$ coming from the real-world distribution $p_x$ \citep{gretton2012kernel} (see end of Sec.\ \ref{sec:setup}).
	
	Even under such more realistic circumstances, one may still obtain a  provably valid upper bound $F_\text{max}$ on the real-world failure probability $p_\text{fail}$ via our method in the limit $V\to\infty$ of sufficiently many validation points, at least almost surely over the sampling of $y^c_{v,w}$ (and $x_v$ in the case where $B^{0\to c}$ are estimated from those samples) and up to any additive $\varepsilon>0$ chosen beforehand. 
	This can be shown by following the steps of the proof of Prop.\ \ref{prop:convergence}, replacing each valid upper bound or exact equality by an approximation or limiting argument, using the above assumptions. 
	Even more, when e.g.\ the kernels $k^{c'\to c}$ as well as Lipschitz constants of the maps $S^c$ are known, then more effective statements can be obtained, in the sense that $V$ can then be related to $\varepsilon$ and to the confidence in the final $F_\text{max}$ being a valid bound. 
	
	However, even such more realistic convergence statements would still not be practical in all cases, since e.g.\ the assumption \emph{(a)} generally requires a number of validation inputs $V\gtrsim (1/\delta)^{d^c_{in}}$ \emph{exponential} in the dimensions of the input spaces of the $S^c$ (where $\delta= \max_{x\in{\mathcal S}^c_{in}}\min_{v\in\{1,\ldots,V\}}\|x-x^c_v\|$ denotes the desired set approximation accuracy). 
	We, therefore, take a pragmatic viewpoint, in that we apply our method (\ref{max-discrepancy-objective-alpha}),(\ref{max-failure-objective-alpha}) (Sec.\ \ref{subsec:validation-method}; see also Algorithm \ref{algorithm:DPBound}) even in those cases where we only have a limited amount of validation data available. We evaluate this empirically in Sec.\ \ref{sec:reli-benchm-eval}. 
	%We evaluate empirically whether and when our computed bounds are valid in Sec.\ \ref{sec:reli-benchm-eval}.
\end{remark}


\begin{remark}[Arbitrary simulation]
	For the justification of our method -- either heuristically or rigorously as above -- it is \emph{not} necessary that the ``simulation distribution'' $q$ be in any sense close to the true system behavior or that the simulations $M^c$ be faithful approximations of the actual system components $S^c$. 
	Rather, it suffices that each of the distributions $q|_{\tilde{c}\to\hat{c}}$ (i.e.\ one distribution for each pair $(\tilde{c},\hat{c})$ with $0\leq\tilde{c}<\hat{c}\leq C+1$) has the same value in each of the optimizations (\ref{max-discrepancy-objective-alpha}) and (\ref{max-failure-objective-alpha}) (Sec.\ \ref{subsec:validation-method}); 
	each of these distributions $q|_{\tilde{c}\to\hat{c}}$ simply acts as an ``anchor'' with respect to which the (unknown) real world distribution $p|_{\tilde{c}\to\hat{c}}$ is assessed -- and these anchors need to remain fixed. 
	This means that e.g.\ the full joint distribution $q$ could have been generated by starting from an arbitrarily chosen $q_x$ and arbitrarily ``bad'' models $M^c$, and that the resulting $F_\text{max}$ should remain an upper bound on $p_\text{fail}$ regardless.
	In practice we nevertheless desire the simulations $M^c$ to be close to $S^c$ as we intuitively believe that such closeness should lead to stronger statements about the system via the simulations, i.e.\ that the upper bound $F_ \text{max}$ be as good (i.e.\ small or tight) as possible. 
	We investigate this dependency in the experiments (Sec.\ \ref{sec:experiments}) via the comparison of Perfect Model vs.\ Misfit Model.
\end{remark}




%\section{DETAILS ON EXPERIMENTS (SEC.\ \ref{sec:experiments})}
\section{Details on Experiments (Sec.\ \ref{sec:experiments})}\label{sec:deta-exper-sect}


This section will provide all experiment details to reproduce the results shown in Sec.~\ref{sec:reli-benchm-eval}.
It will contain the description of the reliability benchmark problems, the (input) data generation process, the per-channel/signal kernels and their length scales, the construction of the biased input (i.e.\ Biased Input), the misfit model (i.e.\ Misfit Model) and hyper-parameters of the failure probability computation.
Most settings are kept fixed across all benchmark problems (described in Sec.~\ref{sec:fix_settings}) and only the biased input construction and kernel parameters vary for each problem (described in Sec.~\ref{sec:per_use_case_settings}).

\subsection{Fixed Settings}
\label{sec:fix_settings}
%Across all experiments, the following settings are kept fixed in order to compare the validation performance across all different problems.

Across all experiments, the following settings are kept fixed in order to be able to compare the validation performances.


\textbf{Data}:
For all subsystems $S^c$ and sub-models $M^c$ ($c=1,2,...,C$) we fix the number of samples to $V_c=100$ and $n_M=500$, respectively.
All data is generated by a fixed base seed of $2349$.\\
\textbf{Trials}:
All experiments report the mean and standard deviation across $5$ independent trials where all settings are kept identical except the base seed; this is incremented by the 0-index id of each trial.\\
\textbf{Failure probability threshold}:
The threshold for each validation problem is set such that the ground truth failure probability of the (TPI) output of the last subsystem $S^C$ is at approximately $1\%$; $1$ million samples are used to determine the threshold.
The goal of all validation methods across all benchmark problems is to achieve a failure probability as close to $1.0\%$ as possible.
It should be noted that in a real-world system, the ground truth failure probability is not available or is poorly approximated through limited samples.\\
\textbf{Failure probability grid}:
The grid of the failure probability optimization is based on the (TPI) output of the last sub-model $M^C$.
It is chosen to cover the entire support of the model output distribution and a significantly large grid region after the threshold $\tau$. \\
\textbf{Failure probability Lipschitz constant}:
Ideally, the Lipschitz-constant should be set according to the smoothness (or Lipschitz constant) of the system's TPI distribution.
As this is not available, we used the empirical histogram distribution of simulated TPI outputs as a proxy.
More precisely, for each problem a histogram with $100$ bins is constructed to compute the resulting empirical Lipschitz-constant. \\
\textbf{Model misfit -- Gaussian process}:
For the cases where we artificially introduce a model misfit in sub-models $M^c$ ($c=1,2,...,C$) (i.e. ``Model Misfit'' in main Tab.\ \ref{tab:results_benchmark}) we learn a Gaussian process (GP) for each sub-model $M^c$ to model the input-output mapping of the corresponding subsystem $S^c$.
The goal is to introduce a misfit in a controllable fashion which reflects realistic misfits between real-world systems and simulations.
We generate $100$ independent training samples for learning the GP (i.e. the training data is not used to solve the validation problem).
We use exact GP inference in \texttt{GPyTorch} \citep{gardner2018gpytorch} with Radial basis function (RBF) kernels.
The \texttt{LBFGS-scipy} optimizer is used with a learning rate of $1.0\mathrm{e}{-3}$ for $2000$ iterations and $10$ restarts with different initializations.
All misfit models use the exact same GP settings, therefore the amount of misfit varies across benchmark problems and their individual components. \\
\textbf{Biased input}:
In order to test the limits of the system and validation methods, we specifically use a biased (model) input $q_{\text{biased}-x}$ for each benchmark problem.
This bias was constructed such that the TPI model output distribution is shifted farther away from the threshold.
Such a bias distribution exploits the weakness of some validation methods (e.g. MCCP and SurrModel) which do not take the input discrepancy between $p_x$ and $q_x$ into account.
As a result, using the biased input distribution $q_x$ for the model input yields overly optimistic estimates with a severe underestimation of the failure probability (i.e.\ $F_{\text{max}} < F_{\text{GT}} = 1.0\%$). \\
\textbf{Kernels}:
All experiments either use a radial basis function (RBF) (squared-exponential) kernel or an inverse multiquadric (IMQ) (also known as rational quadratic) kernel~\citep{gorham2017measuring}.
Both kernels use a jitter of $1\mathrm{e}{-10}$, and the IMQ kernel has a fixed $\alpha=-0.5$.
The length scales for both kernels differ for each benchmark problem and channel/signal dimension. \\
\textbf{Length scale search}:
For a given benchmark problem, all length scales across all channels (including each dimension) are optimized to minimize the failure probability $F_{\text{max}}$.
It should be noted that as $F_{\text{max}}$ is a probability, its interpretation is independent of the kernel length scales.
As a result, one can perform a kernel length scale search with the objective of minimizing $F_{\text{max}}$.
For all problems, we performed a Bayesian optimization search~\citep{frohlich2020noisy} where the search space of all parameters is kept large ($[1.0\mathrm{e}{-8}, 5.0\mathrm{e}{3}]$), except the length scale of the last TPI kernel which depends on the output range of each problem.
Furthermore, we only perform the length scale search for the setting ``Perfect Input'' and ``Perfect Model'' (i.e.\ top left quadrant in main Tab.\ \ref{tab:results_benchmark}).
The found length scales are kept fixed for all other settings.


\subsection{Reliability Benchmark Problems}
\label{sec:per_use_case_settings}
The following subsections provide all details for each reliability benchmark problem/dataset used in main Tab.\ \ref{tab:results_benchmark}. \\


\subsubsection{Controlled Solvers~\citep{sanson2019systems}}
\textbf{Components}:
This problem has $4$ components (i.e. solvers): the Sobol function, Ishigami function, and the remaining two are products of polynomial functions and trigonometric functions.
\begin{align}
f_1 &:(x_{1:5}) \mapsto \prod_{k=1}^5 \frac{|4x_k - 2| + a_k}{1 + a_k} =:x_6 \\
f_2  &:(x_{6:8}) \mapsto \sin{x_6} + 0.7\sin^2{x_7} + 0.1x_8^4 \sin{x_6} =: x_9\\
f_3  &:(x_{9:14}) \mapsto x_{10}^2 \arctan(1-x_{14}) + x_{11} x_{12} x_{13}^3 + 3x_9 =:x_{15}\\
f_4  &: (x_{15:19})\mapsto \sin{x_{19}}x_{18} + x_{15}x_{16} + x_{17},
\end{align}
where $a=(12, 2, 3, 4, 45)$,
$x_6 = f_1(x_{1:5})$, $x_9 = f_2(x_{6:8})$, and $x_{15} =  f_3(x_{9:14})$.
This problem is also defined in more detail in Sec.\ 5.5. ``Test Case 3'' in \citet{sanson2019systems} with Fig.\ 14 depicting the causal graph. \\
\textbf{Perfect Input}:
$16$-dimensional input ($x_{\{1,2,...,19\} \setminus \{6, 9, 15\}}$) sampled from $\mathcal{U}_{[0.0, 1.0]}$. \\
\textbf{Biased Input}:
Input signal $x_{18}$ sampled from $\mathcal{U}_{[0.0, 0.8]}$ and the remaining inputs ($x_{\{1,2,...,19\} \setminus \{6, 9, 15, 18\}}$) from $\mathcal{U}_{[0.0, 1.0]}$. \\
\textbf{Model Misfit}:
A GP is learned for each component $M^c$ ($c=1,2,...,C$). \\
\textbf{Failure probability}:
The grid range is fixed to $[g_{\text{min}}=-5.0, g_{\text{max}}=60.0]$ with the threshold at $\tau=14.51$.
A (decreasing) monotonicity constraint is enforced for the range $[\tau - 1.5\cdot\text{grid-spacing}, g_{\text{max}}]$.
The Lipschitz constant is set to $0.28$. \\
\textbf{Kernels}:
All channels use an RBF kernel with the following length scales: $[1.0\mathrm{e}{-6}, 5.0\mathrm{e}{1}, 1.0\mathrm{e}{-6}, 5.0\mathrm{e}{1}, 1.0\mathrm{e}{-6}, 1.0\mathrm{e}{-6}, 1.0\mathrm{e}{-6}, 6.397]$ for the channels $[x_{1:5}, x_{6:7}, f_1(\cdot), x_{8:12}, f_2(\cdot), x_{13:16}, f_3(\cdot), f_4(\cdot)]$, respectively.


\subsubsection{Chained Solvers~\citep{sanson2019systems}}
\textbf{Components}:
This problem has $2$ components (i.e. solvers) forming a composition of two univariate functions $f_1$ and $f_2$:
\begin{align}
f_1 &: x \mapsto e^{\sqrt{x}}\sin{x} + 6 e^{-(x-2)^2} + \frac{5}{2} e^{-3(x-1)^2} \\
f_2 &: x \mapsto \sin{x} + 0.3  x  \sin(3.4x + 0.5),
\end{align}
where the global output is $f = f_2 \circ f_1$ (i.e.\ $f_1$ is applied first).
This problem is also defined in more detail in Sec.\ ``5.1. Test Case 1'' in \citet{sanson2019systems} with Fig.\ 4 plotting the signals. \\
\textbf{Perfect Input}:
Univariate input sampled from $\mathcal{U}_{[0.0, 6.0]}$. \\
\textbf{Biased Input}:
Univariate input sampled from a mixture distribution $\alpha \mathcal{U}_{[0.0, 6.0]} + (1-\alpha) \mathcal{U}_{[4.0, 6.0]}$ with $\alpha=0.90$.
The $\alpha$ controls the trade-off between the biasedness and correctness of the support of the resulting distribution. \\
\textbf{Model Misfit}:
A GP is learned for each component $M^c$ ($c=1,2,...,C$). \\
\textbf{Failure probability}:
The grid range is fixed to $[g_{\text{min}}=-8.0, g_{\text{max}}=5.0]$ with the threshold at $\tau=1.459$.
A (decreasing) monotonicity constraint is enforced for the range $[\tau - 1.5\cdot\text{grid-spacing}, g_{\text{max}}]$.
The Lipschitz constant is set to $99.0$. \\
\textbf{Kernels}:
The $2$ single-dimensional input channels and the TPI channel use an RBF and an IMQ kernel, respectively, with length scales $[1\mathrm{e}{-8}, 1\mathrm{e}{-8}, 1.218]$.


\subsubsection{Borehole~\citep{sim_bench_website}}
\textbf{Components}:

This problem has a single component which models water flow through a borehole:
\begin{align}
f: (r_w, r, T_u, H_u, T_l, H_l, L, K_w) \mapsto \frac{2\pi T_u(H_u-H_l)}{\ln(r/r_w)\left(1+\frac{2LT_u}{\ln(r/r_w)r_w^2K_w} + \frac{T_u}{T_l}\right)} 
\end{align}
This problem is also defined in more detail at \url{https://www.sfu.ca/~ssurjano/borehole.html}.
We construct two variants of this problem: \texttt{single\_borehole} and \texttt{compositional\_borehole}.
The former considers only the function $f$ above, whereas the latter breaks the function up into multiple (5) smaller components:
\begin{align}
f_1 &: (T_u, H_u, H_l) \mapsto 2\pi T_u(H_u-H_l) \\ 
f_2 &: (r_w, r, T_u, L, K_w) \mapsto \frac{2LT_u}{\ln(r/r_w)r_w^2K_w} \\ 
f_3 &: (T_u, T_l) \mapsto \frac{T_u}{T_l} \\
f_4 &: (r_w, r, y_2, y_3) \mapsto \ln\left(\frac{r}{r_w}\right) (1+y_2+y_3)  \\
f_5 &: (y_1, y_4) \mapsto \frac{y_1}{y_4},
\end{align}
where $y_1=f_1(\cdot)$, $y_2=f_2(\cdot)$, $y_3 = f_3(\cdot)$, and $y_4 = f_4(\cdot)$.\\ 
\textbf{Perfect Input}:
The $8$ inputs are sampled from distributions described in detail at \url{https://www.sfu.ca/~ssurjano/borehole.html}.
The description also includes the range of all signals in the system. \\
\textbf{Biased Input}:
The sampling range of the $H_u$ signal is modified from $[990, 1110]$ to $[990, 1010]$. \\
\textbf{Model Misfit}:
A GP is learned for each component $M^c$ ($c=1,2,...,C$). \\
\textbf{Failure probability}:
The grid range is fixed to $[g_{\text{min}}=-35.0, g_{\text{max}}=600.0]$ with the threshold at $\tau=157.1$.
A (decreasing) monotonicity constraint is enforced for the range $[\tau - 1.5\cdot\text{grid-spacing}, g_{\text{max}}]$.
The Lipschitz constant is set to $0.0006$. \\
\textbf{Kernels}:
For the \texttt{single\_borehole}, the RBF kernel is used for both input and output kernels with the following length scales: $[[10.599, 6.587, 24.609, 32.369, 46.431, 23.046, 12.943, 2.734], 23.578]$, respectively.
The first kernel has a multi-dimensional length scale; one for each input dimension.
For the \texttt{compositional\_borehole}, the RBF kernel is used for all $8$ input channels and an IMQ kernel for the TPI output kernel with length scales:
$[5.0\mathrm{e}{+3}, 1.0\mathrm{e}{-1}, 3.198\mathrm{e}{+3}, 5.0\mathrm{e}{+3}, 1.0\mathrm{e}{-1}, 5.0\mathrm{e}{+3}, 5.0\mathrm{e}{+3}, 1.0\mathrm{e}{-1}, 5.0\mathrm{e}{+3}, 2.634]$, respectively.



\subsubsection{Branin~\citep{sim_bench_website}}
\textbf{Components}:
This problem has a single component:
\begin{align}
f: x_{1:2} \mapsto f_{\text{max}} - a (x_2 -bx_1^2 + cx_1 - r)^2 + s(1-t)\cos{x_1} + s,
\end{align}
where the recommended parameters are used: $a = 1$, $b = 5.1/(4\pi^2)$, $c = 5 / \pi$, $r = 6$, $s = 10$ and $t = 1 / (8\pi)$.
In order to map the minimization problem to our maximization setting, we modify the Branin function by adding $f_{\text{max}}=312.0$ and subtracting the original formulation thereof.
This problem is also defined in more detail at \url{https://www.sfu.ca/~ssurjano/branin.html}.\\
We construct two variants of this problem: \texttt{single\_branin} and \texttt{compositional\_branin}.
The former considers only the function $f$ above, whereas the latter breaks the function up into multiple (3) smaller components:
\begin{align}
f_1 &: x_{1:2} \mapsto  (x_2 -bx_1^2 + cx_1 - r)^2 \\
f_2 &: x_{1:2} \mapsto  (1-t)\cos{x_1} \\
f_3 &: x_{3:4} \mapsto  f_{\text{max}} - ax_3 + sx_4 + s,
\end{align}
where $x_3=f_1(\cdot)$ and $x_4 = f_2(\cdot)$. \\
\textbf{Perfect Input}:
The $2$ inputs are sampled from $\mathcal{U}_{[-5.0, 10.0]}$ and $\mathcal{U}_{[0.0, 15.0]}$, respectively. \\
\textbf{Biased Input}:
The $2$ inputs are sampled from a mixture distribution $\alpha \mathcal{U}_{[-5.0, 10.0]} + (1-\alpha) \mathcal{U}_{[8.0, 10.0]}$ and $\alpha \mathcal{U}_{[0.0, 15.0]} + (1-\alpha) \mathcal{U}_{[12.0, 15.0]}$ with $\alpha=0.10$, respectively.
The $\alpha$ controls the trade-off between the biasedness and correctness of the support of the resulting distribution. \\
\textbf{Model Misfit}:
A GP is learned for each component $M^c$ ($c=1,2,...,C$). \\
\textbf{Failure probability}:
The grid range is fixed to $[g_{\text{min}}=-35.0, g_{\text{max}}=700.0]$ with the threshold at $\tau=330.82$.
A (decreasing) monotonicity constraint is enforced for the range $[\tau - 1.5\cdot\text{grid-spacing}, g_{\text{max}}]$.
The Lipschitz constant is set to $0.005$. \\
\textbf{Kernels}:
For the \texttt{single\_branin}, the RBF kernel is used for both input and output kernels with the following length scales: $[0.003, 21.161]$, respectively.
For the \texttt{compositional\_branin}, the IMQ kernel is used for all channels with length scales: $[1\mathrm{e}{-8}, 1\mathrm{e}{-8}, 500.0, 1\mathrm{e}{-8}, 26.064]$.




\subsubsection{Four Branch~\citep{UQworld}}
\textbf{Components}:
This problem has $4$ independent components which form four branches and the final global output takes the minimum of the four component outputs:

\begin{align}
f_1 &: x_{1:2} \mapsto 3 + 0.1(x_1 - x_2)^2 - \frac{x_1+x_2}{\sqrt{2}}\\
f_2 &: x_{1:2} \mapsto 3 + 0.1(x_1 - x_2)^2 + \frac{x_1+x_2}{\sqrt{2}}\\
f_3 &: x_{1:2} \mapsto (x_1 - x_2) + \frac{p}{\sqrt{2}}\\
f_4 &: x_{1:2} \mapsto (x_2 - x_1) + \frac{p}{\sqrt{2}}\\
f_5 &: x_{1:2} \mapsto \text{min} \{f_1(x_{1:2}), f_2(x_{1:2}), f_3(x_{1:2}),f_4(x_{1:2})\} + 10,
\end{align}
where $p=6.0$.
This problem is also defined in more detail in \cite{UQworld}, where Fig.\ 1 shows the surface plot of the four branch function. 
We construct two variants of this problem: \texttt{single\_four\_branch} and \texttt{compositional\_four\_branch}.
The former considers only the function $f_5$ above, whereas the latter breaks the function up into multiple (4) smaller components. \\
\textbf{Perfect Input}:
The $2$ inputs are sampled from two normal distributions described in detail in \cite{UQworld}. \\
\textbf{Biased Input}:
The $2$ inputs are sampled from a mixture distribution $\alpha \mathcal{N}(0.0, 1.0) + (1-\alpha) \mathcal{N}(0.0, 0.90)$ and $\alpha \mathcal{N}(0.0, 1.0) + (1-\alpha) \mathcal{N}(0.0, 0.9)$ with $\alpha=0.80$, respectively.
The $\alpha$ controls the trade-off between the biasedness and correctness of the support of the resulting distribution. \\
\textbf{Model Misfit}:
A GP is learned for each component $M^c$ ($c=1,2,...,C$). \\
\textbf{Failure probability}:
The grid range is fixed to $[g_{\text{min}}=0.0, g_{\text{max}}=30.0]$ with the threshold at $\tau=9.693$.
A (decreasing) monotonicity constraint is enforced for the range $[\tau - 1.5\cdot\text{grid-spacing}, g_{\text{max}}]$.
The Lipschitz constant is set to $0.005$. \\
\textbf{Kernels}:
For the \texttt{single\_four\_branch}, the RBF kernel is used for both input and output kernels with the following length scales: $[[0.201, 0.198], 10.0]$, respectively.
For the \texttt{compositional\_four\_branch}, the RBF kernel is used for all channels with length scales: $[[2.018, 1.983], 2.472, 2.374, 2.077, 2.077, 10.0]$.


%\section{ILLUSTRATIONS OF THE METHODS \& FURTHER EXPERIMENTS (SEC.\ \ref{sec:experiments})}
\section{Illustrations of the Methods \& Further Experiments (Sec.\ \ref{sec:experiments})}\label{appendix:illustrations}

\begin{figure}[t]

	\begin{center}
		\subfigure[]{\label{fig_withUW:b}\includegraphics[width=0.235\textwidth]{model_mismatch_input_perfect.pdf}}
		\subfigure[]{\label{fig_withUW:a}\includegraphics[width=0.235\textwidth]{model_perfect_input_bias.pdf}}
	\end{center}

	\caption{Illustration of \texttt{DPBound} (see also Fig.\ \ref{fig:linear_use_case_illustration} in the main text) and \texttt{SurrModel} for a linear mapping between Gaussian signals.
		{\textbf{(a)}} Model and system are different $S\neq M$, whereas the input distributions are identical $p_x=q_x$.
		{\textbf{(b)}} The model $M$ is the perfect model, i.e.\ $S=M$, but input distributions are different. Computed weights $\alpha_v$ (see Eq.\ (\ref{eq:p-alpha-joint}) in Sec.\ \ref{subsec:validation-method})  are indicated by the size of markers for $S(x)$ and the worst-case distributions w.r.t.\ the failure probability are indicated in red.
		The inputs and outputs of the surrogate model \texttt{SurrModel} are shown in pink.
		\label{fig:linear_use_case_illustration_WITH_UW}}
\end{figure}

\subsection{Surrogate Model \texttt{SurrModel} in the Toy Setting (Sec.\ \ref{sec:experiment_linear_use-case})}
\label{sec:appendix_uw_toy}

In Sec.~\ref{sec:experiment_linear_use-case}, an illustrative example was used to visualize the two configurations considered in the experimental setup.
Here, we additionally analyze the performance of the surrogate model (i.e.\ the \texttt{SurrModel} method from Sec.\ \ref{sec:uncertainty-wrapper-method}) for this single-component linear example under the two configurations.

We illustrated and discussed in Sec.~\ref{sec:experiment_linear_use-case} how \texttt{DPBound} can handle biases in the input distribution, as well as mismatches between the models $S$ and $M$.
On the other hand, the explicit uncertainty estimation with surrogate models fails to handle or detect mismatches in the input distribution, because the estimate of the output distribution arises from surrogate models (albeit learned on the validation data from $S$) run on the input distribution of $M$, thereby completely ignoring the real-world input distribution $p_x$ of $S$. 
To see this, note that the resulting output distribution (pink crosses) of the surrogate model in Suppl.\ Fig.\ \ref{fig_withUW:b} lies on top of the system output distribution $S(\cdot)$ (which is different from $M(\cdot)$), whereas in Suppl.\ Fig.\ \ref{fig_withUW:a} it basically coincides with the model output distribution $M(\cdot)$ (so that no difference is detected).
Consequently, surrogate models can detect differences due to modeling mismatches $M\neq S$, but not between input distributions $q_x\neq p_x$.



\begin{figure}
	\begin{center}
		\includegraphics[width=0.9\textwidth]{toy-signal-propagation.png}
		

		
		\caption{Figure showing the input/output signal distributions for the ``Chained Solvers'' use-case in the setting ``Biased Input--Misfit Model'' (cf.\ main Tab.\ \ref{tab:results_benchmark}; we chose this use-case for illustration purposes, as its signals are one-dimensional). \textbf{Top row:} ground-truth signals (from the system $S$). \textbf{Middle row:} simulation signals (from the model $M$). \textbf{Bottom row:} surrogate model signals (from the model $M'$ in the \texttt{SurrModel} method, see Sec.\ \ref{sec:uncertainty-wrapper-method}). \textbf{Left column:} input distributions. \textbf{Middle column:} output distributions after first component. \textbf{Right column:} final TPI distributions.\label{fig:illustration_signal_propagation}}
	\end{center}
\end{figure}

\begin{figure}
	\begin{center}
		\includegraphics[width=0.9\textwidth]{toy-error-propgation.png}
		

		
		\caption{Figure showing the propagation of errors (residuals) of the surrogate model for the ``Chained Solvers'' use-case in the setting ``Biased Input--Misfit Model'' (cf.\ main Tab.\ \ref{tab:results_benchmark}); see Fig.\ \ref{fig:illustration_signal_propagation} for the actual signals. \textbf{Left column:} real-world input distribution (top) and simulation input distribution (bottom; note that the simulation input distribution is biased). \textbf{Top row (2nd and 3rd column):} histograms over residuals (errors) between real system and surrogate model (both starting from the real-world input distribution). \textbf{Bottom row (2nd and 3rd column):} histograms over residuals (errors) between simulation model and surrogate model (both starting from the simulation input distribution), akin to what the \texttt{SurrModel} method uses (see Eq.\ (\ref{eq:delta-surr-model}) in Sec.\ \ref{sec:uncertainty-wrapper-method}). \label{fig:illustration_error_propagation}}
	\end{center}
\end{figure}



\subsection{Illustration of Signal and Error Propagation (for the ``Chained Solvers'' Use-case, Sec.\ \ref{sec:reli-benchm-eval})}\label{sec:signal_propagation_illustration}
To illustrate the propagation of signals through the chain of subsystems, Suppl.\ Fig.\ \ref{fig:illustration_signal_propagation} shows the propagation of signals through the system (top row), the model chain (middle row), and the surrogate model chain (bottom row, for the \texttt{SurrModel} method from Sec.\ \ref{sec:uncertainty-wrapper-method}). This is shown for the ``Chained Solvers'' use-case in the setting of ``Biased Input--Misfit Model'' (see Sec.\ \ref{sec:reli-benchm-eval} and lower right quadrant of the main Tab.\ \ref{tab:results_benchmark}); we picked this ``Chained Solvers'' use-case for visualization purposes as it has one-dimensional signals. The mismatches (errors) between those three signals are illustrated in Suppl.\ Fig.\ \ref{fig:illustration_error_propagation}.



Suppl.\ Fig.\ \ref{fig:illustration_error_propagation} illustrates the propagation of errors (residuals) between system/model and surrogate model through the components, in the same setting as Suppl.\ Fig.\ \ref{fig:illustration_signal_propagation} (described in the previous paragraph). As the \texttt{SurrModel} method (Sec.\ \ref{sec:uncertainty-wrapper-method}) starts from the simulation inputs (bottom row of Suppl.\ Fig.\ \ref{fig:illustration_error_propagation}), which may be biased w.r.t.\ the real-world input distribution (top row), the residuals from Eq.\ (\ref{eq:delta-surr-model}) in Sec.\ \ref{sec:uncertainty-wrapper-method} used by \texttt{SurrModel} may be too small (compare lower-right vs.\ upper-right panel in Suppl.\ Fig.\ \ref{fig:illustration_error_propagation}), finally leading to an underestimate of the failure probability by \texttt{SurrModel} in Eq.\ (\ref{eq:Fmax_estimate_UW}) (Sec.\ \ref{sec:uncertainty-wrapper-method}). This failure of the \texttt{SurrModel} method is observed in the actual experiments (main Tab.\ \ref{tab:results_benchmark} in the main text), especially for the ``biased-input'' settings.




\subsection{Dependence of \texttt{MCCP} on Its Confidence Level Parameter}\label{app:MCCP99}
In Suppl.\ Tab.\ \ref{table:MCCP99} we corroborate our conclusions from the experiments in Sec.\ \ref{sec:reli-benchm-eval} regarding the (in)validness of the \texttt{MCCP} method.

\begin{table}[h!]
	\begin{center}
		\caption{\label{table:MCCP99}Ratio of invalid bounds (i.e.\ bounds below 1\%) produced by the \texttt{MCCP}-method run with a confidence level (CL) parameter of 99\% in each of the four simulation configurations, compared (in parentheses) to the ratio for \texttt{MCCP} at 95\% CL parameter from main Tab.\ \ref{tab:results_benchmark}. 
			While the ratio of invalid bounds does decrease with the higher CL parameter of 99\%, the ratio does not decrease in a proportionate way down to one fifth of the ratio at 95\%-CL, especially not for the \emph{Biased Input} settings. 
			The invalidness ratio stays clearly above its 1\% validity promise (except in the easy case of \emph{Perfect Input--Perfect Model}). 
			This indicates that \texttt{MCCP}'s CL parameter is \emph{not} the main reason for its (high) level of invalidity. 
			The main reason is rather \texttt{MCCP}'s ignorance of the system input distribution and of the model misfits (see Sec.\ \ref{sec:reli-benchm-eval}).}
		
		
		\begin{tabular}{ |l|c|c| } 
			\hline
			\texttt{MCCP} at 99\% CL (95\% CL) & \textbf{Perfect Model} & \textbf{Misfit Model} \\
			\hline
			\textbf{Perfect Input} & 0\% (0\%) & 5\% (22.5\%) \\
			\hline
			\textbf{Biased Input} &47.5\% (67.5\%) & 42.5\% (67.5\%)\\
			\hline
		\end{tabular}
	\end{center}
\end{table}



\subsection{Tightness of the SDP Relaxation (Lemma \ref{lem:SDR-relaxation})}\label{sec:empirical-tightness}
Here we investigate experimentally how tight our convex (SDP) relaxation of the nonconvex bound optimization in Eq.\ (\ref{max-discrepancy-objective-alpha}) from Sec.\ \ref{subsec:validation-method} is (see also App.\ \ref{app:semidefinite-relaxation}). For this, we evaluate the \emph{minimum approximation ratio} $\widehat{\gamma}\in[0,1]$ of the SDP relaxation, as defined below Lemma \ref{lem:SDR-relaxation} in App.\ \ref{app:semidefinite-relaxation}, for each of the $440$ SDP optimizations required to produce our main results table (Tab.\ \ref{tab:results_benchmark} in Sec.\ \ref{sec:method}, which summarizes $8\cdot4\cdot5=160$ validation runs). Note that $\widehat{\gamma}=1$ would be proof of a perfectly \emph{tight} relaxation, while for example $\widehat{\gamma}=0.99$ guarantees that the relaxation was tight up to at most $1\%$. These tightness results are summarized in Suppl.\ Tab.\ \ref{table:SDPtightness}.

\begin{table}[h!]
	\begin{center}
		\caption{\label{table:SDPtightness}Frequency of \emph{minimum approximation ratios} $\widehat{\gamma}$ of the SDP relaxations (defined below Lemma \ref{lem:SDR-relaxation} in App.\ \ref{app:semidefinite-relaxation}) for the $440$ SDP relaxations required to produce Tab.\ \ref{tab:results_benchmark} in Sec.\ \ref{sec:method}. Note that the true but unknown approximation ratio $\gamma$ of each SDP relaxation satisfies $\widehat{\gamma}\leq\gamma\leq1$. Thus, over all $160$ validation tasks from the main Tab.\ \ref{tab:results_benchmark}, $87.3\%$ of the $440$ required SDP bound optimizations are guaranteed to be at least $99\%$-tight.}
		
		\begin{tabular}{ |l|c|c| } 
			\hline
			minimum approximation ratio $\widehat{\gamma}$ & \# of SDP optimizations & \% of SDP optimizations \\
			\hline
			$0.99\leq{\widehat{\gamma}}\leq1.0$ & $384$ & $87.3\%$ \\
			$0.9~~\leq{\widehat{\gamma}}<0.99$ & $~~15$ & $~~3.4\%$ \\
			$0.1~~\leq{\widehat{\gamma}}<0.9$& $~~29$ & $~~6.6\%$ \\
			$0.0~~\leq{\widehat{\gamma}}<0.1$ & $~~12$ & $~~2.7\%$ \\
			\hline
			total \# of SDP optimizations & $440$ & $~100\%$ \\
			\hline
		\end{tabular}
	\end{center}
\end{table}

\subsection{Computational Cost \& Runtime}\label{sec:runtime}
The computational complexity of the method in terms of the validation data set size(s) and the number of components is discussed at the end of App.\ \ref{app:semidefinite-relaxation} (see also below Eq.\ (\ref{max-discrepancy-objective-alpha}) in the main text).

Empirically, the runtime required for the 160 validation runs of \texttt{DPBound} to produce Tab.\ \ref{tab:results_benchmark} on our desktop machine is 2 hours; this runtime is dominated by the semidefinite optimization steps (where we use Eqs.\ (\ref{eq:tightened-SDR-objective}--\ref{eq:tightened-SDR-last-tightening-constraint}) in place of Eq.\ (\ref{max-discrepancy-objective-alpha}), and use the concrete form Eqs.\ (\ref{eq:violation_objective}--\ref{eq:violation_lipschitz_constraint}) in place of Eq.\ (\ref{max-failure-objective-alpha})). On those same 160 validation problems, the \texttt{MCCP} method takes 2min (as \texttt{MCCP} must only propagate model inputs through the given model chain, with no optimizations to do), while \texttt{SurrModel} takes 25min (mainly spent on fitting the surrogate Gaussian Process models).





\bibliography{reeb_297}

\end{document}
