% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% Theorem-like environments. Each one is declared with its own counter.
\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}

\newtheorem{remark}{Remark}
% Printed heading is capitalized to match the other environments.
\newtheorem{lemma}{Lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}


% Limit-style operators: sub/superscripts are placed below/above in display math.
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}




\usepackage{amsmath}  

\usepackage{booktabs} % for professional tables
%\floatsetup[table]{capposition=top}
%\newfloatcommand{capbtabbox}{table}[][\FBwidth]

\usepackage{bm}


\usepackage{verbatim}
%\usepackage{float}
\usepackage{subfigure}

\usepackage{ulem}% 强调文本

\usepackage{bbm}


\usepackage[ruled,vlined]{algorithm2e}

\usepackage{makecell}
\renewcommand\theadalign{ll}%行列对齐

\usepackage{bbding}
%\usepackage{section}


\usepackage[thinlines,thiklines]{easybmat}

\usepackage{graphicx}

\graphicspath{{figures/}}

\makeatletter

\makeatother

%%% User-defined macros should be placed here, but keep them to a minimum.
%\def\Bka{{\it Biometrika}}

% Shorthand macros. They are created with \def, which replaces any existing
% definition of the same name without a warning.
% \AIC: the letters "aic" in small caps.
\def\AIC{\textsc{aic}}
% \T: an upright "T" in scriptscript size (for use in math).
\def\T{{ \mathrm{\scriptscriptstyle T} }}
% \v: \varepsilon.
% NOTE(review): \v is ordinarily LaTeX's caron-accent command; this \def
% replaces it -- confirm nothing (e.g. accented author names in the
% bibliography) relies on the accent.
\def\v{{\varepsilon}}
% \cS: calligraphic S.
\def\cS{\mathcal{S}}




%\newtheorem{myDef}{Definition}

\usepackage{amsmath}
\usepackage{mathrsfs}
\usepackage{amssymb}

%\usepackage{section}

\usepackage{multirow} 
%\usepackage{paragraph}
\usepackage{extarrows}

\usepackage{diagbox}
\usepackage{booktabs}
\usepackage{multirow}

\usepackage{lipsum}

\usepackage{floatrow}

\usepackage{environ}
\usepackage{tikz}

\usepackage{fancybox}
\shadowsize=0.4\shadowsize
% elaboration: collects the environment body (\BODY), typesets it in a
% minipage of width 0.79\textwidth inside a TikZ node named (m), and then
% draws a dashed rectangle from the node's south-west to north-east corner.
\NewEnviron{elaboration}{
\par
\begin{tikzpicture}
\node[rectangle,minimum width=0.5\textwidth] (m) {\begin{minipage}{0.79\textwidth}\BODY\end{minipage}};
\draw[dashed] (m.south west) rectangle (m.north east);
\end{tikzpicture}
}


% capbtabbox: float-box command for the table float type, with \FBwidth as
% its default width argument.
\newfloatcommand{capbtabbox}{table}[][\FBwidth]
% \Widehat: math accent taken from slot "62 of the OMX-encoded yhex font,
% registered here as the symbol font `largesymbol'.
\DeclareSymbolFont{largesymbol}{OMX}{yhex}{m}{n}
\DeclareMathAccent{\Widehat}{\mathord}{largesymbol}{"62}

% Inline review notes: \zh{text} prints "[ZH: text]" in blue and
% \Teacher{text} prints "[Tea: text]" in red.
% NOTE(review): \bf and \sf are the old two-letter font switches; confirm the
% intended weight/family before replacing them with \textbf/\textsf.
\newcommand{\zh}[1]{{\color{blue}{\bf\sf [ZH: #1]}}}
\newcommand{\Teacher}[1]{{\color{red}{\bf\sf [Tea: #1]}}}

\iffalse
\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}

\newtheorem{remark}{Remark}
\newtheorem{lemma}{lemma}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
\fi

\usepackage{amsmath}  

\usepackage{booktabs} % for professional tables
%\floatsetup[table]{capposition=top}
%\newfloatcommand{capbtabbox}{table}[][\FBwidth]

\usepackage{bm}

\usepackage{tikz}
% \circled{x}: draws a TikZ circle of radius .5em with x (in \small, centered
% in a 1em-wide box) as a node at its center; the result is lowered by .7ex.
\newcommand*{\circled}[1]{\lower.7ex\hbox{\tikz\draw (0pt, 0pt)%
    circle (.5em) node {\makebox[1em][c]{\small #1}};}}
    
%增加
\usepackage{graphicx}
\usepackage{caption, threeparttable}
\captionsetup{labelfont = sc, textfont = it}


%% Please use the following statements for
%% managing the text and math fonts for your papers:

%\usepackage{times}
%\usepackage[cmbold]{mathtime}
\usepackage{bm}


% \xinyan{text}: inline note printed in red and prefixed with "Su: ".
\newcommand{\xinyan}[1]{{\color{red} Su: #1}}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b}: prints b, then sep (default "-"), then a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Partial Identification with Proxy of Latent Confoundings via Sum-of-ratios Fractional Programming}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Zhiheng Zhang\thanks{Corresponding author. Contact email: \texttt{zhiheng-20@mails.tsinghua.edu.cn}.}}
\author[2,3]{Xinyan Su}

% Add affiliations after the authors
\affil[1]{%
     Institute for Interdisciplinary Information Sciences\\
    Tsinghua University, Beijing, China
}
% \affil[*]{Corresponding author email: \texttt{\{zhiheng-20@mails.tsinghua.edu.cn\} } }


\affil[2]{%
Computer Network Information Center\\
Chinese Academy of Sciences, Beijing, China
}

\affil[3]{University of Chinese Academy of Sciences, Beijing, China
}

  
  \begin{document}
\maketitle

\begin{abstract}
Causal effect estimation is a crucial theoretical tool in uncertainty analysis. The challenge of unobservable confoundings has raised concerns regarding quantitative causality computation. To address this issue, proxy control has become popular, employing auxiliary variables $\bm{W}$ as proxies for the confounding variables $\bm{U}$. However, proximal methods rely on strong assumptions, such as reversibility and completeness, that are challenging to interpret empirically and verify. Consequently, their applicability in real-world scenarios is limited, particularly when the proxies lack informativeness. In our paper, we have developed a novel optimization method named \textbf{P}artial \textbf{I}dentification with {P}roxy of Latent Confoundings via \textbf{S}um-of-Ratios \textbf{F}ractional \textbf{P}rogramming (PI-SFP). This method does not impose any additional restrictions upon proxies and only assumes the mild partial observability of the transition matrix $P(\bm{W}\mid \bm{U})$. We have theoretically proven the global convergence of PI-SFP to the valid bound of the causal effect and analyzed the conditions under which the bounds could be tight. Our synthetic and real-world experiments validate our theoretical framework.
\end{abstract}


\section{Introduction}
Causal inference is crucial in uncertainty analysis across various fields such as medicine~\citep{castro2020causality}, economics~\citep{hicks1980causality, zhang2023robust,zhang2020dynamic}, and education~\citep{peng2003culture}. However, extracting useful causal information from observational data is challenging due to latent confoundings that can impede statistical association studies \citep{pearl2009causality}. To address this issue, researchers commonly rely on auxiliary variables for confounding adjustment. Representative auxiliaries include instrumental variables (IV) \citep{soderstrom2002instrumental}, proximal variables \citep{kuroki2014measurement, tchetgen2020introduction}, or outcome-dependent variables \citep{gabriel2022causal}.


In this paper, our primary focus is proximal causal identification, as proxies represent one of the most commonly utilized auxiliaries in the real world. The use of proxies for confounding adjustment has become a prominent topic of research both theoretically and empirically. Empirical work in this area dates back to \citet{wickens1972note}, who examined the potential benefits of proxies as an alternative to latent confounding in least-squares estimation. The concept has since been applied in observational studies such as~\cite{kolenikov2009socioeconomic, wooldridge2009estimating} and further studied in other empirical works~\citep{frost1979proxy,rothman2008modern}. On the other hand, theoretical research on this topic can be broadly categorized into two groups, which we illustrate in Fig.~\ref{fig1}: the ``single-proxy scenario'' (Figure~\ref{Fig.sub.1})~\citep{tchetgen2023single, park2023single} and the ``double-proxy scenario'' (Figures~\ref{Fig.sub.2} and~\ref{Fig.sub.3})~\citep{miao2018confounding, shi2020multiply, singh2020kernel, kallus2021causal}, where two confounding proxies $\bm{Z},\bm{W}$ are available.

\begin{figure}[t]
\centering  %图片全局居中
\subfigure[]{
\label{Fig.sub.1}
\includegraphics[scale = 0.28]{figures/left.pdf}}
\subfigure[]{
\label{Fig.sub.2}
\includegraphics[scale = 0.28]{figures/middle.pdf}}
\subfigure[]{
\label{Fig.sub.3}
\includegraphics[scale = 0.28]{figures/right.pdf}}
\caption{Causal identification with confoundings via single-proxy control (a) or double-proxy control (b,c). $\bm{W}$ or $\bm{Z}$ are so-called confounder proxies when identifying the causal effect of treatment $\bm{X}$ on outcome $\bm{Y}$. }
\label{fig1}
\end{figure}





However, the application scope of proximal-based learning is still limited. For Figure~\ref{Fig.sub.1}, when both $\bm{W}$ and $\bm{U}$ are discrete random variables with a finite number of choices, \citet{pearl2012measurement,tchetgen2023single, park2023single} proved the point-wise identifiability of the true causal quantities when the proxies are informative enough, e.g., the probability transition matrix $P(\bm{W} \mid \bm{U})$ is fully observable and reversible, which is the so-called completeness assumption. Stepping forward, when $P(\bm{W}\mid \bm{U})$ is not observable, Pearl extended the point-wise double-proxy cases (Figures~\ref{Fig.sub.2},~\ref{Fig.sub.3}) from \cite{cai2012identifying}, where both the exposure proxy control $\bm{Z}$ and the outcome proxy control $\bm{W}$ exist. Unfortunately, the so-called ``double-negative control''~\citep{miao2018identifying, cui2020semiparametric, tchetgen2020introduction, deaner2018proxy,shi2020multiply,singh2020kernel, nagasawa2018identification} methods and ``single-proxy control'' methods~\citep{pearl2012measurement,tchetgen2023single, park2023single} are still subject to strict bridge functions, completeness assumptions%
\footnote{The previous reversibility assumption of $P(\bm{W} \mid \bm{U})$ was strengthened to that of $P(\bm{Z},\bm{W}\mid x)$ and $P(y, \bm{Z},\bm{W}\mid x)$. Moreover, the path $\bm{W}$ to $\bm{Y}$ in Fig.~\ref{Fig.sub.3} can be additionally permitted.},  or their weaker forms~\cite{ghassami2023partial, kallus2021causal}.





These untestable and impractical constraints lead to an important motivation: excessively strong conditions on the proxies are imposed in order to obtain the point-wise value of the causal effect, and these conditions would be violated in general cases. For instance, in a recommender system, an item's exposure to users is often considered as the treatment $\bm{X}$ ($\bm{X}=0,1$), while the observed feedback is treated as the outcome $\bm{Y}$. In this process, the user's socio-economic status and the item characteristics are used as latent confounders~\citep{sato2020unbiased}, which affect both $\bm{X}$ and $\bm{Y}$, and some observations of the confounders are considered as the proxy $\bm{W}$ (e.g., item popularity ranking)~\citep{zhang2023debiasing}. Unfortunately, the measurement of $\bm{W}$ and $\bm{U}$ would be inaccurate, and even worse, $P(\bm{W}\mid \bm{U})$ is irreversible due to the low dimension of $\bm{W}$, rendering the previous approach invalid. Hence, a natural scientific question arises: \emph{How to conduct partial identification via partially observed proxies?}
% In other words, we aim to relax the constraints on the exact observability and invertibility of $P(\bm{W} \mid \bm{U})$ in our method, in order to expand the applicability of our approach. 

To address this question, owing to its originality and to the difficulty of collecting double proxies, we mainly focus on the single-proxy case (Fig.~\ref{Fig.sub.1}), which is most closely related to~\cite{pearl2012measurement, tchetgen2023single, park2023single}; our method can be naturally generalized to Figs.~\ref{Fig.sub.2}--\ref{Fig.sub.3}. We weaken the requirement of ``total precise observability'' of $P(\bm{W} \mid \bm{U})$ to ``partial observability'', and thus generalize the point-wise identification into a partial identification. Our method contributes to the traditional fractional programming methods~\citep{stancu2012fractional} since we do not rely on a strong concavity assumption. More importantly, our method also advances the state-of-the-art constrained-optimization-based literature on partial identification~\cite{duarte2023automated, li2022bounds}, since we additionally provide a non-parametric convergence rate to causal queries based on a branch-and-bound strategy. Our contributions are summarized as follows: 

\begin{itemize}
    \item We generalize the traditional proximal learning literature from point-wise identification to partial identification, with a more reasonable and weaker partial observability assumption.
    \item  We introduce a global optimization strategy called PI-SFP and theoretically prove that it can globally converge to the valid bound of causal queries. Moreover, we justify whether the bound is tight. Synthetic and real-world experiments have demonstrated our findings.
     \item We theoretically justify the necessity of the partial observability assumption in proximal control. It is supported by the negative result that traditional informative proxies might not sufficiently guarantee informative partial identification.
\end{itemize}





% The paper is structured as follows: In Section.~1, we provide an introduction to the basic principles of partial identification. In Section.~2, we review the existing literature regarding the construction of causal effects and the evolution of relevant hypotheses. We propose a new hypothesis in response to the shortcomings of the previous approaches, which is more intuitive, applicable, and verifiable. In Section.~3, we establish the estimation of causal effects as a sum-of-ratios fractional programming problem. We explicitly construct the objective function and identification region of the solutions. In Section.~4, we solve this problem using a branch-and-bound strategy. We focus on the theoretical global convergence property of our algorithm in Section.~5. To demonstrate the effectiveness of our algorithm, we present simulation results in Section.~6. Finally, in Section.~7, we discuss several additional topics and illustrate the generalizability and scalability of our approach.


\section{Preliminaries and framework}\label{pre+frame}

\iffalse
\textcolor{red}{Pearl and Kuroski showed that $f(y \mid do(x))$ is identifiable with invertibility condition, namely that XXXX.}\\

\textcolor{red}{In this paper, we consider the setting where we have partial observability: XXXXX. In this paper, we consider partial identification with partial observability. It is natural to get the upper bound (lower bound is symmetric) via solving the following program, proposition....}\\

\textcolor{red}{This fractional programming is non-trivially. fractional programming literature review: convex-concave.}\\

\textcolor{red}{Definitions: end of Section 2 or beginning of Section 3.}\\
\fi

The concept of causal effect is closely linked with the `do' operator, which can be viewed as an external intervention~\citep{pearl2000models, reason:Pearl09a}. In particular, the causal effect of treatment $\bm{X}$ on outcome $\bm{Y}$ is represented as $f(y\mid do(x))$ in Fig.~\ref{fig1}, where $do(x)$ signifies that the treatment $\bm{X}$ is fixed at a specific value $x$, and $f(\cdot)$ denotes the probability mass/density function for discrete/continuous variables. We use $d_u$ to denote the cardinality of the confounder $\bm{U}$. As per the back-door criterion~\citep{pearl2000models}, the identification of $f(y \mid do(x))$ is given by $\sum_{i=1}^{d_u}f(y \mid u_i, x)f(u_i)$, namely $f(y,x) + \sum_{i=1}^{d_u} {f(y,u_i,x) f(u_i,\neg x)}/{f(u_i,x)}$.

This decomposition results from $f(u_i) = f(u_i, x)+f(u_i, \neg x)$. In \cite{kuroki2014measurement}, the authors assumed that the transition matrix $P(\bm{W}\mid \bm{U})$ is observable and reversible, and hence claimed that $f(y \mid do(x))$ is identifiable. In other words, the value of each term above can be explicitly extracted as follows\footnote{In our paper, we use bold letters to denote column vectors of corresponding possible values. For instance, $f(y,\bm{U},x) = [f(y,u_1,x), f(y,u_2,x),\dots, f(y,u_{d_u},x)]^{T}$. Furthermore, if a symbol has two bold letters such as $P(\bm{W} \mid \bm{U})$, it denotes the matrix $[f(\bm{W} \mid {u_1}), f(\bm{W} \mid u_2),\dots, f(\bm{W} \mid u_{d_u})]$, where $f(\bm{W} \mid u_i) = [f(w_1 \mid u_i),f(w_2 \mid u_i),\dots, f(w_{d_w} \mid u_i)]^{T}, i=1,2,\dots, d_u$.}:
% \begin{equation}
%     \begin{aligned}
%     \left[\begin{matrix} 
%     &f(y,\bm{U},x) \\
%     &f(\bm{U},x) \\
%     &f(\bm{U},\neg x) 
%     \end{matrix}\right] = {P(\bm{W}\mid \bm{U})}^{-1} \left[\begin{matrix} 
%     &f(y,\bm{W},x) \\
%     &f(\bm{W},x) \\
%     &f(\bm{W},\neg x) 
%     \end{matrix}\right].
%     \end{aligned}
% \end{equation}\label{inverse_matrix}
{
\begin{small}
\begin{equation}\label{inverse_matrix}
    \begin{aligned}
    \left[\begin{matrix} 
    &f(y,\bm{U},x) \\
    &f(\bm{U},x) \\
    &f(\bm{U},\neg x) 
    \end{matrix}\right]\! \! =\! \! {P(\bm{W}\mid \bm{U})}^{-1} \! \!\left[\begin{matrix} 
    &f(y,\bm{W},x) \\
    &f(\bm{W},x) \\
    &f(\bm{W},\neg x) 
    \end{matrix}\right].
    \end{aligned}
\end{equation}
\end{small}

}

Our paper focuses on generalization, where we consider the partial identification of $f(y \mid do(x))$ instead of its unique form computation. This change stems from our relaxed assumption on $P(\bm{W} \mid \bm{U})$, where we move from total observability to partial observability and remove the guarantee for reversibility (thus ${P(\bm{W} \mid \bm{U})}^{-1}$ in Eqn~\eqref{inverse_matrix} may not exist). Specifically, we expand the identification region of $P(\bm{W}\mid \bm{U})$ from a fixed distribution to the family $\mathscr{P}$, which contains all possible $P(\bm{W} \mid \bm{U})$ such that ${P^{}(\bm{W} \mid \bm{U})} - \underline{P^{}(\bm{W} \mid \bm{U})}$ and 
$\overline{P^{}(\bm{W} \mid \bm{U})} - P^{}(\bm{W} \mid \bm{U})$ are both non-negative matrices. Here $\underline{P^{}(\bm{W} \mid \bm{U})}$ and $\overline{P^{}(\bm{W} \mid \bm{U})}$ are two a~priori known matrices that bound ${P^{}(\bm{W} \mid \bm{U})}$. This scenario is prevalent in the real world. While studies~\citep{kuroki2014measurement, greenland2005multiple} have generally confirmed its verifiability, recent literature has not fully explored it. In our paper, we refer to the condition $P(\bm{W}\mid \bm{U}) \in \mathscr{P}$ as the ``partial observability assumption'' in the following text. Under this assumption, we can naturally set our original goal as seeking the lower bound of $f(y \mid do(x))$ (the upper bound is symmetric) by solving the following partial identification problem: 
$f(y,x) + {\min }\!\! \sum_{i=1}^{d_u} {f(y,u_i,x) f(u_i,\neg x)}/{f(u_i,x)},$
    \text{subject~to~} ${f(y, \bm{W}, \bm{U},\bm{X}) \in \mathcal{F}}$. 
Here $f(y, \bm{W}, \bm{U},\bm{X})$ is a third-order $d_w \times d_u \times d_x$ tensor indicating the joint probability distribution of each $ w \in \bm{W}, u \in \bm{U}, x \in \bm{X}$ together with $\bm{Y}=y$. The set $\mathcal{F}$ = \{$f(y, \bm{W}, \bm{U},\bm{X}): f(y, \bm{W}, \bm{U},\bm{X})$ is compatible with $P(\bm{W}\mid \bm{U}) \in \mathscr{P}$\}. 

Achieving this goal is challenging due to the difficulty in achieving its tight bound. Firstly, the feasible region $\mathcal{F}$ is challenging to represent in a closed form due to the boundary constraints that include the partially observable $P(\bm{W} \mid \bm{U})$. This constraint can be seen as a first-kind Fredholm integral equation, which is an ill-posed problem when $P(\bm{W} \mid \bm{U})$ is irreversible\footnote{Otherwise, the closed-form expression of $\mathcal{F}$ can only be approximated iteratively by complex numerical methods~\citep{strand1968statistical}, which is beyond our scope.}. To address this issue, we propose relaxing the feasible region from $\mathcal{F}$ to $\mathcal{\widetilde{F}}$ ($\mathcal{F} \subseteq \mathcal{\widetilde{F}}$), which admits a closed-form expression. Specifically, the relaxed condition $f(y, \bm{W}, \bm{U},\bm{X}) \in \mathcal{\widetilde{F}}$ ensures that the feasible regions of $f(y,\bm{U}, x)$, $f(\bm{U}, x)$, and $f(\bm{U}, \neg x)$ can be represented in a calculable closed form. We refer to these as $IR_{F(y,U,x)}$, $IR_{F(U,x)}$, and $IR_{F(U,\neg x)}$, respectively, in our final objective function in the following section.

    Secondly, even if we retreat and seek its valid bound as above, it remains non-trivial due to the difficulty in finding an appropriate optimization method. Since the causal effect is expressed as a fractional summation, it is natural to explore techniques in sum-of-ratios fractional programming (SFP). The general form of SFP, as summarized by~\citet{schaible2003fractional}, is represented as follows: $\min \{\sum_{i=1}^{M} \frac{g_{1i}(\bm{\phi})}{g_{2i}(\bm{\phi})}\}, \bm{\phi} \in S, g_{1i}(\bm{\phi})~\text{is convex}, g_{2i}(\bm{\phi})~\text{is concave},g_{1i}(\bm{\phi}), g_{2i}(\bm{\phi}) > 0$.
% {
% \begin{equation}
% \begin{small}
%     \begin{aligned}
%     &\min \{\sum_{i=1}^{M} \frac{g_{1i}(\bm{\phi})}{g_{2i}(\bm{\phi})}\}, \bm{\phi} \in S, g_{1i}(\bm{\phi})~\text{is convex}, g_{2i}(\bm{\phi})~\text{is concave},\\ &g_{1i}(\bm{\phi})\geq 0, g_{2i}(\bm{\phi}) > 0,
%     \end{aligned}
% \end{small}
% \end{equation}
% }
Here $S$ is a convex set, and $M \geq 2$ is an integer. In order to ensure the global optimality of solutions, $g_{1i}(\bm{\phi})$ and $g_{2i}(\bm{\phi})$ are assumed to be convex and concave, respectively. In contrast with the above formulation, we should choose $   \bm{\phi} = (({f(y,u_i,x),...})^T,  (f(u_i,x), ...)^T,  (f(u_i,\neg x),...)^T),$ 
     and $g_{1i}(\bm{\phi}) = f(y,u_i,x)f(u_i,\neg x),g_{2i}(\bm{\phi}) = f(u_i,x).$
% {
% \begin{equation}
%     \begin{aligned}
%      &\bm{\phi} = (({f(y,u_i,x),...})^T,  (f(u_i,x), ...)^T,  \\&(f(u_i,\neg x),...)^T), \\
%     & g_{1i}(\bm{\phi}) = f(y,u_i,x)f(u_i,\neg x),\\&g_{2i}(\bm{\phi}) = f(u_i,x).
%     \end{aligned}
% \end{equation} 
% }
Moreover, $M = d_u, i=1,2,...d_u$. However, this construction violates the traditional convex-concave assumption, as $g_{1i}(\bm{\phi})$ is not convex. Therefore, here traditional SFP algorithms~\citep{schaible2003fractional} are not suitable.


To address these two challenges, we introduce the Partial Identification with Sum-of-Ratios Fractional Programming (PI-SFP) algorithm. Motivated by the branch and bound strategy~\citep{dai2005conical,lawler1966branch,branch-bound-SFP, pei2013global} and DC programming~\citep{horst1999dc,tao1997convex,pei2013global}, our algorithm iteratively searches the optimal bound through feasible region partitioning. Different from the closely-related partial identification literature~\cite{duarte2023automated, li2022bounds}, we also provide a comprehensive new convergence analysis. 

% Our paper not only theoretically contributes to the traditional branch-and-bound optimization literature~, but also 


% makes two contributions mainly on two sides: Firstly, for the causal bounding problem, our proposed method, PI-SFP, utilizes the fractional branch-and-bound optimization technique to estimate the causal effect. This technique has rarely been used in previous work. To our knowledge, existing literature only includes studies such as \cite{duarte2021automated} and \cite{li2022bounds}, which leverage branch-and-bound and fractional-programming frameworks, respectively. However, these studies do not take into account the partial observability of $P(\bm{W} \mid \bm{U})$, making them less general than PI-SFP. Secondly, in the optimization field, our method also contributes to existing literature on the convergence analysis in the branch and bound strategy, such as \citep{dai2005conical,lawler1966branch,branch-bound-SFP, pei2013global}.


In addition, due to these two challenges of solving the partial observability case, another line of recent literature has avoided further discussion on the observability of $P(\bm{W}\mid \bm{U})$. Instead, researchers have introduced an auxiliary variable $\bm{Z}$ and formalized the problem as the double negative control~\citep{miao2018identifying, cui2020semiparametric, tchetgen2020introduction, deaner2018proxy,shi2020multiply,singh2020kernel, nagasawa2018identification,kallus2021causal}. However, as illustrated in the introduction, there is no free lunch (Table~\ref{table_literature}). These works are restricted by additional assumptions about proxies, such as the completeness condition and the bridge function condition. Importantly, all of these works still rely on the reversibility of $P(\bm{W} \mid \bm{U})$, except for~\cite{ghassami2023partial, kallus2021causal}, who replaced it with a weaker bridge function condition. Consequently, even if the transition matrix is still reversible, but with a large condition number\footnote{The condition number of matrix $A$ is denoted as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$, where $\sigma_{\max}(A)$ and $\sigma_{\min}(A)$ denote the maximal and minimal singular values of $A$. If some rows/columns of $A$ are similar (or equal), then $\kappa(A)$ is large (or $+\infty$), and $A^{-1}$ is computationally hard (or may not exist).}, numerical computations would already become overwhelming. In conclusion, revisiting single-proxy control under the partial observability of $P(\bm{W} \mid \bm{U})$ is not only challenging but also necessary.
% This scenario represents a number of common real-world situations that are overlooked by double-proxy control methods, as illustrated above. In this paper, we explore the estimation of causal effects with this assumption and propose an algorithm called PI-SFP. 






\section{Method}\label{framework}
We begin by presenting definitions and assumptions and then rigorously formulate the objective function. 


\paragraph{DEFINITIONS}
% Based on preliminaries, we reorganize all definitions and assumptions.
We denote $Y,Y_{0},Y_{1}\in [Y^L,Y^U], Z \in [Z^L, Z^U], X \in [X^{L}, X^{U}]$, $W \in [W^L, W^U]$, $U\in [U^{L}, U^{U}]$. Moreover, we use $d_z, d_u, d_w,d_x$ to denote the cardinality of variables $\bm{Z}, \bm{U}, \bm{W}, \bm{X}$. For instance, the set of confounder $\bm{U}$ is $\{u_1,u_2,...u_{d_u}\}$. $X\neq x$ is simplified as $\neg x$. $Y$ can be discrete or continuous.


Moreover, $Y_x$ is the value of $\bm{Y}$ when $X$ is forced to be $x$. $\pi(x)$ is a weight function\footnote{In~\citep{kallus2021causal}, it is called as generalized average causal effect. It can degenerate to the traditional form~\citep{pearl2013testability} as
$ACE_{\bm{X}\rightarrow \bm{Y}} = E(Y_1)-E(Y_0)$ if we choose $d_x=2, X=\{0,1\}$, and $\pi(x) = \bm{sgn}(x)
$, where $\bm{sgn}(\cdot)$ is the sign function.}  of $\bm{X}$. On this basis, $ACE_{\bm{X} \rightarrow \bm{Y}}$ denotes the average causal effect (ACE) from $\bm{X}$ to $\bm{Y}$, namely that $ACE_{\bm{X} \rightarrow \bm{Y}} = \int_{x} \int_y y f(Y_x=y)\pi(x) dx dy$.



 








\begin{assumption}{{(partial observability)}} $P(\bm{W} \mid \bm{U}) \in \mathscr{P}$. \label{ass_partial_bounded}
\end{assumption}
Here the set $\mathscr{P}$ is defined in Section~\ref{pre+frame}, where $\underline{P^{}(\bm{W} \mid \bm{U})}$ and $\overline{P^{}(\bm{W} \mid \bm{U})}$ are two a~priori known matrices serving as lower and upper bounds of ${P^{}(\bm{W} \mid \bm{U})}$.

% According to Assumption.~\ref{ass_partial_bounded}, we can derive that


% \begin{equation}
%     \begin{aligned}
%     \left[\begin{matrix}
%      +\overline{P^{}(\bm{W} \mid \bm{U})} f(y,\bm{U},x) - f(y,\bm{W},x)\\
%     -\underline{P^{}(\bm{W} \mid \bm{U})} f(y,\bm{U},x) +f(y,\bm{W},x)
%     \end{matrix}\right]
%      \geq 0. 
%     \end{aligned}\label{partial_observe}
% \end{equation}
%  We are preparing to construct the relaxed feasible region of $f(y, \bm{U}, \bm{W}, \bm{X})$. In the following section, we will outline the model framework for identifying the valid bound of $f(Y_x = y)$ and extending it to the ACE case.


\paragraph{{OBJECTIVE FUNCTION}}\label{sec_obj_fun}

The objective of this section is to formalize the optimization problem of the single proxy control under Assumption.~\ref{ass_partial_bounded}. During this process, we aim to tackle the two challenges introduced in the preliminaries. Our primary objective is (the maximum case is symmetric):
\begin{equation}
    \begin{aligned}
    &\text{min~} f(y,x)  +  \sum_{i=1}^{d} \frac{f(y,u_i,x) f(u_i, \neg x)}{f(u_i,x)},\\&\text{subject to: $f(y,\bm{U},\bm{W}, \bm{X}) \in \mathcal{F}$}.
    \label{eqn_basic_bound}
    \end{aligned}
\end{equation}
Here $\mathcal{F} = \{f(y, \bm{U}, \bm{W}, \bm{X}): f(y, \bm{U}, \bm{W}, \bm{X})$ is compatible with Assumption.~\ref{ass_partial_bounded} and observed $f(y, \bm{W}, \bm{X}) \}$. 

 
As we suggested in the preliminaries, \textit{the first challenge} is the nonexistence of closed-form expression of $\mathcal{F}$. To solve it, we introduce the new symbol $\mathcal{\widetilde{F}}$ to formally describe the relaxation of the identification region of $f(y,\bm{U},\bm{W}, \bm{X})$. For preparation, we introduce the symbol $\bm{\theta}, \bm{\psi}, \bm{\omega}$ and follow the previous notation $\bm{\phi}$: 
$   \theta_i = f(y,u_i,x),
    \psi_i = f(u_i,x),
    \omega_i = f(u_i,\neg x),
    \bm{\theta} = (\theta_1, \theta_2,...\theta_d)^T,
    \bm{\psi} = (\psi_1, \psi_2,...\psi_d)^T,
    \bm{\omega} = (\omega_1, \omega_2,...\omega_d)^T,
    ~\bm{\phi} = 
    (\bm{\theta}~  \bm{\psi}~ \bm{\omega}).$

Then we construct a broader set $\mathcal{\widetilde{F}}$ as follows:  
% \begin{equation}
%     \begin{aligned}
%     \mathcal{\widetilde{F}}= \left\{f(y,\bm{U},\bm{W}, \bm{X}): \bm{\phi} \in IR_{\bm{\Phi}}, IR_{\bm{\Phi}} = IR^{1}_{\bm{\Phi}}\cap IR^{2}_{\bm{\Phi}} \right\},
%     \end{aligned}\label{construction_F}
% \end{equation}
{
\begin{equation*}
    \begin{aligned}
    \mathcal{\widetilde{F}}= \left\{f(y,\bm{U},\bm{W}, \bm{X}): \bm{\phi} \in IR_{\bm{\Phi}}, IR_{\bm{\Phi}} = IR^{1}_{\bm{\Phi}}\cap IR^{2}_{\bm{\Phi}} \right\},
    \end{aligned}\label{construction_F}
\end{equation*}
}
% \xinyan{
% \begin{equation}
%     \mathcal{\widetilde{F}}= \left\{
%     \begin{aligned} &f(y,\bm{U},\bm{W}, \bm{X}): \bm{\phi} \in IR_{\bm{\Phi}}, \\&IR_{\bm{\Phi}} = IR^{1}_{\bm{\Phi}}\cap IR^{2}_{\bm{\Phi}}
%     \end{aligned}\right\},
%     \label{construction_F}
% \end{equation}
% }

where $IR^{1}_{\bm{\Phi}}$ denotes the set of $\bm{\phi}$ satisfying $\overline{P(\bm{W}\mid \bm{U})} \bm{\phi} \geq \left[f(y, \bm{W}, x), f(\bm{W},x), f(\bm{W}, \neg x)\right] \geq \underline{P(\bm{W}\mid \bm{U})}\bm{\phi}$.
% \begin{equation}
% \begin{aligned}
%   &IR^{1}_{\bm{\Phi}} = \{\bm{\phi}: \left[\begin{matrix}
%   -\bm{I_{d*d}} \\ \bm{I_{d*d}}
%   \end{matrix}\right] \left[\begin{matrix}
%   &f(y,\bm{W},x)^{T} \\ &f(\bm{W},x)^{T} \\ &f(\bm{W},\neg x)^{T}
%   \end{matrix} \right]^{T}  -  \left[ \begin{matrix}
%   &-\overline{P(\bm{W}\mid \bm{U})} \\  &\underline{P(\bm{W} \mid \bm{U})}
%   \end{matrix} \right] \bm{\phi} \geq \bm{0}\}.\\
%   \end{aligned}\label{IR_1}
% \end{equation}
% {
% \begin{equation}
% \begin{aligned}
%   &IR^{1}_{\bm{\Phi}} = \{\bm{\phi}: \left[\begin{matrix}
%   -\bm{I_{d*d}} \\ \bm{I_{d*d}}
%   \end{matrix}\right] \left[\begin{matrix}
%   &f(y,\bm{W},x)^{T} \\ &f(\bm{W},x)^{T} \\ &f(\bm{W},\neg x)^{T}
%   \end{matrix} \right]^{T}  \\&-  \left[ \begin{matrix}
%   &-\overline{P(\bm{W}\mid \bm{U})} \\  &\underline{P(\bm{W} \mid \bm{U})}
%   \end{matrix} \right] \bm{\phi} \geq \bm{0}\}.\\
%   \end{aligned}\label{IR_1}
% \end{equation}
% }
Here we use $\bm{S}_1 \geq \bm{S}_2$ to denote that $\bm{S}_1 - \bm{S}_2$ is a non-negative matrix, and $\bm{I_{d*d}}$ denotes the $d \times d$ identity matrix. Moreover, the set $IR^{2}_{\bm{\Phi}}$ contains all possible $\bm{\phi}$ such that $\bm{1}* \bm{\theta} = f(y,x), \bm{1}* \bm{\psi} = f(x), \bm{1}* \bm{\omega} = f(\neg x)$, and $\forall i, \theta_i \in [0, f(y,x)], \psi_i \in (0,f(x)], \omega_i \in [0,f(\neg x)]$.


% the natural constraints:

% \begin{equation}
%     \begin{aligned}
%     IR_{\bm{\Phi}}^{2} = \left\{ \bm{\phi}:
%     \left[\begin{matrix}
%     &\bm{1_{1*d}} \bm{\theta} \\ &\bm{1_{1*d}} \bm{\phi} \\ &\bm{1_{1*d}} \bm{\omega} 
%     \end{matrix}\right] = \left[ \begin{matrix}
%     &f(y,x)\\ &f(x)\\ &f(\neg x) \}
%     \end{matrix}\right], \forall i,
% \left\{\begin{matrix}
%     \theta_i \in [0,f(y,x)] \\
%     \phi_i \in (0,f(x)] \\
%     \omega_i \in [0,f(\neg x)]
% \end{matrix} \right\}. \right\}    .
%     \end{aligned}
% \end{equation}

% {
% \begin{equation}
%     IR_{\bm{\Phi}}^{2} = \left\{ \begin{aligned}
%     &\bm{\phi}:
%     \left[\begin{matrix}
%     &\bm{1_{1*d}} \bm{\theta} \\ &\bm{1_{1*d}} \bm{\phi} \\ &\bm{1_{1*d}} \bm{\omega} 
%     \end{matrix}\right] = \left[ \begin{matrix}
%     &f(y,x)\\ &f(x)\\ &f(\neg x) \}
%     \end{matrix}\right], \\ &\forall i,
% \left\{\begin{matrix}
%     \theta_i \in [0,f(y,x)] \\
%     \phi_i \in (0,f(x)] \\
%     \omega_i \in [0,f(\neg x)]
% \end{matrix} \right\}. \end{aligned} \right\}.
% \end{equation}
% }
Here $\bm{1}$ denotes the corresponding all-ones vector. By this construction, the enclosure property $\mathcal{F} \subseteq \widetilde{\mathcal{F}}$ is guaranteed.
\begin{proposition} $\mathcal{F}$ is enclosed by $\widetilde{\mathcal{F}}$, namely that $\mathcal{F} \subseteq \mathcal{\widetilde{F}}$. \label{basic_IR}
\end{proposition}

The proof is given in Appendix~\ref{proof_basic_IR}. Proposition~\ref{basic_IR} extends the feasible region of $f(y, \bm{U}, \bm{W}, \bm{X})$ from $\mathcal{F}$ to $\mathcal{\widetilde{F}}$. On this basis, Eqn~\eqref{eqn_basic_bound} is relaxed as follows: $\underline{f(Y_x = y)} =  \text{min~} f(y,x) +  \sum_{i=1}^{d}\theta_{i}\omega_{i}/ {\psi_{i}},~\text{subject to:~} f(y,\bm{U},\bm{W}, \bm{X}) \in \mathcal{\widetilde{F}}, \text{~i.e.,~} \bm{\phi} \in IR_{\bm{\Phi}}.$
  
Symmetrically, the optimal value is denoted as $\overline{f(Y_x = y)}$ for the maximum case. Moreover, the corresponding set of optimal solutions is denoted as $\bm{\Phi_{opt}}$. The following proposition discusses the tightness of $\underline{f(Y_x = y)}$:
\begin{proposition}\label{constraint_prove_tight}
The outcome $\underline{f(Y_x = y)}$ serves as the valid lower bound of $f(Y_x = y)$. Moreover, this bound is {tight} {if and only if} the following set is not empty:
$\Big\{f(y,\bm{U},\bm{W},\bm{X}) : f(y,\bm{U},\bm{W},\bm{X})\in \mathcal{F} \text{~and is compatible with some~} \bm{\phi_{opt}}\in \bm{\Phi_{opt}}\Big\} \neq \emptyset$, where $\bm{\phi_{opt}}$ is an element of the set $\bm{\Phi_{opt}}$. The maximum case $\overline{f(Y_x = y)}$ is symmetric. \label{proposition_tight}
\end{proposition}


As discussed in the preliminaries, guaranteeing that $\underline{f(Y_x = y)}$ is a tight bound is beyond the current scope of the community; hence Proposition~\ref{constraint_prove_tight} is already the optimal result. In practice, it can be verified that the non-emptiness condition in Proposition~\ref{constraint_prove_tight} holds in a number of cases, which are detailed in Appendix~\ref{equal_reform_algorithm1}.


We now aim to tackle \textit{the second challenge}: the non-trivial nature of the fractional programming problem due to the invalidation of the convex-concave condition. To address this issue, we adopt the \textit{difference-in-convex (DC)} decomposition strategy to formally describe how we relax the above formulation into a linear programming problem. To prepare for this, we first transform the fractional form and introduce the knockoff variable $\bm{\psi^o}$ to replace the denominator. Next, we introduce the $4d$-dimensional vector $\bm{\gamma}$: $\bm{\psi^o} = (\psi_{1}^{o}, \psi_{2}^o,\dots, \psi_{d}^{o})^T,~\bm{\gamma} = \left(\begin{matrix}
    (\bm{\psi^o})^T, \bm{\theta}^T, \bm{\psi}^T, \bm{\omega}^T
    \end{matrix} \right)^T, \text{where~} (\bm{\theta}, \bm{\psi}, \bm{\omega}) \text{~is copied from~} \bm{\phi}.$

Then our original function is equivalently transformed to
\begin{equation}
    \begin{aligned}
    &\underline{f(Y_x = y)} = \text{~min~} f(y ,x) + \sum_{i=1}^{d} \psi_{i}^{o} \theta_{i} \omega_{i} \\
    &\text{~subject to}: \bm{\gamma} \in  IR_{\Gamma}: = \{\bm{\gamma}: \bm{\phi} \in IR_{{\Phi}},
    \psi_{i}^{o} \psi_{i}  = 1\}.
    \end{aligned}   \label{re-formulation}
\end{equation}
Here $i=1,...d.$ Although the knock-off trick has been implemented, achieving the final goal remains challenging in practice. Firstly, the objective function and constraints are both non-convex and nonlinear. Secondly, relying on local optimal algorithms alone is not viable, as it may not guarantee the validity of the bound $\underline{f(Y_x = y)}$. Thus, our motivation is to construct a weaker linear programming form that can approximate the global optimal value of (\ref{re-formulation}). To achieve this, we propose applying the \textit{difference-in-convex (DC)} decomposition as our core idea: $ \forall \bm{\gamma},$
% \begin{equation}
%     \begin{aligned}
%      \forall \bm{\gamma}, &\sum_{i=1}^{d}\psi_{i}^o \theta_{i} \omega_{i} = C_1(\bm{\gamma}) -C_2(\bm{\gamma}),~\psi_{i}^{o}\psi_{i} = D_{i1}(\bm{\gamma}) - D_{i2}(\bm{\gamma}),  \label{DC_decomposition}
%     \end{aligned}
% \end{equation}
\hspace{-20mm}
\begin{equation}
    \begin{aligned}
     \sum_{i=1}^{d}\psi_{i}^o \theta_{i} \omega_{i} = C_1(\bm{\gamma}) -C_2(\bm{\gamma}),~\psi_{i}^{o}\psi_{i} = D_{i1}(\bm{\gamma}) - D_{i2}(\bm{\gamma}),  \label{DC_decomposition}
    \end{aligned}
\end{equation}
where $i=1, 2, \cdots, d$, and $C_1(\bm{\gamma}), C_2(\bm{\gamma}), D_{i1}(\bm{\gamma}), D_{i2}(\bm{\gamma})$\footnote{Note that the subscript $cyc$ in~\eqref{dc_decomposition} is an abbreviation of cyclic sum following~\citet{du2012note}, which cycles through $\{\psi^o_i,\theta_{i},\omega_{i}\}$ in the corresponding function and takes the sum. For instance, we have $\sum_{cyc}[\psi^o_i+\theta_i^2]^2 = [\psi^o_i+\theta_i^2]^2 + [\theta_{i}+\omega_{i}^2]^2 + [\omega_{i}+(\psi^o_{i})^2]^2$.} are all convex functions (see Appendix~\ref{app_main_result_1}) satisfying that $C_1(\bm{\gamma}), C_2(\bm{\gamma}), D_{i1}(\bm{\gamma}) :=$
% {\begin{equation}
%     \begin{aligned}
%       &C_1(\bm{\gamma}) =\sum_{i=1}^{d} \frac{1}{6}(\sum\limits_{cyc} \psi^o_i )^3+\frac{1}{2}\sum\limits_{cyc}(\psi^o_i)^4 + \frac{1}{2}\sum\limits_{cyc}(\psi^o_i)^2,\\
%     &C_2(\bm{\gamma}) = \sum_{i=1}^{d}  \frac{1}{6}\sum\limits_{cyc} (\psi^o_i)^3+\frac{1}{4}\sum_{cyc}[(\psi^o_i)^2+\theta_i]^2 + \frac{1}{4}\sum_{cyc}[\psi_i^o+\theta_i^2]^2,\\
%     &D_{i1}(\bm{\gamma}) = \frac{1}{2}(\psi_i^o+\psi_i)^2,~
%     D_{i2}(\bm{\gamma}) = \frac{1}{2}[(\psi^o_i)^2 + (\psi_i)^2].
%     \end{aligned}\label{dc_decomposition}
% \end{equation}}
\begin{equation}
    \begin{aligned} 
    \hspace{-2em}
      &\sum_{i=1}^{d} \frac{1}{6}(\sum\limits_{cyc} \psi^o_i )^3+\frac{1}{2}\sum\limits_{cyc}(\psi^o_i)^4 + \frac{1}{2}\sum\limits_{cyc}(\psi^o_i)^2,\\
    & \sum_{i=1}^{d}  \frac{1}{6}\sum\limits_{cyc} (\psi^o_i)^3+\frac{1}{4}\sum_{cyc} \bigl\{[(\psi^o_i)^2+\theta_i]^2+ [\psi_i^o+\theta_i^2]^2\bigr\},\\
    & \frac{1}{2}(\psi_i^o+\psi_i)^2,~
    D_{i2}(\bm{\gamma}) = \frac{1}{2}[(\psi^o_i)^2 + (\psi_i)^2],
    \end{aligned}  \label{dc_decomposition}
\end{equation}
respectively. Exploiting their convexity, we bound them by the following linear functions, which are constructed by secants and tangents of the original function: 
% \begin{equation}
% \begin{aligned}
% &C_1(\bm{\gamma}) -C_2(\bm{\gamma})  \geq  C_1^{\text{tan}}(\bm{\gamma}) - C_2^{\text{sec}}(\bm{\gamma}), \\ &
%  D_{i1}(\bm{\gamma}) - D_{i2}(\bm{\gamma}) \in [D_{i1}^{\text{tan}}(\bm{\gamma}) - D_{i2}^{\text{sec}}(\bm{\gamma}), D_{i1}^{\text{sec}}(\bm{\gamma}) - D_{i2}^{\text{tan}}(\bm{\gamma})]
% \end{aligned}
% \end{equation}
{
\begin{small}
\begin{equation}
\begin{aligned}
&C_1(\bm{\gamma}) -C_2(\bm{\gamma})  \geq  C_1^{\text{tan}}(\bm{\gamma}) - C_2^{\text{sec}}(\bm{\gamma}), \\ &
 D_{i1}(\bm{\gamma}) - D_{i2}(\bm{\gamma}) \in [D_{i1}^{\text{tan}}(\bm{\gamma}) - D_{i2}^{\text{sec}}(\bm{\gamma}), D_{i1}^{\text{sec}}(\bm{\gamma}) - D_{i2}^{\text{tan}}(\bm{\gamma})]
\end{aligned}
\end{equation}
\end{small}

}
For their explicit forms, we refer readers to~\eqref{tan_sec}.
This allows us to relax the original problem in~(\ref{re-formulation}) into the following linear program:
\begin{equation}
    \begin{aligned}
    &\text{min~}f(y ,x) + C_1^{\text{tan}}(\bm{\gamma}) - C_2^{\text{sec}}(\bm{\gamma}) \\
    &\text{subject to}: \bm{\phi} \in IR_{{\Phi}}, D_{i1}^{\text{tan}}(\bm{\gamma}) - D_{i2}^{\text{sec}}(\bm{\gamma}) \leq 1 , \\ &D_{i1}^{\text{sec}}(\bm{\gamma}) - D_{i2}^{\text{tan}}(\bm{\gamma}) \geq 1, i=1, 2, \cdots, d.
    \end{aligned}   \label{re-formulation_linear_weaker}
\end{equation}
It is clear that this relaxation introduces an estimation error. In order to eliminate it in practice, we iteratively apply the DC decomposition within simplicially partitioned feasible regions. Details will be shown in the following section. In conclusion, we have now addressed the two challenges raised in the preliminaries. In addition, our framework extends naturally to the ACE cases, and we refer readers to Appendix~\ref{ACE_APP} for details.






\section{Algorithm}\label{section_algorithm}
In this section, we demonstrate how to compute $\underline{f(Y_x = y)}$ in~\eqref{re-formulation} practically. As mentioned earlier, this involves optimizing a non-convex function, which requires new optimization techniques to find the global optimum. To this end, we propose Partial Identification via Sum-of-ratios Fractional Programming (PI-SFP), a fractional programming-based method that optimizes the objective through iterative approximation. Specifically, we begin by constructing a simplex $S_0$ that encloses the feasible region of~\eqref{re-formulation}. We then use $S_0$ to identify a lower bound of $\underline{f(Y_x = y)}$ using the difference-in-convex (DC) decomposition strategy. In each iteration, we partition $S_{0}$ into multiple simplices to refine the lower bound constructed in the initial step.

The remainder of this section is organized as follows. In Section~\ref{sec_framework}, we introduce the main framework of our algorithm, which we divide into four modules: 1) \textbf{Initialization}, 2) \textbf{Bisection}, 3) \textbf{Bounding}, and 4) \textbf{Global\_error}. We then elaborate on each module in detail in Section~\ref{initialization_section}. For ease of notation, we introduce the following symbols for algorithm description: $\bullet$ For a simplex $S$, $dia(S) := \max_{s_1,s_2 \in S_{}}\|s_1 - s_2\|_2$ denotes its diameter, and $S^i$ denotes its $i$-th supporting vector, $i \in \{0,1,\dots,4d\}$.
$\bullet$ $\underline{f_{S}(Y_{x} = y)}$ denotes the optimal value of (\ref{re-formulation}) when its feasible region is restricted to $\bm{\gamma} \in IR_{{\Gamma}} \cap S$.







\subsection{Overview of PI-SFP}\label{sec_framework}

The framework of PI-SFP to solve (\ref{re-formulation}) is as follows.

%\iffalse
%\begin{algorithm}[t]
%\caption{PI-SFP.}
%\begin{algorithmic}[1]
%\REQUIRE $k \leftarrow 0$, $\underline{\underline{f(Y_x = y)}} \leftarrow -\infty$, $L \leftarrow 0$, $\varepsilon >0 $. 
%\ENSURE $\underline{\underline{f(Y_x = y)}}$. 
%\WHILE{$\text{error} \leq \varepsilon$}
%\STATE {\textbf{Step~1:~}}$S_{0}=\textbf{Initialization} (f(y, \bm{W}, \bm{X}), \underline{P(\bm{W} \mid \bm{U})}, \overline{P(\bm{W} \mid \bm{U})})$.
%\STATE {\textbf{Step~2:~}}$\underline{\underline{f(Y_x = y)}} \leftarrow \min\limits_{i\in \{0,1,...k\}}\textbf{Bounding}(S_{ki})$;
%\STATE {\textbf{Step~3:~}}$\{S_{(k+1)i}\}_{i=0}^{k+1} \leftarrow \textbf{Bisection} (\{S_{ki}\}_{i=0}^{k}, \arg\min\limits_{i\in \{0,1,...k\}}\textbf{Bounding}(S_{ki}) )$;
%\STATE {\textbf{Step~4:~}}$\text{error} \leftarrow \textbf{Global\textbf{\_error}}(\text{Bi}), k \leftarrow k+1$;
%\ENDWHILE
%
%\RETURN $\underline{\underline{f(Y_x = y)}}$.
%
%\end{algorithmic}\label{PI-SFP}
%\end{algorithm}
%\fi

%\iffalse. %example
\begin{algorithm}[t]
\normalem
\caption{Partial Identification via Sum-of-ratios Fractional Programming (PI-SFP).}% algorithm title
\LinesNumbered % show line numbers
\KwIn{Observational distribution $f(y, \bm{W}, \bm{X})$, $\underline{P(\bm{W} \mid \bm{U})}$, $\overline{P(\bm{W} \mid \bm{U})}$, a prespecified error bound $\delta >0$.}
\KwOut{A lower bound estimate ${\underline{f^k_{opt}(Y_x = y)}}$. }

Let $k = 0$, construct an original simplex
$S_0 = \textbf{Initialization} (f(y, \bm{W}, \bm{X}), \underline{P(\bm{W} \mid \bm{U})}, \overline{P(\bm{W} \mid \bm{U})});$

Calculate a lower bound of $\underline{f_{S_{0}}(Y_x = y)}$ via the $\textbf{Bounding}$ function:
$\underline{\underline{f_{S_{0}}(Y_x = y)}} = \textbf{Bounding}(S_{0});$

Set the collection of simplices at the $0$-th iteration as $\cS_0 = \{S_{0}\}$;

\While {$\text{{PI-SFP}}_{\text{{error}}} > \delta$} {
	
Let $\tilde{S}_k = \argmin_{S \in \cS_k} \underline{\underline{f_{S}(Y_x = y)}}$, where $\underline{\underline{f_{S}(Y_x = y)}}$ denotes the output of $\textbf{Bounding}(S)$ with input $S$;

Split  $\tilde{S}_k$ into two simplices $\tilde{S}_{k1}$ and $\tilde{S}_{k2}$ via the $\textbf{Bisection}$ function:
$
\tilde{S}_{k1}, \tilde{S}_{k2} = \textbf{Bisection} (\tilde{S}_k)
$
and set
$
\cS_{k + 1} = \left(\cS_k \setminus \{\tilde{S}_k\} \right) \cup \{\tilde{S}_{k1}, \tilde{S}_{k2}\};
$

Calculate the estimation error bound via
\emph{$\text{{PI-SFP}}_{\text{{error}}}$} = $ \textbf{Global\textbf{\_error}}(\tilde{S}_0, \tilde{S}_1, \cdots, \tilde{S}_{k+1});$

Set $k = k + 1$;
}

%\ENDWHILE

Return $\underline{f^k_{opt}(Y_x = y)} = \max\limits_{i\in \{0,1,...k\}} \underline{\underline{f_{\tilde{S}_i}(Y_x = y)}}$.\label{main_alg}
\end{algorithm}


%\usepackage[lined,boxed,commentsnumbered]{algorithm2e}
\iffalse
\begin{algorithm}[H]
  \SetAlgoLined
  \KwData{this text}
  \KwResult{how to write algorithm with \LaTeX2e }

  initialization\;
  \While{not at end of this document}{
    read current\;
    \eIf{understand}{
      go to next section\;
      current section becomes this one\;
      }{
      go back to the beginning of current section\;
      }
    }
  \caption{How to write algorithms}
\end{algorithm}
\fi


\textbf{Step 1} involves pre-processing, where the $\textbf{Initialization}$ function is used to construct a baseline simplex $S_0$ enclosing the original feasible region, such that $IR_{\Gamma} \subseteq S_0$ and $\underline{f(Y_x = y)} = \underline{f_{S_0}(Y_x = y)}$ (see Lemma~\ref{lemma_enclose} in Appendix~\ref{app_main_result_1}). This equivalent transformation enables the computation of $\underline{f(Y_x = y)}$ via $\underline{f_{S_0}(Y_x = y)}$. In \textbf{Step 2}, we use the DC decomposition strategy as in~\eqref{DC_decomposition}-\eqref{re-formulation_linear_weaker} to find a lower bound of $\underline{f_{S_0}(Y_x = y)}$, which is denoted by $\underline{\underline{f_{S_0}(Y_x = y)}}$.

In \textbf{Steps 4-9}, we employ a bisection-like approach to iteratively partition $S_0$ into a set of simplices $\cS_k$ in the $k$-th iteration. Next, we reapply the DC decomposition strategy to new simplices to obtain a more accurate estimate. We stop and return the lower bound estimate in \textbf{Step 10} once the bounding error calculated by \textbf{Global\_error} reaches the prespecified threshold $\delta$. Otherwise, we make more delicate partitions and iterate this step.

%\textbf{Step~3-5} is for reducing bounding errors. To approximate $\underline{f_{S_{0}} (Y_x = y)}$ more closely, we combine the above DC relaxation strategy with a simplicial partitioning. Specifically, we iteratively partition $S_{0}$ to multiple simplices and derive the lower bound within each simplex. In $k$-th iteration, the original simplex $S_{0}$ is transformed as $\{S_{ki}\}_{i=0}^k$ such that $\underline{f_{S_{0}}(Y_x = y)} = \min \{\underline{f_{S_{ki}}(Y_x = y)}\}_{i=0}^k$ (see lemma.~\eqref{lemma_ObjS0_equal_ObjSk} in Appendix~\ref{app_main_result_1}). In this process, we seek linear bound $\underline{\underline{f_{S_{ki}}(Y_x = y)}}$ on each simplex $S_{ki}$, and \emph{the lowest of which} is selected as the temporary optimal lower bound of $\min \{\underline{f_{S_{ki}}(Y_x = y)}\}_{i=0}^k (\text{equal to our goal}~\underline{f(Y_x = y)})$. We denote it as $\underline{\underline{f(Y_x = y)}}[k]$. Finally, if the bounding error calculated by \textbf{Global\_error} is not below $\delta$, we further partition $\{S_{ki}\}_{i=0}^k$ into more simplices $\{S_{(k+1)i}\}_{i=0}^{k+1}$ and repeat the same bounding strategy.





\iffalse

\textbf{Step~3-4} is for iterative approximation. In $k$-th iteration, the above $S_{0}$ has already been replaced by the set of disjoint partitions $\{S_{ki}\}_{i=0}^{k}$ satisfying
\begin{equation}
    \begin{aligned}
     \bigcup_{i \in \{0,1,...k\}} S_{ki} = S_{0}, \text{int} (S_{ki}) \cap \text{int} (S_{kj}) = \emptyset , i,j \in \{0,1,...k\}, i \neq j,~k=0,1,2,...
    \end{aligned}
\end{equation}
whose first subscript represents the iteration number and the second represents the simplex index. On this basis, by function $\textbf{Bounding}$ in \textbf{Step~3}, we seek the lower bound of Eqn~\eqref{re-formulation} when the feasible region is within each $S_{ki}$. The lowest bound during the traversal on all partitions is recorded as $\underline{\underline{f(Y_x = y)}}$ as the temporary output. Further, \textbf{Step~4} provides the iterative transforming strategy from $\{S_{ki}\}_{i=0}^{k}$ to $\{S_{(k+1)i}\}_{i=0}^{k+1}$. By function \textbf{Bisection}, we adopt the simplicial bisection method~\citep{wood1992bisection} extended from the one-dimensional interval, in respect of which the partition with the temporary lowest bound $\underline{\underline{f(Y_x = y)}}$ is bisectioned. The new partitioning set $\{S_{(k+1)i}\}_{i=0}^{k+1}$ is prepared for the next iteration, and more importantly, it guarantees the convergence of $\underline{\underline{f(Y_x = y)}}$ to $ \underline{f(Y_x = y)}$ during recursion.


\textbf{Step~5} is for termination. In function $\textbf{Global\textbf{\_error}()}$, \iffalse\footnote{The concept of nested sequences is from~\citep{horst2000introduction}. It denotes a sequence of simplex where each simplex is always bisectioned from the front. In which the size of the feasible region decays rapidly.}\fi the error bound of PI-SFP is theoretically produced by the index from \textbf{Step~3}. We terminate PI-SFP until it reaches the threshold $\varepsilon$, and the lowest bound on the partitioning set is taken as the final approximation.

\fi









%\newpage
\subsection{Implementation of PI-SFP}
\label{initialization_section}

In this section, the above four functions are illustrated in detail.
\noindent ${\textbf{1) Initialization()}}$: The objective of this function is to construct an original simplex $S_0$ that encloses the feasible region $IR_{\Gamma}$. To achieve this, we draw inspiration from~\citep{horst2000introduction, pei2013global} and use the following approach to construct $S_0$. The justification of such construction is given in Lemma~\ref{lemma_enclose}. Here $S_0$ denotes the set of $ \bm{\gamma} = (\gamma_1, \cdots, \gamma_{4d})$ satisfying $\textbf{1} * \bm\gamma \leq \alpha$ and $\gamma_i \geq \gamma_i^l := \min\limits_{\bm\gamma \in IR_{\Gamma}} \gamma_i$ for $1\leq i\leq 4d$, with $\alpha = 1 + f(y, x) + 
		\frac{d^2 (\psi^{l}+ \psi^{u})^2 }{4f(x) \psi^{l} \psi^{u}  } $, where \begin{equation}\begin{aligned}\psi^{l} = \min\limits_{i \in [2d+1,3d]}\!\! \gamma_{i}^l, \psi^{u} = \max\limits_{i \in [2d+1,3d]}\!\! \gamma_{i}^u, \gamma^{u}_i\!\! :=\!\!\max\limits_{\bm\gamma \in IR_{\Gamma}} \gamma_i.
	\label{initialization_S00}	\end{aligned}\end{equation}
% {
% \begin{equation}
%     \begin{aligned}
%     %  S_0 &= \{ \bm{\gamma} = (\gamma_1, \cdots \gamma_{4d}): \textbf{1} * \bm\gamma \leq \alpha, \gamma_i \geq \gamma_i^l, 1\leq i\leq 4d_u \},  \text{~where~}  \gamma^{l}_i = \min\limits_{\bm\gamma \in IR_{\Gamma}} \gamma_i,  \\
% 	\alpha &=  1 + f(y, x) + 
% 		\frac{d^2 (\psi^{l}+ \psi^{u})^2 }{4f(x) \psi^{l} \psi^{u}  }, \\ \left[\begin{matrix}
% 	\psi^{l}, \psi^{u}
% \end{matrix}\right] \!\!:&= \!\!\left[  \begin{matrix}\!\!
% 	\min\limits_{i \in [2d+1,3d]}\!\! \gamma_{i}^l, \!\! \max\limits_{i \in [2d+1,3d]}\!\! \gamma_{i}^u 
% \end{matrix}\right], \gamma^{u}_i\!\! :=\!\!\max\limits_{\bm\gamma \in IR_{\Gamma}} \gamma_i.
%     \end{aligned}\label{initialization_S00}
% \end{equation}
% }


%\setcounter{algocf}{0}
%\renewcommand{\algorithmcfname}{Function}

%\renewcommand{\thefootnote}{\ding{\numexpr192 }}

% \begin{algorithm}[h]
% %\begin{algorithmic}[1]
% \caption{\textbf{Initialization}.}
% \LinesNumbered %要求显示行号
% \KwIn {$f(y, \bm{W}, \bm{X}), \underline{P(\bm{W} \mid \bm{U})}, \overline{P(\bm{W} \mid \bm{U})}$.}
% \KwOut {The original simplex $S_{0}$.}

% Set $\gamma_{i}^l$, which serves as the lower bound of $\bm{\gamma}$, namely $\gamma^{l}_i = \min\limits_{\bm\gamma \in IR_{\Gamma}} \bm\gamma \bm{\vec{e_i}}$. 

% Set $\alpha$, which serves as the upper bound of $\max\limits_{\bm\gamma \in IR_{\Gamma}} \bm{1_{1*4d}}\bm{\gamma}$, namely
% \begin{equation}
% \begin{aligned}
%  \alpha =  1 + f(y, x) + 
% \frac{d^2 (\psi^{l}+ \psi^{u})^2 }{4f(x) \psi^{l} \psi^{u}  },
% \end{aligned}
% \end{equation}
% where $ \left[\begin{matrix}
% \psi^{l}\\ \psi^{u}
% \end{matrix}\right] = \left[  \begin{matrix}
% \min\limits_{i \in [2d+1,3d]} \gamma_{i}^l \\ \max\limits_{i \in [2d+1,3d]} \gamma_{i}^u 
% \end{matrix}\right]$. Symmetrically, here we denote $\gamma^{u}_i = \max\limits_{\bm\gamma \in IR_{\Gamma}} \bm\gamma \bm{\vec{e_i}}$.


% Simplex construction: 
% $S_{0}$ is spanned by $\{S_{0}^0, S_{0}^1, ...S_{0}^{4d}\}$, where\footnotemark{}
% \begin{equation}
%     \begin{aligned}
%       S_{0}^{i} =  \begin{cases}
% \bm{\gamma^l},~i=0 \\ \bm{\gamma^l}+ (\alpha - \bm{1_{1*4d}}\bm{\gamma^l})*\bm{\vec{e_i}} ,~i \in \{1,2,...4d\}
% \end{cases}, \text{~where~} \bm{\gamma^{l}} = (\gamma^{l}_1, \gamma^{l}_2,...\gamma^{l}_{4d})^T.
%     \end{aligned}\label{span_S00}
% \end{equation}



% Return $S_{0}$.
% %\end{algorithmic}
% \end{algorithm}

% \footnotetext{Recall that the definition of the simplex is $S_{0} = \{\sum_{i=0}^{4d} \lambda_i S_{0}^{i} | \sum_{i=0}^{4d} \lambda_i = 1, \lambda_i \geq 0\}$.}

 \noindent ${\textbf{2) Bisection()}}$: Motivated by the approach proposed in~\cite{rivara1984mesh}, the goal of this function is to partition an input simplex $S$ into two simplices $S_1$ and $S_2$ using the longest-edge (LE) bisection strategy. The partitioning details are outlined in Algorithm~\ref{alg_bisection}.


\begin{algorithm}[t]
%\begin{algorithmic}
\caption{Recursive procedure to split the simplicial partitions (\textbf{Bisection}).}
\LinesNumbered
\KwIn{Simplex $S$ with vertices $\{S^0, \cdots, S^{4d}\}$.}
\KwOut{Two new simplices $S_1, S_2$.}

Set $S^{t_1}, S^{t_2}$ as the vertices incident to the longest edge of $S$: $\{t_1, t_2\} = \argmax\limits_{\{a, b\} \subseteq \{0, 1, \cdots, 4d\}} \|S^a - S^b\|_2;$

% Construct $S_1, S_2$ based on the following two sets of vertices: $\{S^{0}, \cdots, S^{t_1 - 1}, v, S^{t_1 + 1}, \cdots, S^{4d}\},\quad \{S^{0}, \cdots, S^{t_2 - 1}, v, S^{t_2 + 1}, \cdots, S^{4d}\},$
% where $v$ corresponds to the midpoint the longest edge. Return $S_1,  S_2$.\label{alg_bisection}

{
Construct $S_1, S_2$ based on the following two sets of vertices: $\{S^{0}, \cdots, S^{t_1 - 1}, v, S^{t_1 + 1}, \cdots, S^{4d}\}$,  $\{S^{0}, \cdots, S^{t_2 - 1}, v, S^{t_2 + 1}, \cdots, S^{4d}\},$
where $v = (S^{t_1} + S^{t_2})/2$ is the midpoint of the longest edge.\label{alg_bisection}}
%\end{algorithmic}
\end{algorithm}

% \footnotetext{The supporting vertices are
%     $
%         \{S^1_{k\text{Bi}[k]},...S^{s_1-1}_{k\text{Bi}[k]},\bm{v},S^{s_1+1}_{k\text{Bi}[k]},... S^{4d+1}_{k\text{Bi}[k]}\}, \{S^1_{k\text{Bi}[k]},...S^{s_2-1}_{k\text{Bi}[k]},\bm{v},S^{s_2+1}_{k\text{Bi}[k]},... S^{4d+1}_{k\text{Bi}[k]}\} 
%     $ \\
% respectively, where $ \{s_1,s_2\} = \arg\max\limits_{\{t_1,t_2\}\in \{0, ..., 4d\}}  \|  S_{k\text{Bi}[k]}^{t_1} -S_{k\text{Bi}[k]}^{t_2} \|_2, \bm{v} = \frac{S_{k\text{Bi}[k]}^{s_1} + S_{k\text{Bi}[k]}^{s_2}}{2}$.
% }

\noindent ${\textbf{3) Bounding()}}$: The purpose of this function is to derive a lower bound of $\underline{f_S(Y_x = y)}$ using the input $S$. This is the most crucial element of the algorithm. It is worth remembering that $\underline{f_S(Y_x = y)}$ can be expressed as the solution of the optimization program~\eqref{re-formulation} with an additional constraint $\bm{\gamma} \in S$. With the derivations shown in (\ref{DC_decomposition})-(\ref{re-formulation_linear_weaker}), we can easily obtain a lower bound of $\underline{f_S(Y_x = y)}$ by solving the following optimization problem\footnote{If $IR_{\Gamma} \cap S = \emptyset$, then $\underline{\underline{f_S(Y_x = y)}} = +\infty$. }:
% \begin{equation}
%     \begin{aligned}
%      \underline{\underline{f_{S_{}}(Y_{x} = y)}}= &\text{~min~} f(y,x) + C_1^{\text{tan}}(\bm{\gamma}) - C_2^{\text{sec}}(\bm{\gamma}) \\
% &\text{~subject to:~} \bm{\phi} \in IR_{{\Phi}},\bm{\gamma} \in S_{};~D_{i1}^{\text{tan}}(\bm{\gamma}) - D_{i2}^{\text{sec}}(\bm{\gamma}) \leq 1, D_{i1}^{\text{sec}}(\bm{\gamma}) - D_{i2}^{\text{tan}}(\bm{\gamma}) \geq 1, i=1,...d.
% %  \underline{\underline{f_{S_{}}(Y_{x} = y)}} =  &\max \{ \textbf{Bounding}(\text{pa}(S_{})), \underline{f_{S_{}}^{linear}(Y_{x} =  y)}\} ;
% \label{lemma_sub_linear}
%     \end{aligned}
% \end{equation}
{
\begin{equation}
    \begin{aligned}
     &\underline{\underline{f_{S_{}}(Y_{x} = y)}}= \text{~min~} f(y,x) + C_1^{\text{tan}}(\bm{\gamma}) - C_2^{\text{sec}}(\bm{\gamma}) \\ &\text{~subject to:~} \bm{\phi} \in IR_{{\Phi}},\bm{\gamma} \in S_{};D_{i1}^{\text{tan}}(\bm{\gamma}) - D_{i2}^{\text{sec}}(\bm{\gamma}) \leq 1, \\~~~~&D_{i1}^{\text{sec}}(\bm{\gamma}) - D_{i2}^{\text{tan}}(\bm{\gamma}) \geq 1, i=1,...d.
%  \underline{\underline{f_{S_{}}(Y_{x} = y)}} =  &\max \{ \textbf{Bounding}(\text{pa}(S_{})), \underline{f_{S_{}}^{linear}(Y_{x} =  y)}\} ;
\label{lemma_sub_linear}
    \end{aligned}
\end{equation}
}
As demonstrated in Lemma~\ref{tan_sec_bounded} of Appendix~\ref{app_main_result_1}, the functions $C_1^{\text{tan}}(\bm{\gamma})$, $C_2^{\text{sec}}(\bm{\gamma})$, $D_{i1}^{\text{tan}}(\bm{\gamma})$, $D_{i2}^{\text{sec}}(\bm{\gamma})$, $D_{i1}^{\text{sec}}(\bm{\gamma})$ and $D_{i2}^{\text{tan}}(\bm{\gamma})$ are constructed from $C_1(\bm{\gamma}), C_2(\bm{\gamma})$, $D_{i1}(\bm{\gamma}), D_{i2}(\bm{\gamma})$ in~\eqref{dc_decomposition} based on \textbf{sec}ants and \textbf{tan}gents within the simplex $S$: 
% \hspace{-2cm}\begin{equation} 
%     \begin{aligned}
%     \left[\begin{matrix}C_k^{\text{tan}}(\bm{\gamma})\\ D^{\text{tan}}_{ik}(\bm{\gamma}) \end{matrix} \right] :&= {\left[ \begin{matrix}
%   C_k(\bm{\gamma_0}) \\ D_{ik}(\bm{\gamma_0})
%   \end{matrix} \right] + \left[\begin{matrix}
%   \frac{\partial C_k (\bm{\gamma})}{\partial \bm{\gamma}}\mid_{\bm{\gamma} = \bm{\gamma_0}} \\ \frac{\partial D_{ik} (\bm{\gamma})}{\partial \bm{\gamma}}\mid_{\bm{\gamma} = \bm{\gamma_0}}
%   \end{matrix}\right](\bm{\gamma}-\bm{\gamma_0})},~k=1,2, \forall \bm{\gamma_0} \in S,\\
%   \left[\begin{matrix}
%   C_k^{\text{sec}}(\bm{\gamma}) \\ D^{\text{sec}}_{ik}(\bm{\gamma})
%   \end{matrix}\right] :&= {\left[\begin{matrix}
%   C_k(S^0),... C_k(S^{4d}) \\
%   D_{ik}(S^0), ... D_{ik}(S^{4d})
%   \end{matrix}\right] \left[\begin{matrix} S^0, ... , S^{4d} \\ 1,...,1 \end{matrix}\right]^{-1}\left[\begin{matrix}\bm{\gamma} \\ 1\end{matrix}\right]},~k=1,2. \\ \label{tan_sec}
%   \end{aligned}
%   \end{equation}
  \begin{equation} 
    \begin{aligned}
    \left[\begin{matrix}C_k^{\text{tan}}(\bm{\gamma})\\ D^{\text{tan}}_{ik}(\bm{\gamma}) \end{matrix} \right] :&= {\left[ \begin{matrix}
   C_k(\bm{\gamma_0}) \\ D_{ik}(\bm{\gamma_0})
   \end{matrix} \right] + \left[\begin{matrix}
   \frac{\partial C_k (\bm{\gamma})}{\partial \bm{\gamma}}\mid_{\bm{\gamma} = \bm{\gamma_0}} \\ \frac{\partial D_{ik} (\bm{\gamma})}{\partial \bm{\gamma}}\mid_{\bm{\gamma} = \bm{\gamma_0}}
   \end{matrix}\right](\bm{\gamma}-\bm{\gamma_0})}\\
   \left[\begin{matrix}
   C_k^{\text{sec}}(\bm{\gamma}) \\ D^{\text{sec}}_{ik}(\bm{\gamma})
   \end{matrix}\right] \!\!:&=\!\! {\left[\begin{matrix}
   C_k(S^0),... C_k(S^{4d}) \\
   D_{ik}(S^0), ... D_{ik}(S^{4d})
   \end{matrix}\right] \!\!\left[\begin{matrix} S^0, ... , S^{4d} \\ 1,...,1 \end{matrix}\right]^{-1}\!\!\left[\begin{matrix}\bm{\gamma} \\ 1\end{matrix}\right]}.\\ \label{tan_sec}
   \end{aligned}
   \end{equation}
Here $k=1,2$, and $\bm{\gamma_0}$ is an arbitrary point in $S$. As shown above, \eqref{lemma_sub_linear} is a linear programming problem that can be solved using various methods, including the simplex algorithm~\citep{klee1972good} and the interior-point algorithm~\citep{kojima1989primal, nesterov1994interior}.



% asymptotically. In the above process, although the analysis of $\underline{f_{S_{0}}(Y_x = y})$ has already been reduced into a set of sub optimization problems $\{\underline{f_{S_{ki}}(Y_x = y)}\}_{i=0}^{k}$, their objectives and constraints still all maintain irregular nonlinearity with extremely high computational complexity. In this sense, this function derives an asymptotic bounding strategy to approach each $\underline{f_{S_{ki}}(Y_x = y)}$ instead of directly searching these optimal values.

% \begin{algorithm}[h]
% %\begin{algorithmic}
% \caption{ \textbf{Bounding}.}
% \LinesNumbered
% \KwIn{Simplex $S$ during iteration.}
% \KwOut{$\underline{\underline{f_S(Y_x = y)}}$ serving as the lower bound of $\underline{f_{S}(Y_x = y)}$.}
% If $IR_{\Gamma} \cap S_{} = \varnothing $, then we let $\underline{\underline{f_{S_{}}(Y_x = y) }} = +\infty$. Otherwise we solve\footnotemark
% \begin{equation}
%     \begin{aligned}
%      \underline{\underline{f_{S_{}}(Y_{x} = y)}}= &\text{~min~} f(y,x) + C(\bm{\gamma}) \\
% &\text{~subject to:~} \bm{\phi} \in IR_{{\Phi}}, D^l_i(\bm{\gamma}) \leq 1, D^u_i(\bm{\gamma}) \geq 1, \bm{\gamma} \in S_{};
% %  \underline{\underline{f_{S_{}}(Y_{x} = y)}} =  &\max \{ \textbf{Bounding}(\text{pa}(S_{})), \underline{f_{S_{}}^{linear}(Y_{x} =  y)}\} ;
% \label{lemma_sub_linear}
%     \end{aligned}
% \end{equation}
% where $C(\cdot), D_i^l(), D^u_i()$ are identified in Eqn~\eqref{final_tan_sec}.

% Return $\underline{\underline{f_{S_{}}(Y_{x} = y)}}$.
% %\end{algorithmic}
% \end{algorithm}\footnotetext{Notice that this linear programming is derived from Eqn~\eqref{re-formulation_linear_weaker} with an additional constraint $\bm{\gamma} \in S$.}

% Specifically, the construction of the linear functions $C(\bm{\gamma}), D_{i}(\bm{\gamma})$ is via DC decomposition and linear bounding. Recalling $C_1(\bm{\gamma}), C_2(\bm{\gamma}), D_{i1}(\bm{\gamma}), D_{i2}(\bm{\gamma})$ in Eqn~\eqref{dc_decomposition}, we linearly bound them by \textbf{sec}ants and \textbf{tan}gents taking advantage of convexity: 
% \hspace{-2cm}\begin{equation} 
%     \begin{aligned}
%     \left[\begin{matrix}C_k^{\text{tan}}(\bm{\gamma})\\ D^{\text{tan}}_{ik}(\bm{\gamma}) \end{matrix} \right] :&= {\left[ \begin{matrix}
%   C_k(\bm{\gamma_0}) \\ D_{ik}(\bm{\gamma_0})
%   \end{matrix} \right] + \left[\begin{matrix}
%   \frac{\partial C_k (\bm{\gamma})}{\partial \bm{\gamma}}\mid_{\bm{\gamma} = \bm{\gamma_0}} \\ \frac{\partial D_{ik} (\bm{\gamma})}{\partial \bm{\gamma}}\mid_{\bm{\gamma} = \bm{\gamma_0}}
%   \end{matrix}\right](\bm{\gamma}-\bm{\gamma_0})},~k=1,2, \forall \bm{\gamma_0} \in S,\\
%   \left[\begin{matrix}
%   C_k^{\text{sec}}(\bm{\gamma}) \\ D^{\text{sec}}_{ik}(\bm{\gamma})
%   \end{matrix}\right] :&= {\left[\begin{matrix}
%   C_k(S^1),... C_k(S^{4d+1}) \\
%   D_{ik}(S^1), ... D_{ik}(S^{4d+1})
%   \end{matrix}\right] \left[\begin{matrix} S^1, ... , S^{4d+1} \\ 1,...,1 \end{matrix}\right]^{-1}[\begin{matrix}\bm{\gamma} \\ 1\end{matrix}]},~k=1,2, \\ \label{tan_sec}
%   \end{aligned}
%   \end{equation}
% Thus the objective function and constraints can be linearly bounded as
%   \begin{equation}
%       \begin{aligned}
%   C(\bm{\gamma}) :&= C_1^{\text{tan}}(\bm{\gamma}) - C_2^{\text{sec}}(\bm{\gamma}) \leq \sum_{i=1}^{d}\psi_{i}^o \theta_{i} \omega_{i}, \\
%   D^l_{i}(\bm{\gamma}) :&= D_{i1}^{\text{tan}}(\bm{\gamma}) - D_{i2}^{\text{sec}}(\bm{\gamma}) \leq \psi_{i}^o \psi_{i} ,\\
%   D^u_{i}(\bm{\gamma}) :&= D_{i1}^{\text{sec}}(\bm{\gamma}) - D_{i2}^{\text{tan}}(\bm{\gamma}) \geq \psi_{i}^o \psi_{i} .
%     \end{aligned}\label{final_tan_sec}
% \end{equation}
% By this substitution, Eqn~\eqref{lemma_sub_linear} is fully transformed into a linear programming problem, which can be solved by a wide variety of solutions, e.g. simplex algorithm~\citep{klee1972good}, interior algorithm~\citep{kojima1989primal, nesterov1994interior}, etc. 

\noindent ${\textbf{4) Global\_error()}}$: This function is to terminate PI-SFP via estimating the order of the error with respect to $n$. Recall that in \textbf{Step~5} of Algorithm.~1, we always select the $\tilde{S}_k$ with the lowest $\underline{\underline{f_S(Y_x = y)}}$ in the $k$-th iteration. This strategy guarantees (see Appendix~\ref{app_main_result_1} for more details)
\begin{equation}
    \begin{aligned}
        \underline{\underline{f_{\tilde{S}_k}(Y_x = y)}} \leq \min_{S \in \cS_k}\underline{f_S(Y_x = y)} = \underline{f(Y_x = y)},
    \end{aligned}\label{bound_each_iteration}
\end{equation}
i.e., all the $\underline{\underline{f_{\tilde{S}_k}(Y_x = y)}}$'s are lower bounds of $\underline{f(Y_x = y)}$, and thus $\underline{f^n_{opt}(Y_x = y)} \leq \underline{f(Y_x = y)}$. From this, we further have that, in the $n$-th iteration, for any $k \in \{0, \cdots, n\}$,
% \begin{equation}
%     \begin{aligned}
%       0 \leq \underline{f(Y_x = y)} - \underline{f^n_{opt}(Y_x = y)}
%       \leq \min_{S \in \cS_k} \underline{f_S(Y_x = y)} - \underline{\underline{f_{\tilde{S}_k}(Y_x = y)}} \leq 
%         {\underline{f^{{}}_{\tilde{S}_k}(Y_x = y)}} - \underline{\underline{f_{\tilde{S}_k}(Y_x = y)}}. 
%     \end{aligned}\label{shrinking_diameter}
% \end{equation}
{
\begin{equation}
    \begin{aligned}
       0 &\leq \underline{f(Y_x = y)} - \underline{f^n_{opt}(Y_x = y)} \\
       &\leq \min_{S \in \cS_k} \underline{f_S(Y_x = y)} - \underline{\underline{f_{\tilde{S}_k}(Y_x = y)}} \\&\leq 
        {\underline{f^{{}}_{\tilde{S}_k}(Y_x = y)}} - \underline{\underline{f_{\tilde{S}_k}(Y_x = y)}}. 
    \end{aligned}\label{shrinking_diameter}
\end{equation}
}
Also see Appendix~\ref{app_main_result_1} for details. This allows us to calculate an error bound via targeting  
\begin{equation}
\min_{0 \leq k \leq n}\left\{\underline{f^{{}}_{\tilde{S}_k}(Y_x = y)} - \underline{\underline{f_{\tilde{S}_k}(Y_x = y)}}\right\}.\label{eq:tildesbnd}
\end{equation}
Since the bound of $\underline{f^{{}}_{\tilde{S}_k}(Y_x = y)} - \underline{\underline{f_{\tilde{S}_k}(Y_x = y)}}$ is dominated by the diameter of the simplex $\tilde{S}_k$, i.e., $dia(\tilde{S}_k)$, we aim to get an order of \eqref{eq:tildesbnd} based on the order of the smallest $dia(\tilde{S}_k)$ with respect to $n$. As shown in Eqn~(\ref{second_bounded_result}) in Appendix~\ref{app_main_result_1}, this order is controlled by the length $L_n$ of the longest nested subsequence of $\{\tilde{S}_k\}_{k=0}^{n}$, which is summarized as Algorithm~3.
%~\quad \\
%In conclusion, these four functions guarantee the convergence of PI-SFP to $\underline{f(Y_x = y)}$ after sufficient iterations.

\begin{algorithm}[t]
%\begin{algorithmic}
\caption{Procedure to estimate the current convergence (\textbf{Global\textbf{\_error}}).}
\LinesNumbered
\KwIn{Collections of simplex partitions in each iteration up to the $n$-th iteration: $\tilde{S}_0, \cdots, \tilde{S}_n$.}
\KwOut{An estimate of the global error.}

Let $\{\tilde{S}_{i_k}\}_{k=1}^{L_n}$ be the (longest) subsequence of $\{\tilde{S}_k\}_{k=0}^n$ such that each $\tilde{S}_{i_{j + 1}}$ is partitioned from $\tilde{S}_{i_j}$ for $j=1, 2, \cdots, L_n-1$, where $L_n$ is the length of this subsequence;

Return the global error estimate $(\frac{\sqrt{3}}{2})^{\lfloor \frac{L_n}{4d}\rfloor}$.
\end{algorithm}



\begin{table*}[t]
	%\centering
	%\resizebox{\textwidth}{15mm}
	\caption{Simulation results (the upper bound is symmetric). The middle column $\Phi$ denotes the optimal solution within iteration $10^6$. The ground truth is approximated via $10^6$ Monte-Carlo samples. Our PI-SFP result decreases monotonically with increasing $\varepsilon$, since the feasible region of latent variables $\Phi$ is gradually enlarged with $\varepsilon$. A detailed visualization of the convergence rate is shown in Fig.~\ref{PI-SFP_0.5} (Appendix~\ref{fig}).}
	%\resizebox{\textwidth}{40mm}
	\begin{tabular}{c| ccc ccc  | c  cc}
		\hline
		\multirow{2}{*}{$\varepsilon$}    & \multicolumn{6}{c|}{$\Phi$} & \multirow{2}{*}{PI-SFP result}   & \multirow{2}{*}{$\underline {f(Y_x = y)} $ (Ground Truth)}  & \multirow{2}{*}{Error}   \\
		\cline{2-7} 
		& \makecell[c]{$\theta_1$ }        & \makecell[c]{$\theta_2$ }   &\makecell[c]{$\psi_1$ }  & \makecell[c]{$\psi_2$ }        & \makecell[c]{$\omega_1$ }   &\makecell[c]{$\omega_2$ }\\
		\hline
		\hline
		0.1    & 0.067 &0.133 & 0.261 & 0.239 &0.333 & 0.167 & 0.370 & 0.372 & 0.548\% \\
		0.2    & 0.050 &0.150 & 0.262 & 0.238 &0.375 & 0.125 & 0.350 & 0.351 & 0.285\%\\
		0.3    & 0.029 &0.171 & 0.264 & 0.236 &0.429 & 0.072 & 0.298 & 0.301 & 0.997\%\\
		{$\geq$ 0.4}    & 0.001 & 0.199 & 0.310 & 0.190 &0.500 & 0.000& 0.200 & 0.205 & 2.439\% \\
		\hline
	\end{tabular}
	\label{simulation_result}
\end{table*}
\section{Theoretical analysis}

This section delves into the theoretical properties of PI-SFP. First, we examine the general convergence rate of PI-SFP concerning $L_n$ (see Theorem.~\ref{convergence_theorem}). Then, we demonstrate that PI-SFP can be extended from computing $\underline{f(Y_x = y)}$ to the general ACE case.

% For preparation, we need to ensure that $IR_{\Gamma}$ is bounded to make $dia(S_0)<+\infty$, which allows for partitioning into smaller units for better estimation. To achieve this goal, we introduce the following positivity assumption:

\begin{assumption}{{(Positivity)}}\label{positive definite assumption}
$\mathscr{P}$ is a set of $P(\bm{W} \mid \bm{U})$ guaranteeing each compatible solution $P(\bm{U}, \bm{X}=x)$ to be positive definite. Namely, $\exists \delta > 0$, such that $\forall \bm{\phi}=(\bm{\theta}, \bm{\psi}, \bm{\omega}) \in IR_{\bm{\phi}}$, we have $\bm{\psi} \geq \delta * \bm{1}_{1*d}>\bm{0}_{1*d}$.
\end{assumption}


This is a fairly broad and reasonable assumption in practice; it merely ensures that the denominator in the original formulation is not too small, which facilitates the calculation. Under this assumption, we have $\psi_i^o < \frac{1}{\delta}$ in \eqref{re-formulation} and $\psi^l >\delta$ in \eqref{initialization_S00}. Hence we have $\sup_{\gamma \in IR_{\Gamma}}\|\bm{\gamma}\|_{+\infty} < +\infty$ and $dia(S_0)<+\infty$ respectively. In addition, when Assumption~\ref{positive definite assumption} is violated, we propose an alternative PI-SFP and refer readers to Appendix~\ref{discussion_ass} for more information. On this basis, we formally collate the previous analysis as follows:

% \begin{theorem}
%     Under Assumption.~\ref{ass_partial_bounded}--\ref{positive definite assumption}, PI-SFP concentrates around the target value $\underline{f(Y_x = y)}$ at the $O((\frac{3}{4})^{\lfloor\frac{L_n}{4d}}\rfloor)$ rate. Specifically, 
%     \begin{equation}
%         \begin{aligned}
%             &\mid \underline{f^{n}_{opt}(Y_x = y)} - \underline{f(Y_x = y)} \mid  \leq  A   ( \frac{3}{4})^{\lfloor \frac{L_n}{4d} \rfloor}  dia(S_0)^2,\\
%              \end{aligned}
%     \end{equation}
%   $A = \max\limits_{\bm{\gamma} \in S_{0}} \frac{2(\sqrt{2}+1)\sqrt{d}}{\delta} \|\frac{\partial{(C_1(\bm{\gamma}) - C_2(\bm{\gamma}))}}{\partial{\bm{\gamma}}}\|  + \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial^2 C_1(\bm{\gamma)}}{\partial \bm{\gamma}^2}\|_F +  \frac{1}{2} \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial^2 C_2(\bm{\gamma)}}{\partial \bm{\gamma}^2} \|_F < +\infty$. Here $\|\cdot \|$ denotes the Euclidean norm, and $\|\cdot \|_F$ denotes the Frobenius norm. $L_n\in [\lfloor log(n)\rfloor+1, n]$ is the length of the longest nested sequence till $n-$th iteration. Moreover, $\lim\limits_{n\rightarrow +\infty} \underline{f_{opt}^{n}(Y_x = y)} = \underline{f(Y_x = y)}$.
% \label{convergence_theorem}
% \end{theorem}

{
\begin{theorem}
    Under Assumption.~\ref{ass_partial_bounded}--\ref{positive definite assumption}, PI-SFP concentrates around the target value $\underline{f(Y_x = y)}$ at the $O((\frac{3}{4})^{\lfloor\frac{L_n}{4d}\rfloor})$ rate. Specifically, 
$\mid \underline{f^{n}_{opt}(Y_x = y)} - \underline{f(Y_x = y)} \mid  \leq  A   ( \frac{3}{4})^{\lfloor \frac{L_n}{4d} \rfloor}  dia(S_0)^2,$ where
$A = A_1 +A_2 +A_3 < +\infty$, $A_1 = \max\limits_{\bm{\gamma} \in S_{0}} \frac{2(\sqrt{2}+1)\sqrt{d}}{\delta} \|\frac{\partial{(C_1(\bm{\gamma}) - C_2(\bm{\gamma}))}}{\partial{\bm{\gamma}}}\|, A_2 = \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial^2 C_1(\bm{\gamma})}{\partial \bm{\gamma}^2}\|_F$, $A_3 =  \frac{1}{2} \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial^2 C_2(\bm{\gamma})}{\partial \bm{\gamma}^2} \|_F$. Here $\|\cdot \|$ denotes the Euclidean norm, and $\|\cdot \|_F$ denotes the Frobenius norm. $L_n\in [\lfloor \log(n)\rfloor+1, n]$ is the length of the longest nested sequence up to the $n$-th iteration. Moreover, $\lim\limits_{n\rightarrow +\infty} \underline{f_{opt}^{n}(Y_x = y)} = \underline{f(Y_x = y)}$.
\label{convergence_theorem}
\end{theorem}
}
Theorem.~\ref{convergence_theorem} states that PI-SFP converges to $\underline{f(Y_x = y)}$ with the growing length of the longest nested sequence, and will approach it in the infinite case. We relegate the proof to Appendix~\ref{app_main_result_1} and reserve a brief summary. First, $\underline{f(Y_x = y)}$ is equal to $\underline{f_{S_{0}}(Y_x = y)}$ via
constructing an original enclosure $S_0$ in \eqref{initialization_S00}. Second, $\underline{f_{S_{0}}(Y_x = y)}$ is substituted with $\min_{S\in \cS_k}\underline{f_{S}(Y_x = y)}$ in the $k$-th iteration by bisection. Third, each $\underline{f_{S}(Y_x = y)}$ is lower bounded by \eqref{lemma_sub_linear}, namely we have $\forall S \in \cS_{k}, {\underline{f_S(Y_x = y)}} \geq \underline{\underline{f_S(Y_x = y)}}$. Finally,
$\tilde{S}_k$ with the lowest bound $\min\limits_{S\in \cS_k}\underline{\underline{f_S(Y_x = y)}}$ is gathered as $\{\tilde{S}_k\}_{k=0}^n$ in order to formulate $\underline{f^n_{opt}(Y_x = y)}$ (see \textbf{Step~10} in Algorithm~1). The asymptotic error can be bounded by \eqref{shrinking_diameter}-\eqref{eq:tildesbnd}. In conclusion, these four steps correspond to the four functions in the above section in order.


Notably, estimating $L_n$ with respect to $n$, whether empirically or theoretically, is well beyond the scope of this paper. We refer readers to Appendix~\ref{comment} for detailed comments. There, we also formulate a conjecture on the finiteness of regular simplicial partitions, which is an additional contribution. Moreover, our method and theorem can be naturally extended to the ACE case, which is detailed in Appendix~\ref{corollary_ACE_appendix} due to space limitations.

\section{Simulations and Real-world experiments}
% In this section, we conduct experiments to illustrate the effectiveness of PI-SFP. We aim to answer two questions: 1) Can PI-SFP handle the partial observable $P(\bm{W} \mid \bm{U})$ which has not been addressed in the previous literature and produce informative bounds? 2) What about PI-SFP's convergence rate in practice? For space limitation, we defer part of visualizations to Appendix~\ref{fig}.
In this section, we perform experiments to demonstrate the efficacy of PI-SFP, aiming to address two key questions: 1) Can PI-SFP effectively manage the partially observable $P(\bm{W} \mid \bm{U})$, a scenario not previously explored in the literature, and generate informative bounds? 2) How does PI-SFP’s convergence rate manifest in practical applications? Due to space constraints, some visualizations are deferred to Appendix~\ref{fig}.

\subsection{Simulations}
\noindent \textbf{Experiment settings} We refer to the case presented in Section.~3, as shown in Equation.~\eqref{tight_example}, and generalize our findings, with a specific focus on Fig.~\ref{Fig.sub.1}. We address an intriguing and universal situation referred to as ``information leakage,'' where the information of $\bm{U}$ is regularly retained by $\bm{W}$ but incurs loss during transmission. Formally, we claim $P(W=w_i \mid U=u_i)\geq 1-\varepsilon, \varepsilon \in (0,1)$. To make the experiment simple and representative, we consider the binary cases of $\bm{W},\bm{U},\bm{X}$. On this basis, the construction is $\overline{P(\bm{W}\mid \bm{U})}  := (1-\varepsilon)\bm{I_{2*2}} + \varepsilon \bm{J_{2*2}}$ and $\underline{P(\bm{W}\mid \bm{U})} := (1-\varepsilon) \bm{I_{2*2}} , \varepsilon \in (0,1)$.
% \begin{equation}
%     \begin{aligned}
%          \left[\begin{matrix}
% \overline{P(\bm{W}\mid \bm{U})} \\ \underline{P(\bm{W}\mid \bm{U})}
% \end{matrix}\right] = \left[ \begin{matrix}
% (1-\varepsilon)\bm{I_{2*2}} + \varepsilon \bm{J_{2*2}} \\ (1-\varepsilon) \bm{I_{2*2}} 
% \end{matrix}\right], \varepsilon \in (0,1).
%     \end{aligned} \label{initial_observe}
% \end{equation}
The construction of $f(\bm{Y},\bm{W},\bm{X})$ still follows \eqref{tight_example}\footnote{In order to avoid the ill-conditioned case for Python $3.8.5$, we make a rather broad restriction that elements of $P(\bm{U}, x)$ are at least $10^{-2}$ in all cases (Assumption.~\ref{positive definite assumption}).}. Moreover, we set the iteration number as $1000$.


\noindent \textbf{Experiment result} The simulation results, shown in Table.~\ref{simulation_result} and Fig.~\ref{PI-SFP_0.5}, indicate that PI-SFP successfully finds optimal solutions and values, with a fast convergence rate within 1000 iterations. At the beginning step, estimation errors increase as $\varepsilon$ increases, but they remain under control by the theoretical error guaranteed by Theorem.~\ref{convergence_theorem}. During iterations, the estimates converge quickly to the true $\underline{f(Y_x = y)}$. The ground truth decreases as the feasible region of $\Phi$ increases, i.e., when $\varepsilon$ increases. Moreover, when $\varepsilon \geq 0.4$, we observe that $\underline{f(Y_x = y)}$ achieves its minimum value~\citep{reason:Pearl09a} of $f(y,x) = 0.200$.

\subsection{Real-world applications}




In the simulation experiments, we have demonstrated that PI-SFP can quickly converge to the valid bound; furthermore, in the real experiments in this section, we show that the valid bound generated by PI-SFP can more effectively substantiate the causal relationships in the real world compared with previous methods. Specifically, we re-analyze the Zika Virus outbreak dataset~\citep{taddeo2022causal,tchetgen2024universal} in the most-related literature~\citep{tchetgen2023single}. Our PI-SFP result exhibits a more significant adverse effect from Zika Virus to the birth rate, which is more aligned with the well-known scientific hypothesis~\citep{castro2018implications} compared with the previous literature~\citep{tchetgen2023single}. We defer the experimental details to Appendix~\ref{fig} due to space limitation.



























\section{Justification of assumptions and further discussions}\label{section_dis_ext}
In this section, for the core partial observability assumption (Assumption.~\ref{ass_partial_bounded}), we analyze its necessity, generalizability, and verifiability. First, the necessity is supported by the following lemma. It leads to an interesting and counter-intuitive \textit{negative result}: informative proxies (namely, the transition matrix $P(\bm{W}\mid \bm{U})$ is reversible) \textit{do not guarantee} informative bounds (as opposed to the vanilla bound).


% the partial observability assumption is more beneficial in achieving a better bound of $f(Y_x = y)$ compared to reversibility, despite the latter being an important hypothesis for calculations in previous studies. Secondly, we establish that our Assumption.\ref{ass_partial_bounded} is weaker and more general, encompassing cases where previous assumptions are ineffective. Finally, we provide evidence that Assumption.~\ref{ass_partial_bounded} is verifiable.

% \noindent \textbf{Necessity} In this part, we investigate the necessity of the partial observability assumption. We consider the case when there is no knowledge on $P(\bm{W} \mid \bm{U})$, i.e., without Assumption.~\ref{ass_partial_bounded}. Notice that a precise portrayal of the dynamic relationship between partial observability and tight bound is not realistic as illustrated in the preliminaries. In this sense, the following lemma can at least serve as an initial exploration in single proxy control, to illustrate the necessity of Assumption.~\ref{ass_partial_bounded}. Specifically, we show the tight lower bound can degenerate to be trivial $f(y, x)$ without Assumption.~\ref{ass_partial_bounded}. 


\begin{lemma}
Assume that $[\underline{P(\bm{W} \mid \bm{U})}, \overline{P(\bm{W} \mid \bm{U})}] = [\bm{0}_{d_w * d_u}, \bm{1}_{d_w * d_u} ]$, and $f(\bm{U},x) > \bm{0}$. We consider the whole set of $f(y,\bm{W},\bm{U},\bm{X})$ which is within $\mathcal{\widetilde{F}}$ and is additionally compatible with two observed distributions $f(\bm{W}, \neg x)> \bm{0}_{d_w * 1}$, $f(y, \bm{W}, x)>\bm{0}_{d_w * 1}$ by an unknown $P(\bm{W}\mid \bm{U})$. Then (i) The tight lower bound of ${f(Y_x = y)}$ is vanilla, namely $f(y, x)$.
(ii) If $P(\bm{W} \mid \bm{U})$ is restricted to be left-reversible and $ f(\bm{W} \mid \neg x) \neq  f(\bm{W} \mid x,y)$, then the tight lower bound of ${f(Y_x = y)}$ is still the vanilla $f(y, x)$. (iii) If $P(\bm{W} \mid \bm{U})$ is restricted to be left-reversible and $ f(\bm{W} \mid \neg x) =  f(\bm{W} \mid x,y)$, then ${f(Y_x = y)}$ is lower bounded by another vanilla bound $f(y\mid x)$.
\label{lemma_no_assumption}
\end{lemma}

The proof is in Appendix~\ref{pro_lemma_no_assumption}. This lemma extends the well-known inequality $f(Y_x = y) \geq f(y,x)$ \citep{reason:Pearl09a} to single proxy control. 
% $f(y, \bm{W}, x)$, $f(\bm{W}, \neg x)$ are to control $f(y, \bm{U}, x)$, $f(\bm{U}, \neg x)$ respectively. During this process we aim to construct $f(y, \bm{U}, x)\circ f(\bm{U}, \neg x) = \bm{0}_{d*1}$, here $\circ$ denotes the Hadamard product.. 
Lemma~\ref{lemma_no_assumption} indicates that the partial observability assumption (Assumption~\ref{ass_partial_bounded})~\citep{kuroki2014measurement, greenland2005multiple}, rather than the reversibility assumption in the previous literature~\citep[e.g.,][]{miao2018confounding}, is more necessary for partial identification in most cases.

Furthermore, for the generalizability, verifiability, and practical correspondence of Assumption~\ref{ass_partial_bounded}, we refer readers to arguments in Appendix~\ref{pro_lemma_no_assumption}. Moreover, we provide algorithm comparison and algorithm acceleration in Appendix~\ref{pre_train}, discuss graph structure extension in Appendix~\ref{diss_graph}, and then extend to the continuous case of confounders in Appendix~\ref{proof_continuous}. 









% \subsection{Discussions on extensions to the continuous confoundings}
% In this section, we extend our analysis to the continuous case of $\bm{U}$. The assumption presented in Assumption.~\ref{ass_partial_bounded} is seamlessly extended to incorporate a priori upper/lower bounds on ${P(\bm{W}\mid u \in [u_{i-1}, u_{i}]), i=1,2,...d }$. To tackle this, we adopt a branch-and-bound strategy, similar to the one presented in the main text, but now based on discretization. It is important to note that the sampling length of our method directly impacts the approximation error, which decreases with longer samples. We refer the readers to Appendix~\ref{proof_continuous} for more illustrations and proofs.







\section{Conclusions}



In this paper, we highlight the limitations and strict assumptions of the transfer matrix $P(\bm{W} \mid \bm{U})$ through practical examples, emphasizing that exact observability and reversibility are often not feasible in real-world scenarios. Based on this, we propose a novel PI-SFP framework that achieves a valid bound for the causal effect, even with only partial observability of $P(\bm{W}\mid \bm{U})$. To achieve it, we employ deformation techniques in DC programming and implement a branch-and-bound method. We offer a theoretical analysis of the mathematical reasons behind the lack of tight bounds and provide sufficient and necessary conditions to determine if the bounds are tight. We also conduct a convergence rate analysis of PI-SFP. Furthermore, we provide specific convergence rate analysis for these methods. We also provide a fundamental negative result that informative proxies might not yield informative partial identification bounds.


Our paper has initiated new research trajectories, specifically focusing on the proximal partial identification with broader confounding proxy information. An additional avenue for exploration could involve evaluating the performance of PI-SFP under more intricate partial observability assumptions. Furthermore, a promising direction for in-depth investigation is to extend our single-proxy control scheme to encompass double-proxy control and other causal graphs. These avenues remain for further exploration in our future research.

\section*{Acknowledgement}
The research was partially completed while Zhiheng Zhang
was a student intern at Shanghai Qi Zhi Institute. He is grateful to Professor Yuhao Wang for suggestions.


\normalem


% \clearpage

% References
\bibliography{uai2024-template}















\clearpage


\onecolumn

% \title{Supplement of Partial Identification with Proxy of Latent Confoundings via Sum-of-ratios Fractional Programming\\(Supplementary Material)}
% \maketitle



\appendix

\setcounter{equation}{0}
\setcounter{section}{0}
\setcounter{subsection}{0}
\renewcommand{\theequation}{A.\arabic{equation}}
\renewcommand{\thesubsection}{A.\arabic{subsection}}


%\setcounter{equation}{0}


In the appendices, we provide supplementary material and proofs for our main text. Appendices~\ref{proof_basic_IR}-\ref{equal_reform_algorithm1} contain proofs for propositions. In Appendix~\ref{proof_basic_IR}, we establish that $\mathcal{F} \subseteq \mathcal{\widetilde{F}}$. In Appendix~\ref{equal_reform_algorithm1}, we demonstrate that the bound is tight under certain conditions.

Appendix~\ref{discussion_ass} is dedicated to discussing the assumption. We explore cases where Assumption~\ref{positive definite assumption} does not hold.

Appendix~\ref{app_main_result_1} contains the main results. Firstly, we show that the original simplex $S_{0}$ encloses our identification region. Secondly, we prove that the original optimization problem can be transformed into a set of sub-problems in the reduced space. Thirdly, we demonstrate our construction to transfer the original nonlinear optimization problem to the weaker linear case. Finally, we prove that our algorithm converges to the global optimal solution at an exponential rate.

Appendix~\ref{app_corollary} extends our result from $f(Y_x=y)$ to the more general ACE.

Appendix~\ref{app_discussion} is dedicated to extensions. We discuss 1) the previous assumptions in the original literature, 2) auxiliary acceleration strategies, 3) extension of graph structure, and 4) extension to the continuous confounding.




\subsection{The proof of Proposition~\ref{basic_IR}}\label{proof_basic_IR}
According to Assumption.~\ref{ass_partial_bounded}, by integration, we can also directly claim that if $f(y,\bm{U}, \bm{W}, \bm{X}) \in \mathcal{F}$, then 
\begin{equation}
    \begin{aligned}
     &\underline{P^{}(\bm{W} \mid \bm{U})} \bm{\theta} \leq P(y,\bm{W},x) \leq  \overline{P^{}(\bm{W} \mid \bm{U})} \bm{\theta}, \forall x \in X.\\
     &\underline{P^{}(\bm{W} \mid \bm{U})} \bm{\psi} \leq P(\bm{W},x) \leq  \overline{P^{}(\bm{W} \mid \bm{U})} \bm{\psi}, \forall x \in X.\\
     &\underline{P^{}(\bm{W} \mid \bm{U})} \bm{\omega} \leq f(\bm{W},\neg x) \leq  \overline{P^{}(\bm{W} \mid \bm{U})}\bm{\omega}, \forall x \in X.
    \end{aligned}
\end{equation}
Thus
\begin{equation}
    \begin{aligned}
        \left[\begin{matrix}
   -\bm{I_{d*d}} \\ \bm{I_{d*d}}
   \end{matrix}\right] \left[\begin{matrix}
   &f(y,\bm{W},x)^{T} \\ &f(\bm{W},x)^{T} \\ &f(\bm{W},\neg x)^{T}
   \end{matrix} \right]^{T}  -  \left[ \begin{matrix}
   &-\overline{P(\bm{W}\mid \bm{U})} \\  &\underline{P(\bm{W} \mid \bm{U})}
   \end{matrix} \right] \bm{\phi} \geq \bm{0}.
    \end{aligned}
\end{equation}
Combined with the natural constraints that $\theta_i \in [0, P(y,x)]$, $\psi_i \in [0,P(x)], \omega_i \in [0, P(\neg x)], i=1,2,...d$, we have $f(y,\bm{U}, \bm{W}, \bm{X}) \in \mathcal{\widetilde{F}}$. In conclusion, we claim $\mathcal{F} \subseteq \mathcal{\widetilde{F}}$.


\begin{table*}[]
	%\centering
	%\resizebox{\textwidth}{15mm}
	\caption{Tools and assumptions of previous literature on partial identification. \citep{kuroki2014measurement}$(1)$ is without $\bm{Z}$, while $(2)$ is with $\bm{Z}$.}
	{
	\begin{tabular}{c | ccc | ccc}
		\hline
		\multirow{2}{*}{Literature}    & \multicolumn{3}{c|}{Tools} & \multicolumn{3}{c}{Assumptions }        \\ 
		\cline{2-7} 
		& \makecell[c]{Valid \\Instrument}        & \makecell[c]{Negative\\ exposure}   &\makecell[c]{Negative\\outcome}     & \makecell[c]{Reversibility\\ Completeness} &\makecell[c]{Bridge\\ function}  & \makecell[c]{Observability \\ of $P(\bm{W}\mid \bm{U})$}\\
		\hline
		\hline
		\makecell[l]{{\citep{balke1994counterfactual}}\\ 	\citep{kitagawa2009identification}}  & \Checkmark          & \XSolid     &\XSolid        & \XSolid            &\XSolid         &\XSolid     \\
		\hline
		\makecell[l]{\citep{kuroki2014measurement}(1) \\ \citep{rothman2008modern} \\ \citep{leecausal}} & \XSolid          & \XSolid     &\Checkmark        & \Checkmark           &\XSolid         &\makecell[c]{\Checkmark \footnote{$P(\bm{W}\mid \bm{U})$ is assumed to be reversible and explicitly, totally observed.}}     \\
		\hline
		\makecell[l]{\citep{kuroki2014measurement}(2) \\ \citep{nagasawa2018identification}}& \XSolid          & \Checkmark     &\Checkmark        & \Checkmark            &\XSolid         &\XSolid     \\
		\hline
		\makecell[l]{\citep{miao2018identifying} \\\citep{shi2020multiply}\\ \citep{singh2020kernel} \\ \citep{cui2020semiparametric} \\\citep{tchetgen2020introduction} \\\citep{deaner2018proxy}} & \XSolid          & \Checkmark     &\Checkmark        & \Checkmark            &\Checkmark     &\XSolid     \\
		\hline
		\makecell[l]{\citep{kallus2021causal}} & \XSolid          & \Checkmark     &\Checkmark        & \XSolid          &\Checkmark     &\XSolid     \\ 
		\hline
		\makecell[l]{\textbf{Our paper}} & \XSolidBold          & \XSolidBold     &\CheckmarkBold        & \XSolidBold          &\XSolidBold     &\makecell[c]{\CheckmarkBold \footnote{In our paper, $P(\bm{W}\mid \bm{U})$ only needs be partially bounded.}\\   } \\
		\hline
	\end{tabular}  } 
	\label{table_literature}
\end{table*}


\subsection{The proof of Proposition~\ref{proposition_tight}} \label{equal_reform_algorithm1}






As the optimal solution $\bm{\phi_{opt}}$ satisfies the constraint \eqref{constraint_prove_tight} in Proposition~\ref{proposition_tight}, we can equivalently claim that $\bm{\phi_{opt}}$ is compatible with some $f(y,\bm{W}, \bm{U}, \bm{X})$ which satisfies $f(y,\bm{W}, \bm{U}, \bm{X}) \in \mathcal{F}$. On this basis, the original formulation can be transformed with stricter constraints but equal minimum optimal value, namely that from 


\begin{equation}
    \begin{aligned}
      \text{min~} f(y,x) +  \sum_{i=1}^{d}\frac{1}{\psi_{i}}\theta_{i}\omega_{i},~\text{subject to:~} f(y,\bm{U},\bm{W}, \bm{X}) \in \mathcal{\widetilde{F}}
    \end{aligned} 
\end{equation}

to 

\begin{equation}
    \begin{aligned}
      \text{min~} f(y,x) +  \sum_{i=1}^{d}\frac{1}{\psi_{i}}\theta_{i}\omega_{i},~\text{subject to:~} f(y,\bm{U},\bm{W}, \bm{X}) \in \mathcal{\widetilde{F}} \cap \mathcal{{F}} = \mathcal{{F}}.
    \end{aligned} 
\end{equation}

This is equal to the original \eqref{eqn_basic_bound}. Hence $\underline{f(Y_x = y)}$ is the tight lower bound of $f(Y_x = y)$ under constraint~\eqref{constraint_prove_tight}. 

By contrast, if the constraint \eqref{constraint_prove_tight} does not hold, then any $f(y,\bm{U},\bm{W}, \bm{X})$ compatible with $\bm{\phi_{opt}}$ will be within $\mathcal{F}^c \cap \widetilde{\mathcal{F}}$. In other words, the minimum value of the original formulation will be lower than that of Eqn~\eqref{eqn_basic_bound}, and the bound $\underline{f(Y_x = y)}$ is not tight. This completes the proof. 

We further provide an instance that our bound is tight:
\begin{equation}
    \begin{aligned}
        \left[ \begin{matrix}f(Y = y, \bm{W}, x)^T \\ f(Y = y, \bm{W}, \neg x)^T  \\
        f(Y \neq y, \bm{W}, x)^T \\ f(Y \neq y, \bm{W}, \neg x)^T
        \end{matrix}\right]
        = \left[\begin{matrix}
0.08 & 0.12\\ 0.15 & 0.1 \\ 0.18 & 0.12\\ 0.15 & 0.1 \\
\end{matrix}\right], \left[\begin{matrix}
\overline{P(\bm{W}\mid \bm{U})} \\ \underline{P(\bm{W}\mid \bm{U})}
\end{matrix}\right] = \left[ \begin{matrix}
0.6\bm{I_{2*2}} + 0.4\bm{J_{2*2}} \\ 0.6\bm{I_{2*2}} 
\end{matrix}\right].
    \end{aligned} \label{tight_example}
\end{equation}
Here $\bm{W}, \bm{U}, \bm{X}$ are all binary, and $\bm{I_{n*n}}, \bm{J_{n*n}}$ denote the $n$-dimensional identity matrix and all-ones matrix respectively. We can verify that one of the optimal solutions is $\bm{\phi_{opt}} = [0~ 0.2~ 0.3~ 0.2~ 0.5~ 0]^T$. The corresponding $f(y,\bm{U},\bm{W},\bm{X})$ satisfying Eqn~\eqref{constraint_prove_tight} exists, whose explicit form is detailed as follows:
\begin{equation}
    \begin{aligned}
        \left[ \begin{matrix}
        f(Y=y, W=w_1, \bm{U}, \bm{X}) & f(Y\neq y, W=w_1, \bm{U}, \bm{X}) \\ f(Y=y, W=w_2, \bm{U}, \bm{X}) & f(Y\neq y, W=w_2, \bm{U}, \bm{X})
        \end{matrix}
        \right] = \left[\begin{matrix} 0 & 0.15 & 0.18 & 0.15 \\ 0.08 & 0 & 0 & 0 \\ 0 & 0.1 & 0.12 & 0.1 \\ 0.12 & 0 & 0 & 0 \end{matrix} \right]
    \end{aligned}
\end{equation}










\subsection{Further discussion on Assumption.~\ref{positive definite assumption}}\label{discussion_ass}


In this section, we consider the case when Assumption.~\ref{positive definite assumption} does not hold. We propose a new version of PI-SFP. Recall that our objective function is:
\begin{equation}
    \begin{aligned}
   \underline{f(Y_x = y)} =   \text{min~} f(y,x) +  \sum_{i=1}^{d}\frac{1}{\psi_{i}}\theta_{i}\omega_{i},~\text{subject to:~} f(y,\bm{U},\bm{W}, \bm{X}) \in \mathcal{\widetilde{F}}, \text{~i.e.,~} \bm{\phi} \in IR_{\bm{\Phi}}.
    \end{aligned}
\end{equation}
In our main text, we let $\psi_i^o = \frac{1}{\psi_i}$. However, when we cannot guarantee that $\exists \delta$, $\forall i \in \{1,2,...d\}, \psi_i \geq \delta$ (i.e., without Assumption.~\ref{positive definite assumption}), $\psi_i^o$ may tend to infinity. On this basis, we introduce another algebraic distortion $\psi_i^o = \frac{\theta_i \omega_i }{\psi_i}$. Then the above programming can be transformed to:
\begin{equation}
    \begin{aligned}
    \underline{f(Y_x = y)} = ~&  \text{min~} f(y,x) +  \sum_{i=1}^{d} \psi_i^o \\
    &\text{subject to:~} \bm{\gamma} \in IR_{{\Gamma}}, \text{where~} IR_{\Gamma} = \{\bm{\gamma}: \bm{\phi} \in IR_{{\Phi}},
    \psi_{i}^{o} \psi_{i}  = \theta_i \omega_i, \psi_i^o \leq C, i=1,...d.\}.\\
    \end{aligned}\label{new_PI_SFP}
\end{equation}
where $C$ is a locally optimal value of $\underline{f(Y_x = y)}$ computed a priori. On this basis, we can adopt the analogous strategy as in the traditional PI-SFP. Here the original $S_0$ is easy to construct since $\| \bm{\gamma} \|_{+\infty} < +\infty$. 

Programming \eqref{new_PI_SFP} can also be adopted under Assumption.~\ref{positive definite assumption}. Compared with the traditional PI-SFP, firstly, programming \eqref{new_PI_SFP} needs an a priori computed $C$ to upper bound $\psi_i^o$. Secondly, we need to linearize $\psi_{i}^{o} \psi_{i}  = \theta_i \omega_i$ instead of $\psi_{i}^{o} \psi_{i}  = 1$, which is more complex. There is no guarantee as to which version is better, and we leave this comparison to future work.


\iffalse 
On this basis, we split the identification region of $\bm{\phi}$ into $2^d$ partitions:
\begin{equation}
    \begin{aligned}
    \begin{cases} \psi_{i}  \geq \delta, i\in \{j_1,j_2,...j_q\} \subseteq \{1,2,...d\} \\
    \psi_{i} < \delta, i \notin \{j_1,j_2,...j_q\}
    \end{cases}
    \end{aligned}
\end{equation}
Hence in each partition, our objective function is:
\begin{equation}
\begin{aligned}
&\text{min~} f(x,y) + \sum_{i\in \{j_1,j_2,...j_d\}}^{}\frac{\theta_i \omega_i}{\psi_i} + \sum_{i\notin \{j_1,j_2,...j_d\}}^{}\frac{\theta_i \omega_i}{\psi_i}\\
&\text{subject to~} \bm{\phi} \in IR_{\bm{\Phi}}. \begin{cases} \psi_{i}  \geq \delta, i\in \{j_1,j_2,...j_q\} \subseteq \{1,2,...d\} \\
    \psi_{i} < \delta, i \notin \{j_1,j_2,...j_q\}
    \end{cases}
\end{aligned}\label{each_partition}
\end{equation}

We use the following programming for substitution:

\begin{equation}
    \begin{aligned}
    &\text{min~} f(x,y) + \sum_{i\in \{j_1,j_2,...j_q\} } \frac{\theta_i \omega_i}{\psi_i}\\
&\text{subject to~} \bm{\phi} \in IR_{\bm{\Phi}}.\begin{cases} \psi_{i}  \geq \delta, i\in \{j_1,j_2,...j_q\} \subseteq \{1,2,...d\} \\
    \psi_{i} < \delta, \theta_i = 0, i \notin \{j_1,j_2,...j_q\}
    \end{cases} .
    \end{aligned}\label{eqn_larger_delta}
\end{equation}

Notice that the programming \eqref{eqn_larger_delta} keeps the analogous form with the original formulation, just with additional linear constraints. Hence we can directly adopt PI-SFP algorithm to do optimization on \eqref{eqn_larger_delta}. Moreover, we only need to prove the optimal value of \eqref{eqn_larger_delta} is a good estimate of that of \eqref{each_partition}. 

Consider the optimal value of \eqref{eqn_larger_delta} and \eqref{each_partition} are $\bm{\gamma}^{{opt}_1}, \bm{\gamma}^{{opt}_2}$.
\fi

\subsection{Valid bound of average causal effect (ACE)}\label{ACE_APP}
The identification region of $f(Y_x)$ is constructed as follows. 
\begin{equation}
    \begin{aligned}
    &IR_{F(Y_x)} = \{f(Y_x): \int_{Y^L}^{Y^U}f({Y_x=y})dy =1, \\ & \forall y \in [Y^{L}, Y^{U}], f(y,\bm{U}, \bm{W}, \bm{X}) \in \mathcal{F} \}.
    \end{aligned}
\end{equation}

Then the valid bound of $ACE_{\bm{X} \rightarrow \bm{Y}}$ can be denoted as $[\underline{ ACE_{\bm{X} \rightarrow \bm{Y}}}, \overline{ ACE_{\bm{X} \rightarrow \bm{Y}}}]$:
% \begin{equation}
%     \begin{aligned}
%     &\underline{ ACE_{\bm{X} \rightarrow \bm{Y}}} \leq \min\{ ACE_{\bm{X} \rightarrow \bm{Y}} = \int_{{X}^L}^{{X}^U} \int_{Y^L}^{Y^U} f(Y_x=y)\pi(x) dx dy: f({Y_x}) \in IR_{F(Y_x)}\},~\\
%     &\overline{ ACE_{\bm{X} \rightarrow \bm{Y}}} \geq \max\{ ACE_{\bm{X} \rightarrow \bm{Y}} = \int_{X^{L}}^{X^{U}} \int_{Y^L}^{Y^U} f(Y_x=y)\pi(x) dx dy: f({Y_x}) \in IR_{F(Y_x)}\}.
%     \end{aligned}\label{tight_bound_ace}
% \end{equation}


\begin{equation}
    \begin{aligned}
    &\underline{ ACE_{\bm{X} \rightarrow \bm{Y}}} \leq \min\{ ACE_{\bm{X} \rightarrow \bm{Y}} \\&= \int_{{X}^L}^{{X}^U} \int_{Y^L}^{Y^U} f(Y_x=y)\pi(x) dx dy: f({Y_x}) \in IR_{F(Y_x)}\},~\\
    &\overline{ ACE_{\bm{X} \rightarrow \bm{Y}}} \geq \max\{ ACE_{\bm{X} \rightarrow \bm{Y}} \\&= \int_{X^{L}}^{X^{U}} \int_{Y^L}^{Y^U} f(Y_x=y)\pi(x) dx dy: f({Y_x}) \in IR_{F(Y_x)}\}.
    \end{aligned}\label{tight_bound_ace}
\end{equation}

$[\underline{ ACE_{\bm{X} \rightarrow \bm{Y}}}, \overline{ ACE_{\bm{X} \rightarrow \bm{Y}}}]$ is the valid bound of ACE. In our paper, we aim to design an algorithm to seek the valid bound of $f(Y_x = y)$, and then extend our strategy from bounding $f(Y_x = y)$ to bounding ACE. Analogously, we only need to consider the optimization technique for the minimum case; the maximum case is symmetric.


\subsection{The proof of Theorem.~\ref{convergence_theorem}}\label{app_main_result_1} 
 
 
\noindent {\textbf{The sketch of proof}} This is the main result of our paper. The main steps are as follows:
\begin{equation}
    \begin{aligned}
    & \mid \underline{f(Y_x = y)}  - \underline{f_{opt}^n (Y_x=y)} \mid \\
    \overset{}{=} &\mid \underline{f(Y_x = y)}  - \max\limits_{k \in \{0,1,...n\}} \underline{\underline{f_{\tilde{S}_{k}}(Y_x = y)}} \mid  &\text{Definition of~} \underline{f_{opt}^n (Y_x=y)}\\
     =& \min\limits_{k \in \{0,1,...n\}} \mid \underline{f(Y_x = y)}  -  \underline{\underline{f_{\tilde{S}_{k}}(Y_x = y)}} \mid \\
     \overset{\textbf{(1)}}{=} & \min\limits_{k \in \{0,1,...n\}} \mid \underline{f_{S_{0}}(Y_x = y)}  -  \underline{\underline{f_{\tilde{S}_{k}}(Y_x = y)}} \mid &{{\textbf{Initialization}}}\\
      \overset{\textbf{(2)}}{=} & \min\limits_{k \in \{0,1,...n\}} \mid \min\limits_{S\in \cS_k} {\underline{f_{S}(Y_x = y)}}  -  \underline{\underline{f_{{\tilde{S}_{k}}}(Y_x = y)}} \mid &{{\textbf{Bisection}}}\\
    \overset{*}{\leq} & \min\limits_{k \in \{0,1,...n\}} \mid \underline{f_{{\tilde{S}_{k}}}(Y_x = y)}  - \underline{{\underline{f_{{\tilde{S}_{k} }}(Y_x = y)}} } \mid \\
    {\leq}& \mid {{\underline{f_{\tilde{S}_{i_{L_n}  }}(Y_x = y)}} } - \underline{{\underline{f_{\tilde{S}_{i_{L_n}}}(Y_x = y)}} }\mid  \\
    \overset{\textbf{(3)}}{=} & O(dia(\tilde{S}_{i_{L_n}  })) &{{\textbf{Bounding}}}\\
    \overset{\textbf{(4)}}{=} & O((\frac{\sqrt{3}}{2})^{\lfloor\frac{L_n}{4d}\rfloor} ). &{{\textbf{Global\_error}}}
    \end{aligned}\label{final_conclusion}
\end{equation}
 
$*$ is directly by \textbf{(2)} and we have previously mentioned it in Eqn~\eqref{shrinking_diameter}. In the following demonstration, we mainly focus on procedure $\textbf{(1)(2)(3)(4)}$, corresponding to the algorithm part \textbf{Initialization}, \textbf{Bisection}, \textbf{Bounding}, \textbf{Global\_error} in order. 

\noindent{\textbf{ (1) Initialization()}} 
\noindent We will claim that $IR_{\Gamma} \subseteq S_{0}$.
\begin{lemma}
The original $S_{0}$ satisfies $IR_{\Gamma} \subseteq S_{0}$, and thus $\underline{f(Y_{x} = y)} = \underline{f_{S_{0}}(Y_{x} = y)}$.
\label{lemma_enclose}
\end{lemma}



\textbf{The proof of lemma.~\eqref{lemma_enclose}}
\noindent 
The simplex construction is as follows. $S_{0}$ is spanned by $\{S_{0}^0, S_{0}^1, ...S_{0}^{4d}\}$, where
 \begin{equation}
     \begin{aligned}
      S_{0}^{i} =  \begin{cases}
 \bm{\gamma^l},~i=0 \\ \bm{\gamma^l}+ (\alpha - \bm{1_{1*4d}}\bm{\gamma^l})*\bm{\vec{e_i}} ,~i \in \{1,2,...4d\}
 \end{cases}, \text{~where~} \bm{\gamma^{l}} = (\gamma^{l}_1, \gamma^{l}_2,...\gamma^{l}_{4d})^T,
     \end{aligned}\label{span_S00}
 \end{equation}

where $S_{0}^{i}$ is the supporting vertices set described in our main text. For each $ \bm{\gamma} \in IR_{\Gamma}$, we attempt to provide a direct construction as follows:
\begin{equation}
    \begin{aligned}
    \forall \bm{\gamma} \in IR_{\Gamma}, \text{we~have~} \bm{\gamma} \overset{*}{=} \sum_{i=0}^{4d}\beta_{i}S_{0}^i, ~\beta_i = \begin{cases}
1-\sum_{i=1}^{4d}\beta_i, i=0 \\ \frac{\bm{\gamma}\bm{\vec{e_i}}-\gamma_i^l}{\alpha - \bm{1_{1*4d}}\bm{\gamma^l}}, i=1,2,...4d \\
 \end{cases}, \beta_i \in [0,1],
    \end{aligned} \label{original_beta}
\end{equation}
To prove \eqref{original_beta}, we only need to prove the correctness of the equality $*$ and the fact $\beta_i \in [0,1], \forall i =0,1,...4d.$
 
First, we demonstrate the correctness of this construction. 
\begin{equation}
    \begin{aligned}
      \sum_{i=0}^{4d}\beta_{i}S_{0}^i &= \beta_0 S_{0}^0 + \sum_{i=1}^{4d}\beta_{i}S_{0}^i \\
      &= \beta_0 \bm{\gamma^l} + \sum_{i=1}^{4d}\beta_{i} \left(\bm{\gamma^l}+ (\alpha - \bm{1_{1*4d} }\bm{\gamma^l} )\bm{\vec{e_i}}\right) &\text{(definition of } S_{0}^i\text{)}\\
      &= (1-\sum_{i=1}^{4d}\beta_i) \bm{\gamma^l} + \sum_{i=1}^{4d}\beta_{i} \left(\bm{\gamma^l}+ (\alpha - \bm{1_{1*4d} }\bm{\gamma^l} )\bm{\vec{e_i}}\right) &\text{(definition of } \beta_0 \text{)}\\
      &= \bm{\gamma^l} + \sum_{i=1}^{4d}\frac{\bm{\gamma}\bm{\vec{e_i}}-\gamma_i^l}{\alpha - \bm{1_{1*4d}}\bm{\gamma^l}} \left((\alpha - \bm{1_{1*4d} }\bm{\gamma^l} )\bm{\vec{e_i}}\right) &\text{(definition of } \beta_i \text{)} \\
      &= \bm{\gamma^l} + \sum_{i=1}^{4d} \left(\bm{\gamma}\bm{\vec{e_i}}-\gamma_i^l \right)\bm{\vec{e_i}} = \bm{\gamma}.
    \end{aligned}
\end{equation}





Second, we claim $\forall i \in \{0,1,...4d\}, \beta_i \in [0,1]$. Since we already have $\beta_i \geq 0, i=1,2,...4d$ according to the construction of $\{\alpha, \bm{\gamma^l}\}$, we only need to prove the remaining part: $\beta_0 \geq 0$. Notice that
\begin{equation}
    \begin{aligned}
    \sum_{i=1}^{4d} \beta_i &= \sum_{i=1}^{4d}\frac{\bm{\gamma}\bm{\vec{e_i}}-\gamma_i^l}{\alpha - \bm{1_{1*4d}}\bm{\gamma^l}} = \frac{\bm{1_{1*4d}\bm{\gamma}} - \bm{1_{1*4d}{\bm{\gamma^l}}}}{\alpha - \bm{1_{1*4d}}\bm{\gamma^l}}.
    \end{aligned}
\end{equation}
Since $\beta_0 = 1-  \sum_{i=1}^{4d} \beta_i$, it is equivalent to prove 
\begin{equation}
    \bm{1_{1*4d}{\bm{\gamma}}} \leq \alpha = 1 + f(y, x) + 
\frac{d^2 (\psi^{l}+ \psi^{u})^2 }{4f(x) \psi^{l} \psi^{u}  }  ,
\end{equation}
where $\psi^{l}, \psi^{u}$ are identified in the main text. It is equivalent to
\begin{equation}
    \begin{aligned}
        \sum_{i=1}^{d} \psi^o_i + \sum_{i=1}^d \theta_i +  \sum_{i=1}^d \psi_i +  \sum_{i=1}^d \omega_i  \leq 1 + f(y, x) + 
\frac{d^2 (\psi^{l}+ \psi^{u})^2 }{4f(x) \psi^{l} \psi^{u}  },
    \end{aligned}
\end{equation}
namely that
\begin{equation}
    \begin{aligned}
        \sum_{i=1}^{d} \psi_{i}^o \leq \frac{d^2 (\psi^{l}+ \psi^{u})^2 }{4f(x) \psi^{l} \psi^{u}  }.
    \end{aligned}\label{inverse_chauchy}
\end{equation}
We only need to prove the inequality~\eqref{inverse_chauchy}. It follows from the fact that $(\psi_{i} -  \psi^{l})(\frac{1}{\psi_{i}} - \frac{1}{{\psi^{u}}}) \geq 0$, namely $1+\frac{ \psi^{l}}{ {\psi^{u}}} \geq \frac{ \psi^{l}}{\psi_{i}} + \frac{\psi_{i}}{\psi^{u}}$. Summing over $i$ and then applying the AM--GM inequality, we have
\begin{equation}
    \begin{aligned}
& (1+\frac{ \psi^{l}}{ {\psi^{u}}})d \geq  \psi^{l} \sum_{i=1}^{d} \frac{1}{\psi_{i}} + \frac{1}{ {\psi^{u}}} \sum_{i=1}^{d} \psi_{i} \geq 2\sqrt{\frac{ \psi^{l}}{{\psi^{u}}}} \sqrt{\sum_{i=1}^{d} \frac{1}{\psi_{i}}} \sqrt{f(x)}.
    \end{aligned}
\end{equation}
This is equivalent to
\begin{equation}
    \begin{aligned}
    \sum_{i=1}^{d} \psi_{i}^o = \sum_{i=1}^{d} \frac{1}{\psi_{i}}  \leq \frac{( \psi^{u} +  \psi^{l})^2 d^2}{4 f(x) \psi^{u} \psi^{l}}, \text{~and thus~} \sum_{i=1}^{4d}  \beta_i \in [0,1].
    \end{aligned}
\end{equation}



On this basis, $\beta_0 = 1-\sum_{i=1}^{4d}\beta_i \in [0,1]$. Combining this with $\beta_i \geq 0, i\in\{0,1,...4d\}$ and Eqn~\eqref{original_beta}, we conclude that $\forall \bm{\gamma} \in IR_{\Gamma}$, we have $\bm{\gamma} \in S_{0}$. Since $\bm{\gamma}$ is arbitrary, we have $IR_{\Gamma} \subseteq S_{0}$, and thus $\underline{f(Y_{x} = y)} = \underline{f_{S_{0}}(Y_{x} = y)}$.
\quad


\noindent{\textbf{(2) Bisection()}}
\noindent We introduce the following lemma:
\begin{lemma}\label{lemma_ObjS0_equal_ObjSk}
The partitioning set $\cS_{k}$ satisfies $\underline{f_{S_{0}}(Y_{x} = y)} = \min\limits_{S\in \cS_k}  \underline{f_{S}(Y_{x} = y)}$.

\end{lemma}

\textbf{The proof of lemma.~\eqref{lemma_ObjS0_equal_ObjSk}}
By the definition of the bisection process, $\tilde{S}_k$ is bisected into $\tilde{S}_{k1}, \tilde{S}_{k2}$. Then
\begin{equation}
    \begin{aligned}
        \cS_{k + 1} := \left(\cS_k \setminus \{\tilde{S}_k\}\right) \cup \{\tilde{S}_{k1}, \tilde{S}_{k2}\}.
    \end{aligned}
\end{equation}
Since $\tilde{S}_k = \tilde{S}_{k1} \cup \tilde{S}_{k2}$, we have $\cup_{S\in \cS_{k} } S = \cup_{S\in \cS_{k+1} } S, \forall k=0,1,...$. Thus $S_{0} = \cup_{S\in \cS_{k} } S$, and we have
\begin{equation}
\begin{aligned}
     \underline{f_{S_{0}}(Y_{x} = y)}
        = \underline{f_{(\cup_{S\in \cS_{k} } S)}(Y_x = y)}
        = \min\limits_{S\in \cS_k}  \underline{f_{S}(Y_{x} = y)} .
    \end{aligned}
\end{equation}
Hence the lemma is proved.
\quad


\noindent{\textbf{(3) Bounding()}}
\noindent We first introduce lemma.~\eqref{dc_decom_lemma} and lemma.~\eqref{tan_sec_bounded} for preparation, then the procedure \textbf{(3)} is proved by lemma.~\eqref{second_bounded}.

\begin{lemma}
The decomposition of \eqref{re-formulation} can be established as Eqn~\eqref{dc_decomposition}.
\label{dc_decom_lemma}
\end{lemma}
\textbf{The proof of lemma.~\eqref{dc_decom_lemma}}
Specifically, we give the explicit decomposition as follows, where the subscript $cyc$ denotes a cyclic sum over the symbol set $[\psi_{i}^o, \theta_{i}, \omega_{i}]$:
\begin{equation}
    \begin{aligned}
    & \psi_{i}^o \theta_{i} \omega_{i} \\
    = & \left[ \frac{1}{2}\sum\limits_{cyc} (\psi_{i}^o)^2 \theta_{i} + \frac{1}{2}\sum\limits_{cyc} (\psi_{i}^o) \theta_{i} ^2 + \psi_{i}^o \theta_{i} \omega_{i} \right] - \frac{1}{2}\sum\limits_{cyc} (\psi_{i}^o)^2 \theta_{i} - \frac{1}{2}\sum\limits_{cyc} (\psi_{i}^o) \theta_{i} ^2 \\
    =& \left[\frac{1}{6}(\sum\limits_{cyc} \psi_{i}^o )^3 -  \frac{1}{6}(\sum\limits_{cyc} (\psi_{i}^o)^3) \right] -  \frac{1}{2}\sum\limits_{cyc} (\psi_{i}^o)^2 \theta_{i} - \frac{1}{2}\sum\limits_{cyc} (\psi_{i}^o) \theta_{i} ^2 \\
    =& \left[\frac{1}{6}(\sum\limits_{cyc} \psi_{i}^o )^3 -  \frac{1}{6}(\sum\limits_{cyc} (\psi_{i}^o)^3) \right] + \frac{1}{2}\sum\limits_{cyc}(\psi_{i}^o)^4 - \frac{1}{4}\sum\limits_{cyc}(\psi_{i}^o)^4 - \frac{1}{4}\sum\limits_{cyc}(\theta_{i})^2 -  \frac{1}{2}\sum\limits_{cyc} (\psi_{i}^o)^2 \theta_{i} \\ &+  \frac{1}{2}\sum\limits_{cyc}(\psi_{i}^o)^2 - \frac{1}{4}\sum\limits_{cyc}(\psi_{i}^o)^2 - \frac{1}{4}\sum\limits_{cyc}(\theta_{i})^4 -  \frac{1}{2}\sum\limits_{cyc} \psi_{i}^o \theta_{i}^2 \\
    =& \left[\frac{1}{6}(\sum\limits_{cyc} \psi_{i}^o )^3 -  \frac{1}{6}(\sum\limits_{cyc} (\psi_{i}^o)^3) \right] + \frac{1}{2}\sum\limits_{cyc}(\psi_{i}^o)^4 - \frac{1}{4}\sum\limits_{cyc}((\psi_{i}^o)^2+\theta_{i})^2 + \frac{1}{2}\sum\limits_{cyc}(\psi_{i}^o)^2 - \\ &\frac{1}{4}\sum\limits_{cyc}(\psi_{i}^o+\theta_{i}^2)^2.
    \end{aligned}
\end{equation}
On this basis, if we choose 
\begin{equation}
    \begin{aligned}
    &C_1(\bm{\gamma}) =\sum_{i=1}^{d} \left[ \frac{1}{6}(\sum\limits_{cyc} \psi_{i}^o )^3+\frac{1}{2}\sum\limits_{cyc}(\psi_{i}^o)^4 + \frac{1}{2}\sum\limits_{cyc}(\psi_{i}^o)^2 \right],\\
    &C_2(\bm{\gamma}) = \sum_{i=1}^{d} \left[ \frac{1}{6}\sum\limits_{cyc} (\psi_{i}^o)^3 + \frac{1}{4}\sum_{cyc}[(\psi_{i}^o)^2+\theta_{i}]^2 + \frac{1}{4}\sum_{cyc}[\psi_{i}^o+\theta_{i}^2]^2\right],
    \end{aligned}
\end{equation}
then we have 
\begin{equation}
    \begin{aligned}
    \sum_{i=1}^{d}\psi_{i}^o \theta_{i} \omega_{i} = C_1(\bm{\gamma}) - C_2(\bm{\gamma}). 
    \end{aligned}
\end{equation}
Here the Hessian matrices $\frac{\partial^{2}C_1(\bm{\gamma})}{\partial^2( \bm{\gamma})}$ and $\frac{\partial^{2}C_2(\bm{\gamma})}{\partial^2( \bm{\gamma})}$ are positive semi-definite:
\begin{equation}
    \begin{aligned}
       \frac{\partial^{2}C_1(\bm{\gamma})}{\partial^2( \bm{\gamma})} =\frac{\partial^{2}C_2(\bm{\gamma})}{\partial^2( \bm{\gamma})} = \left[\bm{\gamma} + 6 \bm{\gamma} \circ \bm{\gamma} + \bm{1_{4d*1}}\right] \circ \left[\begin{matrix}
       \bm{1_{1*2d}}& \bm{0_{1*d}} &\bm{1_{1*d}}
       \end{matrix}\right]^T \geq \bm{0_{4d*1}},
    \end{aligned}
\end{equation}




where $\circ$ denotes the Hadamard product. Moreover,
\begin{equation}
    \begin{aligned}
    \frac{\partial^{2}D_{i1}(\bm{\gamma})}{\partial^2( \bm{\gamma})} = \frac{\partial^{2}D_{i2}(\bm{\gamma})}{\partial^2( \bm{\gamma})} = \left[\begin{matrix}
       \bm{1_{1*d}}& \bm{0_{1*d}} &\bm{1_{1*d}} & \bm{0_{1*d}}
       \end{matrix}\right]^T \geq \bm{0_{4d*1}}.
    \end{aligned}
\end{equation}
Hence the Hessian matrices of $D_{i1}(\bm{\gamma}), D_{i2}(\bm{\gamma})$ are also positive semi-definite.
\quad


On this basis, we further give the upper and lower bound of the convex function as follows:
\begin{lemma}
If function $F(\bm{\gamma})$ is differentiable and convex on a simplex $S$, then $F^{\text{tan}}(\bm{\gamma}) \leq F(\bm{\gamma}) \leq F^{\text{sec}}(\bm{\gamma})$, where $\bm{\gamma_0} \in S$. In our paper, function $F(\cdot)$ can be chosen as $C_1(\cdot),C_2(\cdot),D_{i1}(\cdot),D_{i2}(\cdot)$, and $F^{\text{tan}}(\cdot), F^{\text{sec}}(\cdot)$ have the same construction as in Formulation~\ref{tan_sec}.\label{up_low_bound_convex}  \footnote{The matrix of the starting simplex $\left[\begin{matrix} S_{0}^0, ... , S_{0}^{4d} \\ 1,...,1 \end{matrix}\right]^{}$ is invertible by the construction in lemma.~\eqref{lemma_enclose}. Moreover, the invertibility of $\left[\begin{matrix} S_{}^0, ... , S_{}^{4d} \\ 1,...,1 \end{matrix}\right]^{}, S\in \cS_{k}, k=0,1,...$ still holds during bisection, since each bisection can be seen as a linear transformation between different columns.} \label{tan_sec_bounded}
\end{lemma}

\textbf{The proof of lemma.~\eqref{tan_sec_bounded}}
The left part is intuitive. It is the tangent line equation of $F(\bm{\gamma})$. We only consider the right part by the convex property of $F(\bm{\gamma})$, whose construction is motivated by~\cite{pei2013global}. We use $\bm{\gamma}_i, i=1,2,...4d$ to denote the value of $\bm{\gamma}$ on each dimension ($\lambda_i \in [0,1],~\sum_{i=0}^{4d}\lambda_i = 1$):
\begin{equation}
    \begin{aligned}
    F(\bm{\gamma}) &= F(\sum_{i=0}^{4d} \lambda_i S^i) \leq \sum_{i=0}^{4d} \lambda_{i} F(S^i) 
    = \sum_{i=0}^{4d} \lambda_i [F(S^{0}), F(S^1), ..., F(S^{4d})] \left[\begin{matrix} S^0, ... , S^{4d} \\ 1,...,1 \end{matrix}\right]^{-1}[\begin{matrix}S^i \\ 1\end{matrix}] \\
    & = [F(S^{0}), F(S^1), ..., F(S^{4d})] \left[\begin{matrix} S^0, ... , S^{4d} \\ 1,...,1 \end{matrix}\right]^{-1}[\begin{matrix}\bm{\gamma} \\ 1\end{matrix}] = F^{\text{sec}}(\bm{\gamma}).
    \end{aligned}
\end{equation}
Hence we have proved our lemma.
\quad


On this basis, we can claim \eqref{lemma_sub_linear} provides the lower bound of $\underline{f_{S}(Y_x = y)}$, namely $\underline{\underline{f_{S}(Y_x = y)}} \leq \underline{f_{S}(Y_x = y)}$. After the above difference-in-convex linear construction, we introduce the following lemma to approximate $\underline{f_{S_{}}(Y_x = y)}$ by $\underline{\underline{f_{S_{}}(Y_x = y)} } $:

\begin{lemma}

    $\forall S, \mid \underline{f_{S}(Y_x = y)}  - \underline{\underline{f_{S}(Y_x = y)} }      \mid \leq A * dia(S_{})^2$,
 where
 $A = \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial{(C_1(\bm{\gamma}) - C_2(\bm{\gamma}))}}{\partial{\bm{\gamma}}}\|\frac{2(\sqrt{2}+1)\sqrt{d}}{\delta}  + \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial^2 C_1(\bm{\gamma)}}{\partial \bm{\gamma}^2}\|_F +  \frac{1}{2} \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial^2 C_2(\bm{\gamma)}}{\partial \bm{\gamma}^2} \|_F < +\infty$.
\label{second_bounded}
\end{lemma} 
\textbf{The proof of lemma.~\eqref{second_bounded}}
Since $dia(S_{0}) < +\infty$, we have that each element of $\bm{\gamma} \in S_{0}$ can be bounded, namely $\| \bm{\gamma} \|_{+\infty}<+\infty $. Then $\|\frac{\partial C_1(\bm{\gamma})}{\partial \bm{\gamma}}\|$,  $\|\frac{\partial C_2(\bm{\gamma})}{\partial \bm{\gamma}}\|$, $\|\frac{\partial^2 C_1(\bm{\gamma})}{\partial \bm{\gamma}^2}\|_F$, $\|\frac{\partial^2 C_2(\bm{\gamma})}{\partial \bm{\gamma}^2}\|_F$ are all finite. Here $\|\cdot \|$ denotes the Euclidean norm, and $\|\cdot \|_F$ denotes the Frobenius norm. 

Denote the optimal solutions corresponding to 
$\underline{f_S(Y_x = y)}$ and $\underline{\underline{f_S(Y_x = y)}}$ by $\underline{\bm{\gamma}}$ and $\underline{\underline{{\bm{\gamma}}}}$, respectively ($\underline{\bm{\gamma}}, \underline{\underline{{\bm{\gamma}}}} \in S$). Then, according to lemma.~\eqref{tan_sec_bounded}, $\mid \underline{f_{S_{}}(Y_x = y)}  - \underline{\underline{f_{S_{}}(Y_x = y)} }\mid$ can be bounded as follows:
\begin{equation}
\begin{aligned}
       0 \leq &{\underline{f_{S_{}}(Y_x = y)}}  - \underline{\underline{f_{S_{}}(Y_x = y)} } \\ = &\mid C_1({\underline{{\bm{\gamma}}}}) - C^{\text{tan}}_1(\underline{\underline{{\bm{\gamma}}}})  - C_2({\underline{{\bm{\gamma}}}}) + C^{\text{sec}}_2(\underline{\underline{{\bm{\gamma}}}}) \mid\\
       \leq &\mid C_1(\underline{\underline{{\bm{\gamma}}}}) -  C^{\text{tan}}_1(\underline{\underline{{\bm{\gamma}}}})  -
       C_2(\underline{\underline{{\bm{\gamma}}}}) + C^{\text{sec}}_2(\underline{\underline{{\bm{\gamma}}}}) \mid + 
       \mid C^{\text{}}_1({\underline{{\bm{\gamma}}}}) -  C^{\text{}}_1(\underline{\underline{{\bm{\gamma}}}}) - C^{\text{}}_2({\underline{{\bm{\gamma}}}}) +  C^{\text{}}_2(\underline{\underline{{\bm{\gamma}}}}) \mid \\
       \overset{*}{\leq} & {\underbrace{\mid C_1(\underline{\underline{{\bm{\gamma}}}}) -  C^{\text{tan}}_1(\underline{\underline{{\bm{\gamma}}}}) \mid }_{(1)}} + \underbrace{\mid  C_2(\underline{\underline{{\bm{\gamma}}}}) - C^{\text{sec}}_2(\underline{\underline{{\bm{\gamma}}}}) \mid}_{(2)} +
       \underbrace{\mid (C^{\text{}}_1({\underline{{\bm{\gamma}}}}) -  C^{\text{}}_2({\underline{{\bm{\gamma}}}})) - ( C^{\text{}}_1({\underline{\underline{{\bm{\gamma}}}}}) -  C^{\text{}}_2(\underline{\underline{{\bm{\gamma}}}}))  \mid}_{(3)} \\
       \end{aligned}\label{linear_bound_main}
\end{equation}

\noindent \textbf{item~(1)}: We consider the last line of \eqref{linear_bound_main}. By Taylor expansion, the tangent approximation satisfies the following bound:
\begin{equation}
    \mid C_1(\bm{\gamma}) - C^{\text{tan}}_1(\bm{\gamma}) \mid = O(\max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial^2 C_1(\bm{\gamma})}{\partial \bm{\gamma}^2} \|_F (dia(S_{}))^2) = O( dia(S_{})^2). \label{bound_1}
\end{equation}



\noindent \textbf{item~(2)}: On the other hand, note that $\bm{\gamma} = \sum\limits_{j=0}^{4d} \lambda_j S_{}^j$, here $\sum\limits_{j=0}^{4d} \lambda_j = 1, \lambda_j \geq 0$:
\begin{equation}
    \begin{aligned}
    \mid C_2(\bm{\gamma}) - C^{\text{sec}}_2(\bm{\gamma}) \mid  =& [C_2(S_{}^0), C_2(S_{}^1),... C_{2}(S_{}^{4d})] \left[\begin{matrix} S_{}^0, ... , S_{}^{4d} \\ 1,...,1 \end{matrix}\right]^{-1}\left[\begin{matrix}{\sum\limits_{i=0}^{4d} \lambda_i S_{}^{i}} \\ 1\end{matrix}\right] - C_2(\sum_{j=0}^{4d} \lambda_j S_{}^{j}) \\
    = &  \sum_{j=0}^{4d} \lambda_j C_2(S_{}^j) -C_2(\sum_{j=0}^{4d} \lambda_j S_{}^{j}).
    \label{bound_2}
    \end{aligned}
\end{equation}

We now aim to bound Eqn~\eqref{bound_2}, inspired by \citet{budimir2001further}. For simplicity, we use $\nabla$ to denote the gradient. Notice that a differentiable convex function has the following property:
\begin{equation}
    \begin{aligned}
    C_2(\sum\limits_{i=0}^{4d}\lambda_i S^i ) - C_2(S^j) \geq \langle  \nabla C_2(S^j), \sum\limits_{i=0}^{4d} \lambda_i S^i - S^j\rangle .
    \end{aligned}
\end{equation}
By summation, we have
\begin{equation}
    \begin{aligned}
    \eqref{bound_2} &= \sum\limits_{j=0}^{4d} \lambda_j C_2(S^j) - C_2(\sum\limits_{i=0}^{4d}\lambda_i S^i ) \\ &\leq \sum_{j=0}^{4d} \lambda_j \langle  \nabla C_2(S^j), -\sum\limits_{i=0}^{4d} \lambda_i S^i + S^j\rangle \\
    &= \sum_{j=0}^{4d} \lambda_j \langle \nabla C_2(S^j), S^j \rangle - \langle \sum_{j=0}^{4d} \lambda_j S^j, \sum_{j=0}^{4d} \lambda_j \nabla C_2(S^j) \rangle
    \end{aligned}\label{inverse_qinsen}
\end{equation}

The right-hand side of \eqref{inverse_qinsen} equals
\begin{equation}
    \begin{aligned}
     &\frac{1}{2} \sum_{i=0}^{4d} \sum_{j=0}^{4d} \lambda_i \lambda_j \left[\left[ \langle \nabla C_2(S^j), S^j \rangle + \langle \nabla C_2(S^i), S^i  \rangle \right]- \left[\langle \nabla C_2(S^j), S^i \rangle + \langle \nabla C_2(S^i), S^j \rangle\right] \right] \\
     = & \frac{1}{2} \sum_{i=0}^{4d} \sum_{j=0}^{4d} \lambda_i \lambda_j \langle S^i - S^j, \nabla C_2(S^i) - \nabla C_2(S^j) \rangle \\
     \leq & \frac{1}{2} \sum_{i=0}^{4d} \sum_{j=0}^{4d} \lambda_i \lambda_j \| S^i - S^j\| \|\nabla C_2(S^i) - \nabla C_2(S^j) \| \\
     \leq & \frac{1}{2} \sum_{i=0}^{4d} \sum_{j=0}^{4d} \lambda_i \lambda_j (\max\limits_{\bm{\gamma} \in S} \|\frac{\partial^2 C_2(\bm{\gamma})}{\partial \bm{\gamma}^2}\|_F) \| S^i - S^j\|^2 \\
     \leq & \frac{1}{2} \sum_{i=0}^{4d} \sum_{j=0}^{4d} \lambda_i \lambda_j (\max\limits_{\bm{\gamma} \in S}\|\frac{\partial^2 C_2(\bm{\gamma})}{\partial \bm{\gamma}^2}\|_F) dia(S)^2 \\
     \leq & \frac{1}{2}(\max\limits_{\bm{\gamma} \in S_0}\|\frac{\partial^2 C_2(\bm{\gamma})}{\partial \bm{\gamma}^2}\|_F) dia(S)^2 .
    \end{aligned}\label{bound_chauchy}
\end{equation}

\iffalse
$(\uppercase\expandafter{\romannumeral1})$: In order to bound $(\uppercase\expandafter{\romannumeral1})$, we first introduce a pair of vectors $S^{u}, S^{l}$. Notice that $\forall S^u, S^{l} \in {\mathbb{R}}^{4d}$, we have
\begin{equation}
    \begin{aligned}
    ({\uppercase\expandafter{\romannumeral1}})^2 =& \sum\limits_{j=0}^{4d} \lambda_{j} \|S^j\|^2 - \|\sum\limits_{j=0}^{4d} \lambda_{j} S^j \|^2 \\
    = & \langle S^u - \sum\limits_{j=0}^{4d} \lambda_{j} S^j,  \sum\limits_{j=0}^{4d} \lambda_{j} S^j - S^l \rangle - \sum\limits_{j=0}^{4d}\lambda_j \langle S^u - S^j, S^j - S^l \rangle\\
    \leq & \underbrace{\frac{1}{4} \| S^u - S^l \|^2}_{({\uppercase\expandafter{\romannumeral3}})} - \underbrace{\sum\limits_{j=0}^{4d}\lambda_j \langle S^u - S^j, S^j - S^l \rangle}_{ ({\uppercase\expandafter{\romannumeral4}})} 
    \end{aligned}\label{bound_su_sl}
\end{equation}
In \eqref{bound_su_sl}, we choose $S^u, S^l \in {\mathbb{R}}^{4d}$ as follows:
\begin{equation}
    \begin{aligned}
    \begin{cases}
    \langle S^u, S^l \rangle = -\frac{1}{2}dia(S)^2 + \sum\limits_{i=0}^{4d}\sum\limits_{j=0}^{4d}\lambda_i \lambda_j S^i S^j \\
    S^u + S^l = 2\sum\limits_{i=0}^{4d}\lambda_i S^i
    \end{cases}
    \end{aligned}
\end{equation}
Then 
\begin{equation}
    \begin{aligned}
    ({\uppercase\expandafter{\romannumeral3}}) &= \frac{1}{4} \| S^u + S^l \|^2 - \langle S^u, S^l \rangle = \frac{1}{2}dia(S)^2. \\
    ({\uppercase\expandafter{\romannumeral4}}) &=- \sum\limits_{j=0}^{4d} \lambda_j \left[ \langle S^u, S^l \rangle + \langle S^j, S^j \rangle - \langle S^u+S^l, S^j \rangle\right] \\
    &= -\langle S^u, S^l \rangle  - \sum\limits_{j=0}^{4d} \lambda_j \langle S^j, S^j \rangle  + \langle S^u+S^l, \sum\limits_{j=0}^{4d} \lambda_j S^j \rangle \\
    &= \frac{1}{2}dia(S)^2 + \sum\limits_{i=0}^{4d}\sum\limits_{j=0}^{4d}\lambda_i \lambda_j S^i S^j - \frac{1}{2}\sum\limits_{i=0}^{4d}\sum\limits_{j=0}^{4d} \lambda_i \lambda_j \left(\langle S^i, S^i \rangle + \langle S^j, S^j \rangle\right) \\
    &= \frac{1}{2}dia(S)^2 - \frac{1}{2}\sum\limits_{i=0}^{4d}\sum\limits_{j=0}^{4d}\lambda_i \lambda_j \langle S^i - S^j, S^i - S^j \rangle \\
    &\geq \frac{1}{2}dia(S)^2 - \frac{1}{2}\sum\limits_{i=0}^{4d}\sum\limits_{j=0}^{4d}\lambda_i \lambda_j dia(S)^2 =0
    \end{aligned}
\end{equation}

Hence
\begin{equation}
    \begin{aligned}
    ({\uppercase\expandafter{\romannumeral1}}) \leq \frac{1}{\sqrt{2}}dia(S).
    \end{aligned}
\end{equation}

({\uppercase\expandafter{\romannumeral2}}): Analogously, we aim to bound ({\uppercase\expandafter{\romannumeral2}}):
\fi

%we use another proof






We have
\begin{equation}
    \begin{aligned}
         0 \leq \sum_{j=0}^{4d} \lambda_j C_2(S_{}^j) -C_2(\sum_{j=0}^{4d} \lambda_j S_{}^{j}) \leq \frac{1}{2} \max\limits_{\bm{\gamma} \in S_{}} \|\frac{\partial^2 C_2(\bm{\gamma)}}{\partial \bm{\gamma}^2} \|_F (dia(S_{}))^2 = O((dia(S_{}))^2).
    \end{aligned}
\end{equation}
Thus 
\begin{equation}
    \begin{aligned}
    \text{Eqn~\eqref{bound_2}} &= \sum_{j=0}^{4d} \lambda_j C_2(S_{}^j) - C_2(\sum_{j=0}^{4d} \lambda_j S_{}^{j}) 
    = O ((dia(S_{}))^2). \label{bound_2_final}
    \end{aligned}
\end{equation}

\noindent \textbf{item~(3)}: We introduce an auxiliary optimization problem as follows:
\begin{equation}
    \begin{aligned}
    &\text{min~}f(y ,x) + C_1^{\text{}}(\bm{\gamma}) - C_2^{\text{}}(\bm{\gamma}) \\
    &\text{subject to}: \bm{\phi} \in IR_{{\Phi}}, D_{i1}^{\text{}}(\bm{\gamma}) - D_{i2}^{\text{}}(\bm{\gamma}) = 1 , \\ &~~~~~~~~~~~~~~~~~~~\bm{\gamma} \in \{\bm{\gamma}^{'}: \text{} \exists \bm{\gamma}^{''}\in S, \|\bm{\gamma}^{'} - \bm{\gamma}^{''}\|\leq \frac{(\sqrt{2}+1)\sqrt{d}}{\delta} dia(S)^2 \} \cap S_0.
    \end{aligned}   \label{auxiliary-formulation}
\end{equation}

Compared with the optimization problem of $\underline{ f_S(Y_x = y)}$ (which is \eqref{auxiliary-formulation} with the additional constraint $\bm{\gamma} \in S$), \eqref{auxiliary-formulation} provides a relaxed constraint on $\bm{\gamma}$. We denote the optimal solution of \eqref{auxiliary-formulation} as $\uwave{\bm{\gamma}}$, and the optimal value as $\uwave{f_S(Y_x = y)}$.


On the one hand, \eqref{auxiliary-formulation} slightly relaxes the constraint $\bm{\gamma} \in S$. Namely, for each $\uwave{\bm{\gamma}}$, there exists a corresponding $\bm{\gamma}^{''} \in S$ within distance $\frac{(\sqrt{2}+1)\sqrt{d}}{\delta}dia(S)^2$. Hence
\begin{equation}
    \begin{aligned}
    &[C_1(\underline{\bm{\gamma}}) -  C_2(\underline{\bm{\gamma}})] - [C_1(\uwave{\bm{\gamma}}) -  C_2(\uwave{\bm{\gamma}})] \\
    \leq& [C_1({\bm{\gamma}}^{''}) -  C_2({\bm{\gamma}}^{''})] - [C_1(\uwave{\bm{\gamma}}) -  C_2(\uwave{\bm{\gamma}})] \\
    \leq& \max\limits_{\bm{\gamma} \in S_0} \|\frac{\partial (C_1(\bm{\gamma}) - C_2(\bm{\gamma}))}{\partial \bm{\gamma}}\| \left(\frac{(\sqrt{2}+1)\sqrt{d}}{\delta}dia(S)^2\right).
    \end{aligned}\label{claim_1}
\end{equation}

On the other hand, we consider the optimal solution $\underline{\underline{\bm{\gamma}}}$ of $\underline{\underline{f_S(Y_x = y)}}$. We write its components as $\underline{\underline{\bm{\gamma}}}^{T} = ((\underline{\underline{\bm{\psi^o}}})^T, \underline{\underline{\bm{\theta}}}^T, \underline{\underline{\bm{\psi}}}^T, \underline{\underline{\bm{\omega}}}^T)$. Then we introduce an auxiliary solution as follows:
\begin{equation}
    \begin{aligned}
    &{\uwave{\psi^o_i}} = \frac{1}{\underline{\underline{\psi_i}}}, {\uwave{\bm{\psi^o}}} = ({\uwave{\psi^o_1}},... {\uwave{\psi^o_{d}}}), \uwave{\bm{\gamma}}^{'} =  (({\uwave{\bm{\psi^o}}})^T, \underline{\underline{\bm{\theta}}}^T, \underline{\underline{\bm{\psi}}}^T, \underline{\underline{\bm{\omega}}}^T).
    \end{aligned}\label{identification_new_gamma}
\end{equation}
We will show that $\uwave{\bm{\gamma}}^{'}$ is within the feasible region of \eqref{auxiliary-formulation}. By identification in \eqref{identification_new_gamma}, the first row of constraints in \eqref{auxiliary-formulation} can be directly satisfied. Moreover, by Assumption.~\ref{positive definite assumption}, we have 

\begin{equation}
    \begin{aligned}
    \|\uwave{\bm{\gamma}}^{'} - \underline{\underline{\bm{\gamma}}}\| =&  \left(\sum\limits_{i=1}^{d} (\frac{1}{\underline{\underline{\psi_i}}} - \underline{\underline{\psi_i^o}} )^2\right)^{\frac{1}{2}} \\
    \leq& \frac{1}{\delta} (\sum\limits_{i=1}^{d}( \underline{\underline{\psi_i}}\underline{\underline{\psi_i^o}} - 1)^2)^{\frac{1}{2}}\\
    =& \frac{1}{\delta} (\sum\limits_{i=1}^{d}( D_{i1}(\bm{\underline{\underline{\gamma}}}) - D_{i2}(\bm{\underline{\underline{\gamma}}}) - 1)^2)^{\frac{1}{2}}\\
    \leq& \frac{\sqrt{d}}{\delta} \max\limits_{i=1,...d} \left(1- \left(D_{i1}(\bm{\underline{\underline{\gamma}}}) - D_{i2}(\bm{\underline{\underline{\gamma}}})\right) \right) \\
    \leq& \frac{\sqrt{d}}{\delta} \max\limits_{i=1,...d}\left[ \left( D_{i1}^{sec}(\bm{\underline{\underline{\gamma}}}) - D_{i2}^{tan}(\bm{\underline{\underline{\gamma}}})\right) - \left( D_{i1}^{}(\bm{\underline{\underline{\gamma}}}) - D_{i2}^{}(\bm{\underline{\underline{\gamma}}})\right)\right].
    \end{aligned}\label{gamma_difference_1}
\end{equation}


Symmetrically, we have 
\begin{equation}
    \begin{aligned}
    \|\uwave{\bm{\gamma}}^{'} - \underline{\underline{\bm{\gamma}}}\| \leq \frac{\sqrt{d}}{\delta} \max\limits_{i=1,...d}\left[ \left( D_{i1}^{}(\bm{\underline{\underline{\gamma}}}) - D_{i2}^{}(\bm{\underline{\underline{\gamma}}})\right) - \left( D_{i1}^{tan}(\bm{\underline{\underline{\gamma}}}) - D_{i2}^{sec}(\bm{\underline{\underline{\gamma}}})\right) \right].
    \end{aligned}\label{gamma_difference_2}
\end{equation}


By the same strategy in \textbf{item(1)-(2)}, and noticing the fact that 
\begin{equation}
    \begin{aligned}
    \max\limits_{\bm{\gamma} \in S_{}} \|\frac{\partial^2 D_{i1}(\bm{\gamma)}}{\partial \bm{\gamma}^2} \|_F = 2, \max\limits_{\bm{\gamma} \in S_{}} \|\frac{\partial^2 D_{i2}(\bm{\gamma)}}{\partial \bm{\gamma}^2} \|_F = \sqrt{2},
    \end{aligned}
\end{equation}
\eqref{gamma_difference_1} and \eqref{gamma_difference_2} can be combined as
\begin{equation}
    \begin{aligned}
    &\|\uwave{\bm{\gamma}}^{'} - \underline{\underline{\bm{\gamma}}}\|  \leq   \frac{\sqrt{d}}{\delta} \min \{\frac{1}{2}*2+\sqrt{2}, 2 + \frac{1}{2}\sqrt{2}\} dia(S)^2 = \frac{\sqrt{d}}{\delta} (\sqrt{2}+1)dia(S)^2.
    \end{aligned}
\end{equation}

Hence we claim this $\uwave{\bm{\gamma}}^{'}$ is within the feasible region of \eqref{auxiliary-formulation}. Then
\begin{equation}
    \begin{aligned}
     \left[C_1(\uwave{\bm{\gamma}}) - C_2(\uwave{\bm{\gamma}})\right] - \left[C_1(\underline{\underline{\bm{\gamma}}}^{}) - C_2(\underline{\underline{\bm{\gamma}}}^{})\right]  &\leq \left[ C_1(\uwave{\bm{\gamma}}^{'}) - C_2(\uwave{\bm{\gamma}}^{'}) \right] - \left[C_1(\underline{\underline{\bm{\gamma}}}^{}) - C_2(\underline{\underline{\bm{\gamma}}}^{})\right] \\
     &\leq \max\limits_{\bm{\gamma} \in S_{}} \|\frac{\partial{(C_1(\bm{\gamma}) - C_2(\bm{\gamma}))}}{\partial{\bm{\gamma}}}\|  \frac{\sqrt{d}}{\delta} (\sqrt{2}+1)dia(S)^2.
     \end{aligned}\label{claim_2}
\end{equation}

Combining \eqref{claim_1} and \eqref{claim_2}, we have
\begin{equation}
    \begin{aligned}
    \mid (C^{\text{}}_1({\underline{{\bm{\gamma}}}}) -  C^{\text{}}_2({\underline{{\bm{\gamma}}}})) - ( C^{\text{}}_1({\underline{\underline{{\bm{\gamma}}}}}) -  C^{\text{}}_2(\underline{\underline{{\bm{\gamma}}}}))  \mid \leq \max\limits_{\bm{\gamma} \in S_{0}} \|\frac{\partial{(C_1(\bm{\gamma}) - C_2(\bm{\gamma}))}}{\partial{\bm{\gamma}}}\|  \frac{2\sqrt{d}}{\delta} (\sqrt{2}+1)dia(S)^2.
    \end{aligned}
\end{equation}

\noindent \textbf{Combination of} \textbf{item(1)-(3)} Combining with Eqn~\eqref{bound_1} and Eqn~\eqref{bound_2_final}, and recalling the bound in (\ref{linear_bound_main}), we have:
\begin{equation}
    \underline{\underline{f_{S_{}}^{}(Y_x = y)}} \leq \underline{f_{S_{}}(Y_x = y)} \leq   \underline{\underline{f_{S_{}}^{}(Y_x = y)}} +A * dia(S_{})^2.
\end{equation}
In brief, we have $\mid  \underline{\underline{f_{S_{}}^{}(Y_x = y)}} -  {\underline{f_{S_{}}^{}(Y_x = y)}}\mid = O(dia(S)^2)$. Thus we have proved our lemma.
\quad

\begin{remark}
We can do enhancement in $\textbf{Bounding}$ as follows. It is through taking advantage of the information from the parent simplex $\text{pa}(S)$ \footnote{$S_1 = \text{pa}(S_2)$ denotes $S_2$ is bisectioned from $S_1$.} and encapsulating the above bounding strategy into a recursive form during partitioning.
\begin{equation}
    \begin{aligned}
        \underline{\underline{f_{S_{}}(Y_{x} = y)}} =  &\max \{ \textbf{Bounding}(\text{pa}(S_{})), \underline{\underline{f_{S_{}}^{}(Y_{x} =  y)}} \}.
    \end{aligned}
\end{equation}
\end{remark}


\noindent{\textbf{(4) \textbf{Global\_error}}}
For the final preparation, we introduce the bisection theorem:
\begin{theorem}[\cite{kearfott1978proof}, Theorem~3.1]
When $\tilde{S}_{i_{k}}$ is bisectioned from $S_{0}$ by $k$ times, we have
$
    dia(\tilde{S}_{i_{k}} ) \leq  (\frac{\sqrt{3}}{2})^{\lfloor\frac{k}{4d}\rfloor } dia(S_{0}). 
 $
\end{theorem}

On this basis, notice that Lemma~\ref{second_bounded} holds on each iteration and $dia(S_{0})<+\infty$; then we have 
\begin{equation}
    \begin{aligned}
    \mid {\underline{f_{\tilde{S}_{i_{L_n} }}(Y_x = y)}}  -    \underline{\underline{f_{\tilde{S}_{i_{L_n} }}(Y_x = y)}} \mid  \leq A ((\frac{{3}}{4})^{\lfloor\frac{L_n}{4d}\rfloor} )  = O((\frac{{3}}{4})^{\frac{L_n}{4d}}),
    \end{aligned}\label{second_bounded_result}
\end{equation}
where $A$ is identified in our main text.
~\quad \\

Until here we have proved procedure \textbf{(1)-(4)}, thus the main part of Theorem.~\ref{convergence_theorem} has been proved.
~\quad \\

Additionally, consider the infinite case. Since $L_n \geq \log(n)$ (the worst case is that the set of simplices is bisectioned like a complete binary tree), we have $L_n\rightarrow +\infty$ when $n\rightarrow +\infty$, thus 
$
    \lim_{n\rightarrow +\infty} \mid \underline{f(Y_x = y)}  - \underline{f_{opt}^n (f(Y_x=y))} \mid = 0.$
     Done.



\subsection{Extension to the ACE case}\label{app_corollary}
\label{corollary_ACE_appendix}


Taking advantage of PI-SFP, we can further achieve the valid bound of $ACE_{\bm{X} \rightarrow \bm{Y}}$. The above PI-SFP algorithm is to seek $\underline{f(Y_x = y)}$ when $x$ is fixed. We do further extension to consider all values of $\bm{X}$ simultaneously. In this sense, we reorganize (\ref{eqn_basic_bound}) to bound ACE as follows:
% \begin{equation}
%     \begin{aligned}
%     &\text{min~} \sum_x \pi(x) \int_{Y^L}^{Y^U} yf(y,x) dy +  \sum_x \pi(x) \sum_{i=1}^{d} \frac{ \left(\int_{Y^L}^{Y^U}yf(y,u_i,x)dy\right) f(u_i, \neg x)}{f(u_i,x)}\\  &\text{subject to: $f(y,\bm{U},\bm{W}, \bm{X}) \in \mathcal{F}$}.
%     \label{eqn_basic_bound_ace}
%     \end{aligned}
% \end{equation}


{
\begin{equation}
    \begin{aligned}
    &\text{min~} \sum_x \pi(x) \int_{Y^L}^{Y^U} yf(y,x) dy \\&+  \sum_x \pi(x) \sum_{i=1}^{d} \frac{ \left(\int_{Y^L}^{Y^U}yf(y,u_i,x)dy\right) f(u_i, \neg x)}{f(u_i,x)}\\  &\text{subject to: $f(y,\bm{U},\bm{W}, \bm{X}) \in \mathcal{F}$}.
    \label{eqn_basic_bound_ace}
    \end{aligned}
\end{equation}
}
Using the same strategy as in Sections~\ref{framework}--\ref{section_algorithm}, we can achieve the valid bound of ${ ACE_{\bm{X} \rightarrow \bm{Y}}}$ in~(\ref{tight_bound_ace}).



We first illustrate the construction of (\ref{eqn_basic_bound_ace}):
\begin{equation}
\begin{aligned}
     &\int_{X^L}^{X^U} \int_{Y^L}^{Y^U} y f(Y_x=y)\pi(x) dy dx \\
     =&\int_{X^L}^{X^U} \int_{Y^L}^{Y^U} y \left(f(y,x) + \sum_{i=1}^{d} \frac{f(y,u_i,x) f(u_i,\neg x)}{f(u_i,x)}\right)\pi(x) dy dx \\
     =& \sum_x \pi(x) \int_{Y^L}^{Y^U} yf(y,x) dy +  \sum_x \pi(x) \sum_{i=1}^{d} \frac{ \left(\int_{Y^L}^{Y^U}yf(y,u_i,x)dy\right) f(u_i, \neg x)}{f(u_i,x)}.
\end{aligned}
\end{equation}

Let $X=\{x_{1},x_{2},...x_{d_x}\}$. In this section, we extend the PI-SFP method from bounding $f(Y_x = y)$ to bounding $ACE$. For simplicity, we extend the notation of our main text as follows:
\begin{equation}
    \begin{aligned}
    &\theta_{i \mid x} = {\int_{Y^{L}}^{Y^U}yf(y,U=u_i,x)dy},~~~~~~\bm{\theta_x} = (\theta_{1\mid x}, \theta_{2\mid x}, ...\theta_{d \mid x})^T,\\ 
    &\psi_{i \mid x} = {f(U=u_i,x)},~~~~~~~~~~~~~~~~~~~~~~~~\bm{\psi_x} = (\psi_{1\mid x}, \psi_{2\mid x}, ...\psi_{d \mid x})^T,\\ 
    &\omega_{i \mid x} = {f(U=u_i,\neg x)}, ~~~~~~~~~~~~~~~~~~~~~~~~ \bm{\omega_x} = (\omega_{1\mid x}, \omega_{2\mid x}, ...\omega_{d \mid x})^T,\\
    &\psi_{i \mid x} \psi_{i \mid x}^o = 1,  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\bm{\psi_x^o} = (\psi^o_{1\mid x}, \psi^o_{2\mid x}, ...\psi^o_{d \mid x})^T,\\
    &\bm{\phi_x} = (\bm{\theta_{x}}, \bm{\psi_{x}}, \bm{\omega_{x}}), ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\bm{\gamma}_x = \left(\begin{matrix}
    (\bm{\psi_{x}^o})^T,  \bm{\theta_{x}}^T,  \bm{\psi_x}^T,  \bm{\omega_x}^T
    \end{matrix}\right)^T.
    \end{aligned}
\end{equation}
On this basis, the independent variables are transformed to $\bm{\gamma} = (\bm{\gamma}_{x_1}, \bm{\gamma}_{x_2},...\bm{\gamma}_{x_{d_x}} )$. Following the same strategy as in Section.~\ref{framework} and Section.~\ref{section_algorithm}, we can relax the programming (\ref{eqn_basic_bound_ace}) in our main text as follows. It is a natural extension of (\ref{re-formulation}) in Section.~\ref{framework}, by which we seek the valid bound of ACE: 

\begin{equation}
    \begin{aligned}
    \underline{ACE_{\bm{X}\rightarrow \bm{Y}}} {=} &\min \sum_{x}\int_{Y^{L}}^{Y^U} \pi(x) yf(y ,x) dy +  \sum_{x}\sum_{i=1}^{d} \psi_{i\mid x}^{o} \theta_{i\mid x} \omega_{i\mid x}\pi(x), \\
    &\text{~subject to~} 
    \forall x \in X, \psi_{i\mid x}^{o} \psi_{i\mid x} = 1 , 
    \bm{\phi_x} \in IR_{\bm{\Phi_x}} =  IR^{1}_{\bm{\Phi_x}} \cap  IR^2_{\bm{\Phi_x}}, \\
    \end{aligned}   \label{PI-SFP_ACE}
\end{equation}

where the set $IR^{1}_{\bm{\Phi_x}}$ is constructed as

\begin{equation}
\begin{aligned}
   &IR^{1}_{\bm{\Phi_x}} = \{\bm{\phi_x}: \left[\begin{matrix}
   -\bm{I_{d*d}} \\ \bm{I_{d*d}}
   \end{matrix}\right] \left[\begin{matrix}
   &(\int_{Y^L}^{Y^U} yf(y,\bm{W},x) dy )^{T} \\ &f(\bm{W},x)^{T} \\ &f(\bm{W},\neg x)^{T}
   \end{matrix} \right]^{T}  -  \left[ \begin{matrix}
   &-\overline{P(\bm{W}\mid \bm{U})} \\  &\underline{P(\bm{W} \mid \bm{U})}
   \end{matrix} \right] \bm{\phi_x} \geq \bm{0}\}.\\
   \end{aligned}
\end{equation}

$\bm{I_{d*d}}$ is the $d*d$ identity matrix. Moreover, the set $IR^{2}_{\bm{\Phi_x}}$ indicates the natural constraints by default:

\begin{equation}
    \begin{aligned}
    IR_{\bm{\Phi}_x}^{2} = \left\{ \bm{\phi}_x:
    \left[\begin{matrix}
    &\bm{1_{1*d}}^{} \bm{\theta_x} \\ &\bm{1_{1*d}}^{} \bm{\psi_x} \\ &\bm{1_{1*d}}^{} \bm{\omega_x} 
    \end{matrix}\right] = \left[ \begin{matrix}
    &\int^{Y^U}_{Y^L} yf(y,x)dy \\ &f(x)\\ &f(\neg x)
    \end{matrix}\right], \forall i,
\left\{\begin{matrix}
    \theta_{i\mid x} \in [0,f(y,x)] \\
    \psi_{i\mid x} \in (0,f(x)] \\
    \omega_{i\mid x} \in [0,f(\neg x)]
\end{matrix} \right\} \right\}    .
    \end{aligned}
\end{equation}
$\bm{1_{1*d}}$ is the $1*d$ all-ones vector. Then (\ref{re-formulation_linear_weaker}) in our main text is extended as
\begin{equation}
    \begin{aligned}
   &\min \sum_{x} \pi(x)\{[C_1^{\text{tan}}(\bm{\gamma}_{x}) - C_2^{\text{sec}}(\bm{\gamma}_{x})]\mathbbm{1}_{\pi(x)>0}+[C_1^{\text{sec}}(\bm{\gamma}_{x}) - C_2^{\text{tan}}(\bm{\gamma}_{x})]\mathbbm{1}_{\pi(x)<0}\},  \\
   &\text{~subject to~}D_{i}^{l}(\bm{\gamma}_{x}) \leq 1,  D_{i}^{u}(\bm{\gamma}_{x}) \geq 1, \forall i=1,2,...d, 
    \bm{\phi_x} \in IR_{\bm{\Phi_x}}.
    \end{aligned}
\end{equation}
Here the function $C_k^{\text{tan}}(\bm{\gamma}_{x}), C_k^{\text{sec}}(\bm{\gamma}_{x}), k=1,2, D_{i}^{l}(\bm{\gamma}_{x}), D_{i}^{u}(\bm{\gamma}_{x}), i=1,2,...d$ are all following (\ref{tan_sec}) in our main text. After this construction, we adopt the same simplicial partition strategy as in our main text.



\subsection{The proof of further discussions and extensions}\label{app_discussion}
This part is the supplement of the discussion in the main paper.
\subsubsection{Discussion 1: the proof of Lemma~\ref{lemma_no_assumption} and the justification of our Assumption~\ref{ass_partial_bounded}}\label{pro_lemma_no_assumption}

\textbf{}
For simplification, the notations $Y=y$, $x$ are abbreviated as $y$ and $x$, the notation $\neg x$ is abbreviated as $x^c$, and $d_w$ is written as $\mathscr{W}$. Similarly, we use $\bm{E_{i*i}}$ to denote the $i*i$ identity matrix, $\bm{J_{i*j}}$ to denote the $i*j$ all-ones matrix, and $\bm{0_{i*j}}$ to denote the $i*j$ all-zero matrix.

\begin{itemize}
\item \textbf{Conclusion 1:} The tight lower bound of ${f(Y_x = y)}$ is $f(y, x)$.
\end{itemize}

We divide it into two parts. On the one hand, if $\mathscr{W} \geq d$, $P(\bm{W} \mid \bm{U})$ can be constructed as follows.
\begin{equation}
    \begin{aligned}   P(\bm{W}\mid \bm{U}) = \left[
\begin{BMAT}{c.c}{c}
    \begin{matrix} 
    \overbrace{\bm{P_{11}}}^{m*m} \\
    {\overbrace{\bm{P_{21}}}^{(\mathscr{W}-m)*m}}
    \end{matrix} 
    & 
    \begin{matrix} 
    \overbrace{\bm{P_{12}}}^{(d-m)*(d-m)}
    \\   
    \overbrace{\bm{P_{22}}}^{(\mathscr{W}-d+m)*(d-m)}
    \end{matrix} \\
\end{BMAT}
\right]
,
    \end{aligned} \label{trivial_construction}
\end{equation}  
where $\bm{P_{11}}, \bm{P_{12}}, \bm{P_{21}}, \bm{P_{22}}$ are matrices whose upper brackets indicate their rows and columns~($m\in [1,d-1]$). Specifically,
\begin{equation}
    \begin{aligned}
&\bm{P_{11}} = {\sum\limits_{i=1}^{m}f(W=w_i \mid y,x) }  \bm{E_{m*m}},~\bm{P_{12}} =  {\sum\limits_{i=1}^{d-m} f(W=w_i\mid x^c) } \bm{E_{(d-m)*(d-m)}}, \\
&\bm{P_{21}} = { \left[ \begin{matrix}  {f(W=w_{m+1}\mid y,x)}\\
    ... \\
    {f(W=w_{\mathscr{{W}}}\mid y,x)}\\  
    \end{matrix}\right]}  \bm{J_{1*m}},~\bm{P_{22}} = {\left[ \begin{matrix}  {f(W=w_{d-m+1}\mid x^c)}  \\
    ...  \\ 
    {f(W=w_{\mathscr{W}}\mid x^c)} \end{matrix}\right]}   \bm{J_{1*(d-m)}}.
\end{aligned} 
\end{equation} 

There is a solution for $f(y,\bm{U},x)$, $f(\bm{U},x^c)$ respectively as
\begin{equation}
\begin{aligned}
     \frac{1}{\sum\limits_{i=1}^{m}f(W=w_i\mid y,x)}\left[
     \begin{matrix}
     {f(y,W=w_1,x) }\\
     ...\\
     {f(y,W=w_m,x)}\\
       \bm{0_{(d-m)*1}}
     \end{matrix} \right],~~ \frac{1}{\sum\limits_{i=1}^{{d-m}}P(W=w_i\mid x^c)} \left[
     \begin{matrix}
          \bm{0_{m*1}}\\
     {P(W=w_1,x^c)}{}\\
     ...\\
     {P(W=w_{d-m},x^c)}\\
     \end{matrix}\right].
    \end{aligned}
\end{equation}

Due to $f(y,\bm{U},x) \circ f(\bm{U},x^c) = 0$ and the condition $f(\bm{U},x)>\bm{0}$, we have
\begin{equation}
    \begin{aligned}
     f(Y_x = y) =  f(y,x)+\sum_{i=1}^d\frac{f(y,u_i,x)}{P(u_i,x)}P(u_i, x^c) = f(y,x).
    \end{aligned}
\end{equation}

On the other hand, if $ \mathscr{W} < d $, we make adjustments on \eqref{trivial_construction} ($m_1+m_2 \leq \mathscr{W}$):
\begin{equation}
    \begin{aligned} \left[
\begin{BMAT}{c.c.c}{c}
    \begin{matrix} 
    \overbrace{\bm{P_{11}}}^{m_1*m_1} \\
    \overbrace{\bm{P_{21}}}^{(\mathscr{W}-m_1)*m_1}
    \end{matrix} 
    & 
    \begin{matrix} 
    \overbrace{\bm{P_{12}}}^{m_2 * m_2}
    \\   
    \overbrace{\bm{P_{22}}}^{(\mathscr{W}-m_2) * m_2}
    \end{matrix} 
    &
    \overbrace{\bm{P_3}}^{\mathscr{W} * (d-m_1-m_2)}
\end{BMAT}
\right] ,
    \end{aligned} \label{trivial_construction_extend}
\end{equation}
Specifically,
\begin{equation}
    \begin{aligned}
&\bm{P_{11}} = {\sum\limits_{i=1}^{m_1} P(W=w_i\mid y,x) }  \bm{E_{m_1 * m_1}},~\bm{P_{12}} = {\sum\limits_{i=1}^{m_2} P(W=w_i\mid x^c) }{}\bm{E_{m_2*m_2}}. \\
&\bm{P_{21}} = {\left[ \begin{matrix}  {f(W=w_{m_1+1}\mid y,x)}\\
    ... \\
    {f(W=w_{\mathscr{W}}\mid y,x)}\\  
    \end{matrix}\right]}^{} \bm{J_{1*m_1}},~\bm{P_{22}} = {\left[ \begin{matrix}  {f(W=w_{m_2+1} \mid x^c)} \\
    ...  \\ 
    {f(W=w_{\mathscr{W}}\mid x^c)}  \end{matrix}\right]} \bm{J_{1*m_2}}.\\
    &\bm{P_3} = \frac{1}{\mathscr{W}} {\bm{J_{\mathscr{W} * (d-m_1-m_2)}}}.
\end{aligned} 
\end{equation} 

Analogously, there is a solution for $f(y,\bm{U},x)$, $f(\bm{U},x^c)$ as follows respectively:


\begin{equation}
\begin{aligned}
     \frac{1}{\sum\limits_{i=1}^{m_1}f(W=w_i\mid y,x)}\left[
     \begin{matrix}
     {f(y,W=w_1,x)}\\
     ...\\
     {f(y,W=w_{m_1},x)}\\
       \bm{0_{(d-m_1)*1}}
     \end{matrix} \right],~~  \frac{1}{\sum\limits_{i=1}^{m_2}P(W=w_i\mid x^c)}\left[
     \begin{matrix}
         \bm{0_{m_1*1}} \\
     {P(W=w_1, x^c)}\\
     ...\\
     {P(W=w_{m_2}, x^c)}\\
    \bm{0_{(d-m_1-m_2)*1}}
     \end{matrix}\right].
    \end{aligned}
\end{equation}



In this case, we also have $f(Y_x = y) =  f(y,x)+\sum\limits_{i=1}^d\frac{f(y,u_i,x)}{f(u_i,x)}f(u_i,x^c) = f(y,x)$. In conclusion, if no assumptions are imposed, we have $\min {f(Y_x = y)} = f(y, x)$. Proved.\\

\begin{itemize}
\item \textbf{Conclusion 2:} If $P(\bm{W} \mid \bm{U})$ is restricted to be left-reversible and $ f(\bm{W} \mid \neg x) \neq  f(\bm{W} \mid x,y)$, then the tight lower bound of ${f(Y_x = y)}$ is $f(y, x)$.
\end{itemize}

Without loss of generality, we can assume that $\exists i_0 \in \{d,d+1,...\mathscr{W}\}$ such that $f(W=w_{i_0} \mid \neg x) \neq f(W = w_{i_0} \mid x,y)$; otherwise we just need to relabel $\bm{W}$ in another order. 

On this basis, we still follow the construction~\eqref{trivial_construction} in the first part. The tight lower bound has already been proved to be $f(y,x)$, thus we only need to demonstrate that, with some choice of $m$, $P(\bm{W} \mid \bm{U})$ is left-reversible under the above assumption. In practice, we choose $m=d-1$. Then $P(\bm{W} \mid \bm{U})$ is reformulated as
\begin{equation}
\begin{aligned}
    \left[
\begin{BMAT}{c.c}{c}
    \begin{matrix} 
    \overbrace{\bm{P_{11}}}^{(d-1)*(d-1)} \\
    {\overbrace{\bm{P_{21}}}^{(\mathscr{W}-d+1)*(d-1)}}
    \end{matrix} 
    & 
    \begin{matrix} 
    \overbrace{\bm{P_{12}}}^{1*1}
    \\   
    \overbrace{\bm{P_{22}}}^{(\mathscr{W}-1)*1}
    \end{matrix} \\
\end{BMAT}
\right]
:=
\left[
\begin{BMAT}{c.c}{c}
    \begin{matrix}
     \sum\limits_{i=1}^{d-1}P(W=w_i\mid y,x) \bm{E_{(d-1)*(d-1)}} \\
    \left[\begin{matrix}
     {f(W=w_d\mid y,x)} \\{f(W=w_{d+1}\mid y,x)} 
     \\...
     \\ {f(W=w_{\mathscr{W}}\mid y,x)}  
    \end{matrix} \right] \bm{J_{1*(d-1)}}
\end{matrix}
    & \left[
    \begin{matrix} 
   {f(W=w_1\mid x^c)} \\
   {f(W=w_2\mid x^c)} \\
   ...\\
   {f(W=w_{\mathscr{W}}\mid x^c)}
    \end{matrix} \right] \\
\end{BMAT}
\right].
\end{aligned}\label{left_reversible_wu_construct}
\end{equation}

We make equivalent denotations:
\begin{equation}
    \begin{aligned}
\left[
\begin{BMAT}{c}{c}
    \begin{matrix} 
    \overbrace{\bm{P^{'}_{12}}}^{(d-1)*1}
    \\   
    \overbrace{\bm{P^{'}_{22}}}^{(\mathscr{W}-d+1)*1}
    \end{matrix} \\
\end{BMAT}
\right] := 
 \left[
\begin{BMAT}{c}{c}
    \begin{matrix} 
    \overbrace{\bm{P_{12}}}^{1*1}
    \\   
    \overbrace{\bm{P_{22}}}^{(\mathscr{W}-1)*1}
    \end{matrix} \\
\end{BMAT}
\right]
    \end{aligned}
\end{equation}


In the following part, we claim that we only need to prove $\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}} \neq \bm{0}$. We apply the following block row transformation:
\begin{equation}
    \begin{aligned}
    \left[ \begin{matrix}
    \bm{E_{(d-1)*(d-1)}}& \bm{0_{(d-1)*(\mathscr{W}-d+1)}} \\ \bm{-P_{21} P_{11}^{-1}}& \bm{E_{(\mathscr{W}-d+1)*(\mathscr{W}-d+1)}}
    \end{matrix} 
     \right] *  \left[ \begin{matrix}
   \bm{P_{11}} & \bm{P^{'}_{12}} \\ \bm{P_{21}} & \bm{P^{'}_{22}}
    \end{matrix} 
     \right]  =  \left[ \begin{matrix}
   \bm{P_{11}} & \bm{P^{'}_{12}} \\ \bm{0_{(\mathscr{W}-d+1)*(d-1)}} & \bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}}
    \end{matrix} 
     \right].
    \end{aligned}
\end{equation}

According to the well-known Sylvester’s inequality~\citep{matsaglia1974equalities}: $\forall$$\bm{A_{m*n}},\bm{B_{n*p}}$, we have $\min \{rank(\bm{A}), rank(\bm{B})\} \geq rank(\bm{AB}) \geq rank(\bm{A})+rank(\bm{B})-n$. Then we have
\begin{equation}
    \begin{aligned}
        rank \left(\left[ \begin{matrix}
   \bm{P_{11}} & \bm{P^{'}_{12}} \\ \bm{P_{21}} & \bm{P^{'}_{22}}
    \end{matrix} 
     \right]\right) = rank\left(\left[ \begin{matrix}
   \bm{P_{11}} & \bm{P^{'}_{12}} \\ \bm{0_{(\mathscr{W}-d+1)*(d-1)}} & \bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}}
    \end{matrix} 
     \right]\right).
    \end{aligned}
\end{equation}

If $\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}} = \bm{0_{(\mathscr{W}-d+1)*1}}$, then the rank on the right-hand side will be equal to $rank(\left[\bm{P_{11}},\bm{P_{12}^{'}}\right]) = d-1 <d$. On the other hand, if $\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}} \neq \bm{0_{(\mathscr{W}-d+1)*1}}$, then it will be $d$ (full column rank). In conclusion, to demonstrate the left-reversibility of $P(\bm{W}\mid\bm{U})$, $\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}} \neq \bm{0}$ is all we need.

If we use $[\cdot]_{(i)}$ to denote the element of a vector indexed by $i$, $i=d,...\mathscr{W}$, then
\begin{equation}
    {[\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}}]}_{(i)} = {\sum\limits_{k=1}^{d-1}f(W=w_k\mid x^c) }  \left[\frac{f(W=w_i,x^c)}{\sum\limits_{k=1}^{d-1}f(W=w_k,x^c)} - \frac{f(y,W=w_i,x)}{\sum\limits_{k=1}^{d-1}f(y,W=w_k,x)}\right]. \label{each_element}
\end{equation}
We argue by contradiction. If we have ${\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}}} = \bm{0_{(\mathscr{W}-d+1)*1}}$, then 
\begin{equation}
\begin{aligned}
\|{\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}}}\|_1 &= {\sum\limits_{i=1}^{d-1}f(W=w_i\mid x^c) } \left[ \frac{\sum\limits_{i=d}^{\mathscr{W}}f(W=w_i,x^c)}{\sum\limits_{i=1}^{d-1}f(W=w_i,x^c)} - \frac{\sum\limits_{i=d}^{\mathscr{W}}f(y,W=w_i,x)}{\sum\limits_{i=1}^{d-1}f(y,W=w_i,x)}\right] \\
&= {\sum\limits_{i=1}^{d-1}f(W=w_i\mid x^c) } \left[ \frac{f(x^c)}{\sum\limits_{i=1}^{d-1}f(W=w_i,x^c)} - \frac{f(y,x)}{\sum\limits_{i=1}^{d-1}f(y,W=w_i,x)}\right]\\
&={\sum\limits_{i=1}^{d-1}f(W=w_i\mid x^c) }\left[\frac{1}{\sum\limits_{i=1}^{d-1}f(W=w_i \mid x^c)} - \frac{1}{\sum\limits_{i=1}^{d-1}f(W=w_i \mid y,x)}\right] = 0.
\end{aligned}
\end{equation}
Thus we have $\sum\limits_{i=1}^{d-1} f(W=w_i \mid x^c) = \sum\limits_{i=1}^{d-1} f(W=w_i \mid y,x)$. Then we substitute it into Eqn~\eqref{each_element}, we have
\begin{equation}
    \begin{aligned}
        f(W=w_i \mid x^c) - f(W=w_i \mid x,y) = 0, \forall i\in\{d,...\mathscr{W}\}.
    \end{aligned}
\end{equation}
Contradiction! Hence we have $ {\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}}} \neq \bm{0_{(\mathscr{W}-d+1)*1}}$, and then $P(\bm{W} \mid \bm{U})$ in construction~\eqref{left_reversible_wu_construct} is left-reversible. Proved.

\begin{itemize}
\item \textbf{Conclusion 3:} If $P(\bm{W} \mid \bm{U})$ is restricted to be left-reversible and $ f(\bm{W} \mid \neg x) =  f(\bm{W} \mid x,y)$, then the tight lower bound of ${f(Y_x = y)}$ is $f(y\mid x)$.
\end{itemize}
If this assumption holds, we will have $\bm{P^{'}_{22} - P_{21} P_{11}^{-1} P^{'}_{12}} = \bm{0}$ in the above construction, thus $P(\bm{W}\mid \bm{U})$ will not be left-reversible, which violates the condition here. Hence we need another way. 

According to the left-reversibility of $P(\bm{W} \mid \bm{U})$, we have
\begin{equation}
    \begin{aligned}
      f(\bm{U}\mid x,y) = P(\bm{W} \mid \bm{U})^{-1} f(\bm{W}\mid  x,y) = P(\bm{W} \mid \bm{U})^{-1} f(\bm{W}\mid x^c) = f(\bm{U} \mid x^c)
    \end{aligned}\label{totally_equal}
\end{equation}
Then we have
\begin{equation}
    \begin{aligned}
        f(Y_x = y) &= f(x,y) + \sum_{i=1}^{d}\frac{f(x,y,u_i)}{f(x,u_i)}f(u_i,x^c) \\
        &= f(x,y) + f(x,y)f(x^c) \sum_{i=1}^d \frac{f(u_i \mid x,y)}{f(x, u_i)} f(u_i \mid x^c)\\
        &= f(x,y) + f(x,y)f(x^c) \sum_{i=1}^d \frac{f(u_i \mid x,y)^2}{f(x, u_i)}\\
        & \overset{*}{\geq} f(x,y) + f(x,y)f(x^c) \frac{(\sum_{i=1}^d f(u_i\mid x,y))^2}{\sum_{i=1}^d f(x,u_i)}\\
        & = f(x,y)\left(1+\frac{f(x^c)}{f(x)}\right)\\
        & = f(y \mid x). 
    \end{aligned}
\end{equation}
According to Cauchy's inequality, the $'\geq'$ ($*$) becomes $'='$ if and only if $f(\bm{U} \mid x,y) = f(\bm{U} \mid x)$. Combining with Eqn~\eqref{totally_equal}, we have $f(\bm{U} \mid x,y) = f(\bm{U} \mid x) = f(\bm{U} \mid x^c) = f(\bm{U})$. It holds if and only if $f(\bm{W} \mid x,y) = f(\bm{W} \mid x) = f(\bm{W} \mid x^c) = f(\bm{W})$, or else the lower bound is not tight.

\noindent \textbf{Generalisability}~Our PI-SFP approach's generalizability can be highlighted in two ways: 1) PI-SFP can handle cases where either reversibility or total observability, or both, do not exist, which renders the literature on single-proxy control ineffective. 2) Extending PI-SFP to incorporate negative control (as shown in Fig.\ref{Fig.sub.2} and Fig.\ref{Fig.sub.3}) is an optional add-on and not a necessity. This simplicity eliminates the need for numerous assumptions, such as completeness and bridge function, which are present in previous double negative control literature~\citep{miao2018identifying, cui2020semiparametric, tchetgen2020introduction, deaner2018proxy,shi2020multiply,singh2020kernel, nagasawa2018identification,kallus2021causal}. The negative control extension to Fig.~\ref{Fig.sub.2}-\ref{Fig.sub.3} will be discussed in the next subsection.

\noindent \textbf{Verifiability} The feasibility of Assumption~\ref{ass_partial_bounded} has been suggested in previous work. Kuroki et al.~\cite{kuroki2014measurement} suggested that the bounds $\underline{P(\bm{W}\mid \bm{U})}$ and $\overline{P(\bm{W}\mid \bm{U})}$ can be determined a priori through the Bayesian strategy \citep{greenland2005multiple} and some re-calibration methods \citep{rothman2008modern, selen1986adjusting}. In their ``Head Start Program,'' they provide a detailed estimation of $P(\bm{W}\mid \bm{U})$ to support this claim.

\noindent \textbf{Practical correspondence}

\begin{itemize}
    \item Some general cases (just conduct sampling upon $\bm{U}$): This hypothesis is commonly encountered in real situations, with Kuroki, Judea Pearl~\cite{kuroki2014measurement} (page 4) and Li, Judea Pearl~\cite{li2022bounds} specifically illustrating how to sample $U$ to infer approximate/bounded estimates of $P(\bm{W}\mid \bm{U})$, and mentioning such sampling method has been previously and commonly used, such as fundamental work~\cite{greenland2005multiple, rothman2008modern, carroll2006measurement, selen1986adjusting}.
    \item Concrete example 1 for $P(\bm{W}\mid \bm{U})$ (recommendation system: $\bm{W}$ denotes the popularity of $\bm{U}$): There are also some more practical examples in our life. For instance, in the context of recommendation systems, a significant amount of work uses the representation of product-user features as a confounder $\bm{U}$. However, this representation often includes sensitive information, leading to incomplete observations of $\bm{U}$. Building upon this, the popularity ranking of products in different regions and time periods is publicly available information, which can be used as a proxy variable $\bm{W}$ for products. By analyzing the different purchasing tendencies of various demographic groups, we can obtain upper and lower bounds estimates for the transition matrix $P(\bm{W}\mid \bm{U})$. In this scenario, firstly, $P(\bm{W}\mid \bm{U})$ is often irreversible because popularity ranking information itself can be seen as an indicator/projection, and the information it carries is not as rich as the product features. Secondly, we typically can only estimate the upper and lower bounds of $P(\bm{W}\mid \bm{U})$ (through methods like Bayesian estimation), as the sampling estimation process for $\bm{U}$ is likely to be biased.
    \item Concrete example 2 for $P(\bm{W}\mid \bm{U})$ (privacy protection: $\bm{W}$ is the de-identified $\bm{U}$): Our PI-SFP framework is also related to privacy-protecting scenario. Survey collectors often need to gather some sensitive information $\bm{U}$. To obtain more accurate responses and avoid the risk of disclosing personal privacy, they often ask survey respondents to answer some yes-or-no questions using the Randomized Response method. We consider the survey results as $\bm{W}$, with the following steps: First, the survey respondent flips a coin (with equal probability of heads or tails), and only they know the result. If it lands heads, they answer the question truthfully; if it lands tails, they flip the coin again (with only them knowing the result); if the second toss is headed, they answer “Yes”; if it is tails, they answer “No”. With this setup, even if we do not know $\bm{U}$, we can deduce $P(W)$ and $P(\bm{W}|\bm{U})=[3/4,1/4;1/4,3/4]$. Of course, this simple setup may still expose other sensitive information, such as the joint distribution of $P(\bm{Y},\bm{U},\bm{X})$, which is something the government would not want to see or make public. Therefore, in practical use, for highly confidential information which requires the strongest privacy protection, we tend to develop a more complex/dynamic/irreversible $P(\bm{W}|\bm{U})$ (i.e., privacy-protecting algorithms) and manually set its upper and lower bounds.
\end{itemize}







\subsubsection{Discussion 2: algorithm comparison and acceleration} \label{pre_train}



In this section, we discuss two additional optimization methods that could potentially solve our partial observability problem. We subsequently illustrate the superior performance of PI-SFP compared to these methods. Furthermore, we introduce a novel pruning strategy supported by a local optimization method, which accelerates the optimization process.

\noindent\textbf{Algorithm comparison} The author of \cite{shen2017solving} derived an $\varepsilon$-approximation method that can be utilized in our problem, and the outcome will lie within $[\underline{f(Y_x = y)}, (1+\varepsilon)\underline{f(Y_x = y)}]$. However, this algorithm exhibits an exponential time complexity for the dimension $d_u$, making it challenging to operate effectively in high-dimensional confoundings. Additionally, an iterative algorithm was developed in \cite{le2014dc} to search the Karush-Kuhn-Tucker (KKT) point of the difference-in-convex (DC) problem, which can be applied to~\eqref{re-formulation}. Nevertheless, KKT theory cannot guarantee global optimality compared to our PI-SFP.

\noindent\textbf{Algorithm acceleration} To expedite the PI-SFP process, we propose setting sufficient criteria to evaluate whether the current partition contains the optimal solution. If the criteria are not met, we can delete the branch online and narrow our search. To this end, we propose a new auxiliary algorithm specifically designed to search for the local minimum of $f(Y_x = y)$, which serves as an upper-bound of $\underline{f(Y_x = y)}$. We achieve this by implementing the algorithm in the sub-simplex $S$. Specifically, if the optimal value $\underline{\underline{f_S(Y_x = y)}}$ is even larger than the local minimum, then it will be larger than $\underline{f(Y_x = y)}$. Therefore, we claim that this partition must not include the optimal solutions and can be removed permanently. The auxiliary local optimization algorithm is provided as follows. The principle of our algorithm is based on the lemma:
\begin{lemma}
$\forall i,j$, if we make adjustment:
\begin{equation}{
    \begin{aligned}\left[
     \begin{matrix}
     &\breve{\theta}_i & \breve{\theta}_j \\
     &\breve{\psi}_i & \breve{\psi}_j \\
     &\breve{\omega}_i & \breve{\omega}_j
     \end{matrix}
     \right]
     =  \left[
     \begin{matrix}
     &\theta_i & \theta_j \\
     &\psi_i & \psi_j \\
     &\omega_i & \omega_j
     \end{matrix}
     \right]
     \left[
     \begin{matrix}
     &\alpha &1-\alpha \\
     &1-\alpha & \alpha 
     \end{matrix} \right],
     \text{~where~} \alpha \in \begin{cases}
 (0,1] \text{~if~} (\frac{\theta_{i}}{\theta_j}-\frac{\psi_{i}}{\psi_{j}})(\frac{\omega_{i}}{\omega_{j}}-\frac{\psi_{i}}{\psi_{j}}) \geq 0, \\ {[}1,+\infty) \text{~if~} (\frac{\theta_{i}}{\theta_j}-\frac{\psi_{i}}{\psi_{j}})(\frac{\omega_{i}}{\omega_{j}}-\frac{\psi_{i}}{\psi_{j}}) \leq 0.
 \end{cases}
    \end{aligned}}
\end{equation}
Then we have
\begin{equation}
    \begin{aligned}
     \sum_{m=i,j}\frac{\breve{\theta}_m}{\breve{\psi}_m}\breve{\omega}_m  \leq
     \sum_{m=i,j}\frac{{\theta}_m}{{\psi}_m}{\omega}_m.
    \end{aligned}
\end{equation}\label{adjustment}

\end{lemma}

\textbf{}
We consider the case $\alpha \in (0,1)$, and the second case is symmetric. Due to $(\frac{\theta_i}{\theta_{j}}-\frac{\psi_{i}}{\psi_{j}}) (\frac{\omega_{i}}{\omega_{j}}-\frac{\psi_{i}}{\psi_{j}}) \geq 0$, we have
\begin{equation}
    \begin{aligned}
        (\theta_i - \theta_j)(\omega_j - \omega_i)\psi_i \psi_j + (\theta_j \omega_j \psi_i - \theta_i \omega_i \psi_j)(\psi_j - \psi_i) \leq 0.
    \end{aligned}\label{original_condition}
\end{equation}
If we denote that 
\begin{equation}
    \begin{aligned}
        Q_{ij} := \alpha \psi_i \psi_j (\theta_i - \theta_j) (\omega_j - \omega_i)
        \left[ (1-\alpha)\psi_i + \alpha \psi_j \right] + (\theta_j \omega_j \psi_i - \theta_i \omega_i \psi_j)  \left[ (1-\alpha)\psi_i + \alpha \psi_j \right] \psi_j.
    \end{aligned}
\end{equation}
Then Eqn~\eqref{original_condition} is equivalent to 
\begin{equation}
    \begin{aligned}
        Q_{ij}  \leq - Q_{ji}. 
    \end{aligned}\label{leq_q}
\end{equation}
Furthermore, we find 
\begin{equation}
    \begin{aligned}
        Q_{ij} &= \psi_j \breve{\psi}_j \left[ \alpha \psi_i (\theta_i - \theta_j)(\omega_j - \omega_i) + \theta_j \omega_j \psi_i - \theta_i \omega_i \psi_j\right]= \frac{1}{1-\alpha} \psi_j \breve{\psi}_j \left[\breve{\theta}_i \breve{\omega}_i \psi_i - \theta_i \omega_i \breve{\psi}_i \right].
    \end{aligned}
\end{equation}
Hence Eqn~\eqref{leq_q} can be transformed as 
\begin{equation}
    \begin{aligned}
        \psi_j \breve{\psi}_j \left[\breve{\theta}_i \breve{\omega}_i \psi_i - \theta_i \omega_i \breve{\psi}_i\right] \leq - \psi_i \breve{\psi}_i \left[\breve{\theta}_j \breve{\omega}_j \psi_j - \theta_j \omega_j \breve{\psi}_j\right].
    \end{aligned}
\end{equation}
Hence 
\begin{equation}
    \begin{aligned}
        \sum_{m=i,j}\frac{\breve{\theta}_m}{\breve{\psi}_m}\breve{\omega}_m  \leq
     \sum_{m=i,j}\frac{{\theta}_m}{{\psi}_m}{\omega}_m.
    \end{aligned}
\end{equation}

This completes the proof. When applying this strategy, $\alpha$ should be chosen so that $\bm{\phi} \in IR_{\bm{\Phi}}$, namely $f(y,\bm{W},\bm{U},\bm{X})\in \mathcal{\widetilde{F}}$. 

\quad

%接下来任务

% 1 对P（W|U）的假设转移到P（U）的假设上来                               (OK)
% 2 补充工具变量的描述                                                   (OK)
% 3 补充对假设的验证方案,(more CONCRETE)                                              
% 4 待定：补充对连续域的思考 (注意看那篇新论文以及变分手段)                                             
% 5 补充从f(y \mid do(x))到E(Y)上的乃至general case的讨论                (OK)

\iffalse
\subsection{Discussion 4: the proof of lemma.~\eqref{exposure_bound} }\label{pro_exposure_bound}
\textbf{}
The left-reverse is denoted as $P^{*}(\bm{W}\mid \bm{U})$. First, we notice that

\begin{equation}
\begin{aligned}
   P(y \mid Z=z,x) &= [P(y \mid \bm{U}, x)]P(\bm{U}\mid Z=z,x) \\
      P(Y_x = y) &= [ P(y \mid \bm{U},x)]P(\bm{U}).
     \end{aligned}\label{basic_equation}
\end{equation}

On this basis, note that all the above variables is observed instead of $f(Y_x = y)$, we only need to estimate two parts: 1) the distance between $ P(\bm{U} \mid Z=z,x)$ and $P(\bm{U}) $, 2) the bound of $P(y \mid \bm{U}, x)$.

{\textbf{The distance between} $ P(\bm{U} \mid Z=z,x)$ and $P(\bm{U} )$:}

\begin{equation}
    \begin{aligned}
     &\| \int_{z \in Z} g(z) P(\bm{U} \mid Z=z,x) -P(\bm{U}) dz \|_1 \\ =& \| P^{*}(\bm{W} \mid \bm{U}) [\int_{z \in Z} g(z) P(\bm{W} \mid Z=z,x) - P(\bm{W})] dz \|_1 \\
     = & O(M * \gamma) 
    \end{aligned}
\end{equation}
where $*$ means the left-inverse matrix, and $\gamma =  \|P(\bm W)-\int_{z \in Z} \Widehat{g(z)} P(\bm W\mid Z=z,x)\|_1$ is usually sufficiently small in practice. 

{\textbf{The estimation of} $[P(y \mid \bm{U}, x)]$:}

\begin{equation}
    \begin{aligned}
    \|P(y \mid \bm{U}, x)\|_1 
    = O(\| P(y \mid \bm{W}, x) \|_1 *  M), \\
    \end{aligned}
\end{equation}





In conclusion, we have :
\begin{equation}
    \begin{aligned}
    &\Widehat{f(Y_x = y)} - f(Y_x = y) \\
    =& \int_{z \in Z} g(z)[P(y \mid Z=z,x) ]  dz -f(Y_x = y)\\
    =& P(y \mid \bm{U}, x) (\int_{z \in Z}g(z)P(\bm{U}\mid Z=z,z)dz - P(\bm{U})) \\
    = & O(\gamma)
    \end{aligned}
\end{equation}
\quad
\fi
\subsubsection{Discussion 3: Extension to Fig.~\ref{Fig.sub.2} and \ref{Fig.sub.3}}\label{diss_graph}




\noindent \textbf{Fig.~\ref{Fig.sub.1}} Our algorithm PI-SFP mainly focuses on Fig.~\ref{Fig.sub.1}. Moreover, if the edge $\bm{W} \rightarrow \bm{Y}$ is added, the optimization problem is transformed as follows under Assumption~\ref{ass_partial_bounded}.
%  \begin{equation}
%     \begin{aligned}
%     &  \min f(y \mid \bm{U},\bm{X} = x)f(\bm{U}),\\ &\text{~subject to~}
%      f(y \mid \bm{U},\bm{X} = x)f(\bm{U}\mid \bm{X} = x) = f(y \mid x),~\text{where~}f(\bm{U} \mid \bm{X}=x) \text{~satisfies}\\
%     & \left[\begin{matrix}
%     \overline{f(\bm{W} \mid \bm{U})}f(\bm{U} \mid \bm{X}=x) - f(\bm{W} \mid \bm{X} = x)\\ f(\bm{W} \mid \bm{X} = x) - \underline{f(\bm{W} \mid \bm{U})}f(\bm{U} \mid \bm{X}=x)
%     \end{matrix} \right] \geq 0, 
%     \left[\begin{matrix}
%     \overline{f(\bm{W} \mid \bm{U})}f(\bm{U}) - f(\bm{W}) \\ f(\bm{W}) - \underline{f(\bm{W} \mid \bm{U})}f(\bm{U})
%     \end{matrix}\right] \geq 0.
%     \end{aligned}  \label{wy_formulation}
% \end{equation}

{
 \begin{equation}
    \begin{aligned}
    &  \min f(y \mid \bm{U},\bm{X} = x)f(\bm{U}),\\ &\text{~subject to~}
     f(y \mid \bm{U},\bm{X} = x)f(\bm{U}\mid \bm{X} = x) = f(y \mid x),\\&\text{where~}f(\bm{U} \mid \bm{X}=x) \text{~satisfies}\\
    & \left[\begin{matrix}
    \overline{f(\bm{W} \mid \bm{U})}f(\bm{U} \mid \bm{X}=x) - f(\bm{W} \mid \bm{X} = x)\\ f(\bm{W} \mid \bm{X} = x) - \underline{f(\bm{W} \mid \bm{U})}f(\bm{U} \mid \bm{X}=x)
    \end{matrix} \right] \geq 0, \\
    &\left[\begin{matrix}
    \overline{f(\bm{W} \mid \bm{U})}f(\bm{U}) - f(\bm{W}) \\ f(\bm{W}) - \underline{f(\bm{W} \mid \bm{U})}f(\bm{U})
    \end{matrix}\right] \geq 0.
    \end{aligned}  \label{wy_formulation}
\end{equation}
}
Notice that the feasible region of $f(y \mid \bm{U},\bm{X} = x)$ and $f(\bm{U})$ is even more irregular than in \eqref{re-formulation}. Nevertheless, we can still adopt an analogous strategy to PI-SFP to approximate its optimal value. This remains a topic for future research.


\noindent \textbf{Fig.~\ref{Fig.sub.2} and \ref{Fig.sub.3}} The double negative control via introducing auxiliary exposure $\bm{Z}$ can enhance our estimation. Due to the fact $f(y\mid u,x) = f(y \mid u,x, Z)$, we have:
% \begin{equation}
%     \begin{aligned}
%      \underline{f(Y_x = y)} := f(y,x)  + \max_{\mathcal{Z}\subseteq \bm{Z}}\min_{f(y,\bm{W},\bm{U}, \bm{X}, \mathcal{Z}) \in \mathcal{\widetilde{F} }_{Z}}  \sum_{u=1}^{d} \frac{f(y,u_i,x, z \in \mathcal{Z}) f(u_i, \neg x)}{f(u_i,x,z \in \mathcal{Z})}.
%     \end{aligned}
% \end{equation}
{
\begin{equation}
    \begin{aligned}
     &\underline{f(Y_x = y)} := f(y,x)  + \max_{\mathcal{Z}\subseteq \bm{Z}}\\&\min_{f(y,\bm{W},\bm{U}, \bm{X}, \mathcal{Z}) \in \mathcal{\widetilde{F} }_{Z}} \!\! \sum_{i=1}^{d}  \!\! \frac{f(y,u_i,x, z \in \mathcal{Z}) f(u_i, \neg x)}{f(u_i,x,z \in \mathcal{Z})}.
    \end{aligned}
\end{equation}
}
The feasible region $\mathcal{\widetilde{F} }_{Z}$ of $f(y,\bm{W},\bm{U}, \bm{X}, \mathcal{Z})$ is constructed analogously to $\mathcal{\widetilde{F} }$. Specifically, for each subset $\mathcal{Z} \subseteq \bm{Z}$, we can apply PI-SFP and select the maximum of the resulting values as the best lower bound $\underline{f(Y_x = y)}$. We will conduct a more detailed analysis in our subsequent work, particularly in comparison to the performance of the single-proxy control.

\iffalse
In addition, we illustrate how the reversibility assumption can enhance the estimation of $f(Y_x = y)$. The original 'partial observability' assumption~\ref{ass_partial_bounded} is slightly upgraded as follows.

\begin{assumption}
The transition operator $P(\bm{W}\mid \bm{U})$ is left-reversible which is denotes as $P^{*}(\bm{W}\mid \bm{U})$. Moreover, $\|(P^{*}(\bm{W}\mid\bm{U}))\| \leq M$, $M$ is a constant.  \label{left_inverse_bounded}
\end{assumption}

$M$ can be achieved by the partial boundedness of $\overline{P(\bm{W}\mid \bm{U})}$ and $\underline{P(\bm{W}\mid \bm{U})}$. We can also refer to some original technique such as \citep{VARAH19753} to make the upper bound $P(\bm{W}\mid \bm{U})^{*}$ as $M$. Then we introduce our new estimation result in the following lemma:
\begin{lemma}
{We construct the estimation of $f(Y_x=y)$ as  
\begin{equation}
    \Widehat {f(Y_x=y)} = \int_{z \in Z}\Widehat {g(z)}[f(y\mid Z=z,x)]dz,  
\end{equation}
where $\Widehat {g(z)} = \arg\min\limits_{g(z)} \|P(\bm W)-\int_{z\in Z} {g}(z)P(\bm W\mid Z=z,x)\|_1 $. } Then the estimation error can be bounded by
\begin{equation}
    \begin{aligned}
  \mid  \Widehat {f(Y_x=y)} - f(Y_x=y) \mid \leq M^2 \Widehat {g(z)}. \label{negative_exposure_estimation}
    \end{aligned}
\end{equation}

\label{exposure_bound}
\end{lemma}


The proof is in Appendix~\ref{pro_exposure_bound}.

To achieve the result in lemma.~\eqref{exposure_bound}, we only need to replace Assumption.~\ref{ass_partial_bounded} with Assumption.~\ref{left_inverse_bounded}. On the one hand, it's only a subset of reversibility assumptions in \citep{shi2020multiply,miao2018identifying}. On the other hand, it doesn't need those 'bridge function' assumptions in~\citep{miao2018identifying, shi2020multiply, singh2020kernel, kallus2021causal, cui2020semiparametric, tchetgen2020introduction, deaner2018proxy}.



This estimation result is independent of our main algorithm's by negative outcome control. Which of these two algorithms is better depends on the size of $\gamma$. On the other hand, we will achieve a stronger result if we combine 1) main algorithm and 2) negative exposure control in Eqn~\eqref{negative_exposure_estimation}. In fact, the combining estimation on ACE performs better than just taking the intersection of the two after computing ACE separately.
\fi

\iffalse
In another perspective, our main estimation result can test the existence of $\bm{U} \rightarrow \bm{Z} $, i.e., we provide evidence of whether exposure control $Z$ are valid instruments or not. The result is summarized as follows.

\begin{table*}[h]
\begin{floatrow}
\capbtabbox{
\resizebox{0.48\textwidth}{13mm}{
 \begin{tabular}{ll}
 \toprule  Observed    & Estimation \\    \midrule  $f(y,W,x) $ & $f(y,U,x)$  \\   $f(W,x)$   & $f(U,x)$ \\ $f(W,\neg x)$     &  $f(U,\neg x)$ \\ \midrule \multicolumn{2}{c}{$f(Y_x = y) = f(x,y)+ \sum\limits_{u}\frac{f(y,U=u,x)P(U=u,\neg x)}{P(U=u,x)}$} \\   \bottomrule 
\end{tabular}}
}{
 \caption{Estimation of $f(Y_x = y)$ through $P(\bm{W} \mid \bm{U})$.}

}
\capbtabbox{
\resizebox{0.48\textwidth}{13mm}{
 \begin{tabular}{ll}
 \toprule  Observed    & Estimation \\    \midrule  $f(y,W,x) $ & $f(y,U,x)$  \\   $f(W,x)$   & $f(U,x)$ \\ $f(W \mid Z=z)$     &  $f(U\mid Z= z)$ \\ \midrule \multicolumn{2}{c}{$f(Y_x = y\mid Z=z) = \sum\limits_{u}\frac{f(y,U=u,x)P(U=u \mid Z = z)}{P(U=u,x)}$} \\   \bottomrule 
\end{tabular}}
}{
 \caption{Estimation of $f(Y_x = y\mid Z=z)$ through $P(\bm{W}\mid \bm{U})$.}
}
\end{floatrow}

\end{table*} \label{test_IV}
In Table.~\ref{test_IV} (left) and Table.~\ref{test_IV} (right), we do the estimation by the same algorithm as above. Each estimation can be derived by the corresponding observed by $P(\bm{W} \mid \bm{U})$. We denote the estimation result of $f(Y_x = y\mid Z=z)$ as $[ \underline{f(Y_x = y\mid Z=z)}, \overline{f(Y_x = y\mid Z=z)}]$. If the instruments $Z$ are valid, we have $f(Y_x = y) = f(Y_x=y\mid Z=z)$. Thus the following lemma holds:
\begin{corollary}{\textbf{(instrument variable validation)}}
If $\exists y, x$, s.t.,
\begin{equation}
    [\underline{f(Y_x=y )}, \overline{f(Y_x=y )}] 
    \cap \{ \cap[\underline{f(Y_x = y\mid Z=z)}, \overline{f(Y_x = y\mid Z=z)}]\}_{z\in Z} = \varnothing, 
\end{equation}
then the instruments $Z$ are invalid.\zh{This part should rewrite.}
\end{corollary}
\fi

$\mathcal{\widetilde{F}}_{Z}$ is identified as follows:

We denote
\begin{equation}
    \begin{aligned}
    \begin{matrix}
    ~~~~~~~~~~~~~~~~\theta_i = f(y,u_i,x,z\in \mathcal{Z})\\
    ~~~~~~~~~~~~\psi_i = f(u_i,x,z\in \mathcal{Z})\\
    \omega_i = f(u_i,\neg x)\\
    \end{matrix},~
    \begin{matrix}
    \bm{\theta_{\mathcal{Z}}} &= (\theta_1, \theta_2,...\theta_d)^T\\
    \bm{\psi_{\mathcal{Z}}} &= (\psi_1, \psi_2,...\psi_d)^T\\
    \bm{\omega_{\mathcal{Z}}} &= (\omega_1, \omega_2,...\omega_d)^T\\
    \end{matrix},
    ~\bm{\phi_{\mathcal{Z}}} = \left(\begin{matrix}
    \bm{\theta_{\mathcal{Z}}}~  \bm{\psi_{\mathcal{Z}}}~ \bm{\omega_{\mathcal{Z}}}
    \end{matrix} \right).
    \end{aligned}
\end{equation}


where $f(y,\bm{W}, \bm{U}, \bm{X},\mathcal{Z}) \in \mathcal{\widetilde{F}}_{Z} = \{\bm{\phi}_{\mathcal{Z}} \in IR_{{Z}}, IR_{Z} = IR^1_{Z} \cap IR^2_{Z}\}$ leads to the following constraints that we really use:

\begin{equation}
\begin{aligned}
   &IR^{1}_{{Z}} = \{\bm{\phi}_{Z}: \left[\begin{matrix}
   -\bm{I_{d*d}} \\ \bm{I_{d*d}}
   \end{matrix}\right] \left[\begin{matrix}
   &f(y,\bm{W},x,z \in \mathcal{Z})^{T} \\ &f(\bm{W},x,z \in \mathcal{Z})^{T} \\ &f(\bm{W},\neg x,z \in \mathcal{Z})^{T}
   \end{matrix} \right]^{T}  -  \left[ \begin{matrix}
   &-\overline{P(\bm{W}\mid \bm{U})} \\  &\underline{P(\bm{W} \mid \bm{U})}
   \end{matrix} \right] \bm{\phi}_{\mathcal{Z}} \geq \bm{0}\}.\\
   \end{aligned}
\end{equation}

Moreover, the set $IR^{2}_{Z}$ encodes the natural constraints that hold by default:

\begin{equation}
    \begin{aligned}
    IR_{Z}^{2} = \left\{ \bm{\phi}_Z:
    \left[\begin{matrix}
    &\bm{1}^{T} \bm{\theta} \\ &\bm{1}^{T} \bm{\psi} \\ &\bm{1}^{T} \bm{\omega} 
    \end{matrix}\right] = \left[ \begin{matrix}
    &f(y,x,z \in \mathcal{Z})\\ &f(x,z \in \mathcal{Z})\\ &f(\neg x,z \in \mathcal{Z})
    \end{matrix}\right], \forall i,
\left\{\begin{matrix}
    \theta_i \in [0,f(y,x,z \in \mathcal{Z})] \\
    \psi_i \in (0,f(x,z \in \mathcal{Z})] \\
    \omega_i \in [0,f(\neg x,z \in \mathcal{Z})]
\end{matrix} \right\} \right\}.
    \end{aligned}
\end{equation}









\subsubsection{Discussion 4: the continuous confoundings}\label{proof_continuous}


\begin{assumption}{(partial observability assumption for continuous confoundings)} $P(\bm{W} \mid \bm{U}) \in \mathscr{P}$, where $ \mathscr{P} = \{{P^{}(\bm{W} \mid \bm{U})}: \left[\begin{matrix} \overline{P^{}(\bm{W} \mid \bm{U} \in [u_{i-1}, u_{i}])} - {P^{}(\bm{W} \mid \bm{U} \in [u_{i-1}, u_{i}])} \\
 {P^{}(\bm{W} \mid \bm{U} \in [u_{i-1}, u_{i}])} - \underline{P^{}(\bm{W} \mid \bm{U} \in [u_{i-1}, u_{i}])} \end{matrix}\right]\text{is~ non-negative}\}$.

\label{ass_partial_bounded_continous}

\end{assumption}

\begin{assumption}{(Lipschitz condition)} $\forall y \in \bm{Y}, \forall \{u^{'},u^{''}\} \in \bm{U} $, we have
$
      \left| \frac{f(y, u^{'}, x) - f(y, u^{''}, x)}{f(u^{'}, x) - f(u^{''}, x)} \right| \leq C_1, \left| \frac{f(u^{'}, x) - f(u^{''}, x)}{u^{'} - u^{''}} \right| \leq C_2,
$\label{ass_lipschitz}
where $ C_1, C_2$ are positive constants.
\end{assumption}

\begin{lemma}\label{lemma_continuous}
Suppose that Assumption.~\ref{ass_partial_bounded_continous}-\ref{ass_lipschitz} hold. $\forall i \in \{0,1,...d-1\}, \forall u \in [u_{i}, u_{i+1}]$, we have
\begin{equation}
    \frac{\int_{u_i}^{u_{i+1}}f(y,u,x)du}{\int_{u_i}^{u_{i+1}}f(u,x)du} \leq \frac{f(y,u^{},x)}{f(u^{},x)}\frac{1}{1-\frac{1}{2}C_2 \eta} + \frac{\frac{1}{2}C_1 C_2 \eta}{1-\frac{1}{2}C_2 \eta}. 
\end{equation}
On the other hand,
\begin{equation}
    \frac{\int_{u_i}^{u_{i+1}}f(y,u,x)du}{\int_{u_i}^{u_{i+1}}f(u,x)du} \geq \frac{f(y,u,x)}{f(u,x)}\frac{1}{1+\frac{1}{2}C_2 \eta} - \frac{\frac{1}{2}C_1 C_2 \eta}{1+\frac{1}{2}C_2 \eta}. 
\end{equation}

\end{lemma}

\textbf{The proof of Lemma~\ref{lemma_continuous}} We partition the confounding interval $[U^{L}, U^{U}]$ as $[u_{0}, u_{1}, u_2, ...,u_{d-1}, u_{d}]$, where $u_0 = U^{L}, u_d = U^U$. The independent variables are redefined as
    \begin{equation}
    \begin{aligned}
    \begin{matrix}
     \theta_i = {f(y,U\in [u_i, u_{i+1}],x)},
    \psi_i = {f(U\in [u_i,u_{i+1}],x)}^{},
    \omega_i = {f(U\in [u_i, u_{i+1}],\neg x)}^{},
    \end{matrix}
    \end{aligned}
\end{equation}




%/proof

%$\forall u^{'} \in [u_{i}, u_{i+1}]$, we have

\begin{equation}
    \begin{aligned}
    \forall u^{'} \in [u_{i}, u_{i+1}], \frac{\int_{u_i}^{u_{i+1}}f(y,u,x)du}{\int_{u_i}^{u_{i+1}}f(u,x)du} &\leq \frac{\int_{u_i}^{u_{i+1}}\left[f(y,u^{'},x) + C_1 \left|f(u, x) - f(u^{'}, x)\right| \right] du}{\int_{u_i}^{u_{i+1}}\left[f(u^{'},x)+\left(f(u,x)-  f(u^{'},x) \right) \right] du} \\
    &\leq \frac{f(y,u^{'}, x)(u_{i+1} - u_i) + C_1 C_2 \frac{1}{2}(u_{i+1}-u_i)^2}{f(u^{'},x)(u_{i+1} - u_i) - C_2 \frac{1}{2}(u_{i+1} - u_i)^2}\\
    &\leq \frac{\frac{f(y, u^{'},x)}{f(u^{'},x)}+\frac{\frac{1}{2} C_1 C_2 \eta \delta}{f(u^{'}, x)}}{1-\frac{\frac{1}{2} C_2 \eta \delta}{f(u^{'}, x)}} \\
    &\leq \frac{f(y,u^{'},x)}{f(u^{'},x)}\frac{1}{1-\frac{1}{2}C_2 \eta} + \frac{\frac{1}{2}C_1 C_2 \eta}{1-\frac{1}{2}C_2 \eta}. 
    \end{aligned}
\end{equation}


Here $i=0,1,...d-1$. Analogously, we can prove the other direction. Thus we have proved the lemma.
%\end{proof}
\quad




\begin{corollary}{(PI-SFP's error for continuous confoundings)}
Suppose that Assumptions~\ref{ass_partial_bounded_continous}-\ref{ass_lipschitz} hold. When $U$ is continuous and $\max\limits_{i \in \{1,2,...d\}} |u_i - u_{i-1}| < \eta \delta$, we have
$
          \underline{f(Y_x = y)} \leq \lim \limits_{n\rightarrow +\infty}\underline{\underline{f_{opt}^n (Y_x = y)}} \leq \frac{1}{1-\frac{1}{2}C_2 \eta}\underline{f(Y_x = y)} + \frac{C_1f(\neg x) -  f(y, x)}{2 - C_2 \eta} C_2 \eta.
        $
   \label{theorem_continuous}
\end{corollary}




Then we prove this corollary. 
\textbf{The proof of Corollary.~\ref{theorem_continuous}}
If we use $\underline{f(y,\bm{U},x)}, \underline{f(\bm{U},x)},\underline{f(\bm{U},\neg x)}$ to denote the optimal solution of the optimal value $\underline{f(Y_x = y)}$ in the continuous case, then we have
\begin{equation}
    \begin{aligned}
    &\underline{f(Y_x = y)} -  f(y,x) \\
    =& \int_{U^{L}}^{U^U} \frac{\underline{f(y,u,x)}}{\underline{f(u,x)}}\underline{f(u,\neg x)}du \\
    =& \sum_{i=0}^{d-1}\int_{u_i}^{u_{i+1}} \frac{\underline{f(y,u,x)}}{\underline{f(u,x)}}\underline{f(u,\neg x)}du \\
    \geq &  \sum_{i=0}^{d-1}\int_{u_i}^{u_{i+1}} \left[\frac{\int_{u_i}^{u_{i+1}}\underline{f(y,u,x)}du}{\int_{u_i}^{u_{i+1}}\underline{f(u,x)}du} - \frac{\frac{1}{2}C_1 C_2 \eta}{1-\frac{1}{2} C_2 \eta}\right] (1-\frac{1}{2} C_2 \eta) \underline{f(u,\neg x)}du \\
    =& (1-\frac{1}{2} C_2 \eta) \sum_{i=0}^{d-1} \frac{\int_{u_i}^{u_{i+1}}\underline{f(y,u,x)}du}{\int_{u_i}^{u_{i+1}}\underline{f(u,x)}du} \int_{u_i}^{u_{i+1}}\underline{f(u,\neg x)}du - \frac{1}{2}C_1 C_2 \eta f(\neg x).
    \end{aligned}\label{partial_bound}
\end{equation}
Here $\{\int_{u_i}^{u_{i+1}}\underline{f(y,u,x)}du, \int_{u_i}^{u_{i+1}}\underline{f(u,x)}du, \int_{u_i}^{u_{i+1}}\underline{f(u,\neg x)}du, i=0,1,...d-1\}$ is within the feasible region of PI-SFP in the discrete case. Then we have
\begin{equation}
    \begin{aligned}
    \eqref{partial_bound} &\geq (1-\frac{1}{2} C_2 \eta) \left( \lim\limits_{n \rightarrow +\infty}\underline{\underline{f_{opt}^n (Y_x = y)}} - f(y, x)\right) - \frac{1}{2}C_1 C_2 \eta f(\neg x)\\
    \underline{f(Y_x = y)} &\geq (1-\frac{1}{2} C_2 \eta)\lim\limits_{n \rightarrow +\infty} \underline{\underline{f_{opt}^n (Y_x = y)}} + \frac{1}{2}C_2 \eta f(y, x) - \frac{1}{2}C_1 C_2 \eta f(\neg x)\\
    \lim\limits_{n \rightarrow +\infty} \underline{\underline{f_{opt}^n (Y_x = y)}} &\leq \frac{1}{1-\frac{1}{2}C_2 \eta}\underline{f(Y_x = y)} + \frac{\frac{1}{2}C_1f(\neg x) - \frac{1}{2} f(y, x)}{1-\frac{1}{2}C_2 \eta} C_2 \eta 
    \end{aligned}
\end{equation}

On the other hand, each optimal solution by PI-SFP corresponds to a solution in the continuous case. Namely, if the discrete PI-SFP's optimal solution is denoted as $\{\int_{u_i}^{u_{i+1}}{f(y,u,x)}du, \int_{u_i}^{u_{i+1}}{f(u,x)}du, \int_{u_i}^{u_{i+1}}{f(u,\neg x)}du, i=0,1,...d-1\}$, then we can construct  
\begin{equation}
    \begin{aligned}
    f^{opt}(y,u,x) &= \frac{\int_{u_i}^{u_{i+1}}{f(y,u,x)}du}{u_{i+1} - u_i}, u\in [u_i, u_{i+1}).\\
    f^{opt}(u,x) &= \frac{\int_{u_i}^{u_{i+1}}{f(u,x)}du}{u_{i+1} - u_i}, u\in [u_i, u_{i+1}).\\
    f^{opt}(u,\neg x) &= \frac{\int_{u_i}^{u_{i+1}}{f(u,\neg x)}du}{u_{i+1} - u_i}, u\in [u_i, u_{i+1})
    \end{aligned}
\end{equation}
as a feasible solution in the continuous case. Hence $\lim\limits_{n \rightarrow +\infty}\underline{\underline{f_{opt}^n (Y_x = y)}} \geq \underline{f(Y_x = y)}$. We have finished the proof.




\quad


%\appendix

%%%%%%%%%%%%%%%%%




\subsubsection{Comment on Theorem~\ref{convergence_theorem}: the relationship between $L_n$ and $n$}\label{comment}

\begin{remark}{(Hardness of establishing functional associations between $L_n$ and $n$)} The worst-case scenario for $L_n$ is $L_n = \lfloor \log(n) \rfloor +1$. In this situation, PI-SFP is equivalent to the method of exhaustion, which exhibits slow polynomial convergence, as shown in Theorem~\ref{convergence_theorem}, with a rate of $O(n^{-\alpha})$, where $\alpha = \frac{1}{4d}\log( \frac{{2}}{\sqrt{3}})$. However, empirical evidence suggests that this scenario is rare, and in the simulation section, the convergence rate is faster than $O(n^{-\alpha})$. Additionally, pruning strategies can be employed to further improve the convergence rate, which is discussed in Section~\ref{section_dis_ext}.

Despite this empirical observation, it is well beyond the scope of this paper to theoretically estimate $L_n$ w.r.t $n$. During iteration, each optimal solution (converging point) may be covered by increasing number of nested sequences\footnote{Notice that it is equivalent to guarantee each converging point is covered by finite partitions. It resorts to the regularity condition of simplices (identified in~\citep{ciarlet2002finite}). However, whether Longest-edge bisection can promise a family of regular partitions is still a conjecture~\citep{korotov2016longest} to be solved. }. These sequences possess different lengths and are difficult to estimate. More seriously, the number of optimal solutions is not necessarily finite either, namely $|\bm{\Phi}_{opt}| < +\infty$ may not be guaranteed.









\end{remark}

Notice that $L_n = O(n)$ when each $\phi_{opt}$ is partitioned via a finite number of simplices and $|\bm{\phi_{opt}}| < +\infty$ in the infinite process. Motivated by this, we aim to prove that such finiteness holds under a fairly broad assumption mentioned in \citet{ciarlet2002finite, korotov2016longest}. In fact, we address this conjecture, which is a side contribution of this work:

\emph{What is the maximum intersection number of regular simplicial partitions? In other words, for an arbitrary point in $\mathcal{S}_0$, what is the maximum number of simplices it can belong to during partitioning, under a fairly broad assumption?}


\begin{assumption}


\begin{equation}
    \begin{aligned}
    vol(S) \geq \eta {h(S)}^{d}.
    \end{aligned}
\end{equation}
Here $vol(S)$ denotes the volume of simplex $S$, $h(S)$ denotes the length of the longest edge of $S$, and $d\geq 2$ is the dimension.

\label{ass}
\end{assumption}


Roughly speaking, the regularity assumption guarantees the simplex would not degenerate to the hyper-plane, or else $\lim\limits_{vol(S)\rightarrow 0} \frac{vol(S)}{{h(S)}^d} = 0$. In other words, $\frac{vol(S)}{{h(S)}^d}$ can be higher when the simplex ``seems to be regular'', namely each edge keeps the same length. In addition, the simplicial partitions during partitioning are denoted as $\mathcal{S}_0, \mathcal{S}_1, \mathcal{S}_2...$. Our problem can be summarized as follows: 







\begin{theorem} 
Suppose that Ass.~\ref{ass} holds. Each point in $\mathcal{S}_0$ is included within at most $\frac{1}{\eta}\left(\frac{2e \pi}{ d}\right)^{\frac{d}{2}}$ simplices. 
 
\end{theorem}

\textbf{proof}
For $\gamma_0 \in \mathcal{S}_0$, we construct a ball $\mathcal{B}(\gamma_0, r)$. We use $A(\cdot)$ to denote the surface area of sphere:
\begin{equation}
    \begin{aligned}
    A\left(\mathcal{B}\left(\gamma_0, r\right) \cap \mathcal{S}_0\right)=\sum_{S \in \mathcal{S}_k, \gamma_0 \in S} A\left(\mathcal{B}\left(\gamma_0, r\right) \cap {S} \right).
    \end{aligned}\label{original}
\end{equation}
On the one hand, the LHS of Eqn.~\ref{original} can be upper bounded as:
\begin{equation}
    \begin{aligned}
    A\left(\mathcal{B}\left({\gamma_0}, r\right) \cap \mathcal{S}_0\right) \leq A\left(\mathcal{B}\left({\gamma_0}, r\right)\right)=\frac{2 \pi^{\frac{ d}{2}}}{\Gamma\left(\frac{ d}{2}\right)} r^{d-1}<+\infty .
    \end{aligned}\label{A_S_0}
\end{equation}
On the other hand, we calculate the RHS of Eqn.~\ref{original}. To solve it, we take
advantage of the following integral (we take $\gamma_0$ as the origin):
\begin{equation}
    \begin{aligned}
    \int_{\mathcal{W}_S \gamma \geq 0} e^{-\|\gamma\|^2} d \gamma.
    \end{aligned}
\end{equation}
Here, for each term on the right-hand side, the sphere is cut by certain facets of $S$, whose
normal vectors are collected as the rows of the matrix $\mathcal{W}_S$; each row is a $1 \times d$ normal vector. Without loss of generality, for any facet, we assume that its normal vector points toward the remaining
supporting vector, namely their inner product is positive.

Here $\| \cdot \|$ also denotes the Euclidean norm. If we use the polar coordinates, we can get a new expression by differential element method:

\begin{equation}
    \begin{aligned}
    \int_{\mathcal{W}_S \gamma \geq 0} e^{-\|\gamma\|^2} d \gamma=\int_{\mathcal{W}_S \gamma \geq 0} d \Omega \int_0^{+\infty} e^{-l^2} l^{ d-1} d l=\frac{A\left(\mathcal{B}\left(\gamma_0, r\right) \cap S\right)}{r^{ d-1}} \int_0^{+\infty} e^{-l^2} l^{ d-1} d l.
    \end{aligned}\label{A_S}
\end{equation}
According to Eqn.~\ref{A_S_0}-\ref{A_S}, we have
\begin{equation}
    \begin{aligned}
A\left(\mathcal{B}\left(\gamma_0, r\right) \cap S\right) &=\frac{2 \int_{\mathcal{W}_S \gamma \geq 0} e^{-\|\gamma\|^2} d \gamma}{\Gamma\left(\frac{d}{2}\right)} r^{d-1} \\
\forall k, \frac{A\left(\mathcal{B}\left(\gamma_0, r\right) \cap S\right)}{A\left(\mathcal{B}\left(\gamma_0, r\right) \cap \mathcal{S}_0\right)} & \geq \frac{\int_{\mathcal{W}_S \gamma \geq 0} e^{-\|\gamma\|^2} d \gamma}{\pi^{\frac{d}{2}}}.
    \end{aligned}
\end{equation}
Hence we only need prove the integral is lower bounded by a constant above zero. We will do this by extracting a sub-space from $\mathcal{W}_S \gamma \geq 0$, which is easy to be integrated. Specifically, we consider a sub space which is an affine transformation on $\mathcal{W}_S \gamma \geq  0$. We introduce the diameter of simplex $S$ as $\operatorname{dia}(S) = \max _{s_1, s_2 \in S}\left\|s_1-s_2\right\|$.

\begin{equation}
    \begin{aligned}
\int_{\mathcal{W}_S \gamma \geq 0} e^{-\|\gamma\|^2} d \gamma & \geq \int_{\gamma=t \gamma^{\prime}, \gamma^{\prime} \in S} e^{-\|\gamma\|^2} d \gamma \quad(\forall t>0, t \text { is arbitrarily chosen }) \\
&=\int_{\gamma^{\prime} \in S} t^{d} e^{-t^2\left\|\gamma^{\prime}\right\|^2} d \gamma^{\prime} \\
& \stackrel{*}{\geq } t^{d} \operatorname{Vol}(S) e^{-t^2 \operatorname{dia}(S)^2} \\
& \stackrel{* *}{\geq} \eta t^{d} h(S)^{d} e^{-t^2 \operatorname{dia}(S)^2}
\end{aligned}
\end{equation}
$*$ holds because $\forall \gamma^{\prime} \in S$, we have $\left\|\gamma^{\prime}\right\| \leq \operatorname{dia}(S)$, since $\gamma_0 \in S$ is chosen as the origin. $* *$ is due to Ass.~\ref{ass}. Additionally, we further show that in the above formula, $h(S)=\operatorname{dia}(S)$, namely the longest edge of a simplex always serves as its diameter. It suffices to prove ($S^i$ denotes the supporting vector):
\begin{equation}
    \begin{aligned}
\operatorname{dia}(S) &=\max _{s_1, s_2 \in S}\left\|s_1-s_2\right\| \quad\left(s_2=\sum_i \lambda_i S^i, \lambda_i \in[0,1], \sum_i \lambda_i=1\right) \\
&=\max _{s_1, s_2 \in S}\left\|\left(\sum_i \lambda_i\right) s_1-\sum_i\left(\lambda_i S^i\right)\right\| \\
&=\max _{s_1, s_2 \in S}\left\|\sum_i \lambda_i\left(s_1-S^i\right)\right\| \\
& \leq \max _{s_1 \in S} \max _i\left\|s_1-S^i\right\| \\
& \stackrel{*}{\leq} \max _{i, j}\left\|S^j-S^i\right\| \leq h(S).
\end{aligned}
\end{equation}
$*$ holds because we apply the same expansion to $s_1$, namely $s_1=\sum_i \lambda_i^{\prime} S^i, \lambda_i^{\prime} \in[0,1], \sum_i \lambda_i^{\prime}=1$. On the other hand, we have $h(S) \leq \max _{s_1, s_2 \in S}\left\|s_1-s_2\right\|=\operatorname{dia}(S)$ by definition. Thus $\operatorname{dia}(S)=h(S)$. Hence
\begin{equation}
    \begin{aligned}
    \forall t, \int_{\mathcal{W}_S \boldsymbol{\gamma} \geq 0} e^{-\|\gamma\|^2} d \boldsymbol{\gamma} \geq \eta(t \operatorname{dia}(S))^{d} e^{-\left(t \operatorname{dia}(S)\right)^2}.
    \end{aligned}
\end{equation}

Since $t>0$ is arbitrary, we have
\begin{equation}
    \begin{aligned}
    \int_{\mathcal{W}_S \boldsymbol{\gamma} \geq 0} e^{-\|\gamma\|^2} d \boldsymbol{\gamma} \geq \eta \max _x x^{d} e^{-x^2}=\left.\eta x^{d} e^{-x^2}\right|_{x=\sqrt{\frac{d}{2}}}=\eta(\frac{d}{2})^{\frac{d}{2}} e^{- \frac{d}{2}}.
    \end{aligned}
\end{equation}
Finally, we have
\begin{equation}
    \begin{aligned}
    \forall k, \frac{A\left(\mathcal{B}\left(\gamma_0, r\right) \cap S\right)}{A\left(\mathcal{B}\left(\gamma_0, r\right) \cap \mathcal{S}_0\right)} \geq \eta\left(\frac{d}{2e \pi}\right)^{\frac{d}{2}}, \text{the~intersection~number~} N \leq \frac{1}{\eta}\left(\frac{2e \pi}{ d}\right)^{\frac{d}{2}}<+\infty.
    \end{aligned}
\end{equation}
We have finished the proof.




\newpage
\subsection{Simulations and auxiliary experiments}\label{fig}

\paragraph{Simulations}
Visualizations of PI-SFP's performance in simulations are presented in Figure~\ref{minimum_graph} and Figure~\ref{PI-SFP_0.5}.



\begin{figure*}[h]
    \centering
    \includegraphics[width =3.3cm, height = 3cm]{minimum_graph_0_1.pdf}
    \includegraphics[width =3.3cm, height = 3cm]{minimum_graph_0_2.pdf}
    \includegraphics[width =3.3cm, height = 3cm]{minimum_graph_0_3.pdf}
    \includegraphics[width =3.3cm, height = 3cm]{minimum_graph_0_4.pdf}
    \caption{We search the minimum of $f(Y_x = y)$ in the binary case, by conducting naive linear programming on each fixed $\theta_1 (\theta_2)$ and $\psi_1 (\psi_2)$ in Eqn~\eqref{re-formulation}.}
    \label{minimum_graph}
\end{figure*}

\begin{figure*}[h]
    \centering
    \includegraphics[width = 4cm, height = 3cm]{PI-SFP_0_1.pdf}
    \includegraphics[width = 4cm, height = 3cm]{PI-SFP_0_2.pdf}
    \includegraphics[width = 4cm, height = 3cm]{PI-SFP_0_3.pdf}
    \includegraphics[width = 4cm, height = 3cm]{PI-SFP_0_4.pdf}
    \caption{Results of PI-SFP. PI-SFP (blue) converges to the optimal value of $\underline{f(Y_x = y)}$ with $\varepsilon$ changing from $0.1$ to $0.4$. The red line denotes the theoretical convergence rate (ground truth).}
    \label{PI-SFP_0.5}
\end{figure*}

\paragraph{Real-world experiments}

We present the numerical results of the real-world experiments as follows. Here, the semi-parametric COCA and doubly-robust parametric COCA methods follow~\citet{tchetgen2023single, park2023single}; moreover, the standard difference-in-differences (DiD) method follows~\citet{card1993minimum, angrist2009mostly}.



To facilitate fair comparison, we adopt the same experimental setting as in~\citet{tchetgen2023single} and we refer readers to specific details in their experimental part. Specifically, in the dataset, we set  $185$ and $488$ municipalities as samples from two areas Pernambuco (PE), Rio Grande do Sul (RS), as the treatment and control group, respectively. The covariate $\bm{U}$ contains three parts: (i) municipality-level population size, (ii) population density, and (iii) proportion of females measured
in $2014$. Moreover, we take the ``post-epidemic municipality-level birth rate'' in $2016$ as the outcome $Y$, and whether individuals are infected by the virus as treatment control $X$. Finally, we choose the ``pre-epidemic municipality-level birth rates in 2013 and 2014'' as the outcome proxies $W_1, W_2$, respectively, which are the so-called NCOs in Table~\ref{real}. Our transition matrix $P(\bm{W} \mid \bm{U})$ is also approximated from the observations in the public data.

In this process, we choose $P(\bm{X},\bm{Y}), P(\bm{W})$ and the partially observed transition matrix $P(\bm{W} \mid \bm{U})$ as observed data, and the others as protected features. Our PI-SFP exhibits a narrower lower and upper bound compared with the previous literature in most cases.






\begin{table}
\begin{tabular}{|c|c|c|c|c|}
\hline \multirow{2}{*}{ Estimator } & \multirow{2}{*}{ Statistic } & \multicolumn{3}{|c|}{ NCO } \\
\cline { 3 - 5 } & & $W_1$ & $W_2$ & $\left(W_1, W_2\right)$ \\
\hline \multirow{3}{*}{ Semi-parametric COCA } & Estimate & $-2.410$ & $-2.182$ & $-2.180$ \\
\cline { 2 - 5 } & $\mathrm{SE}$ & $0.356$ & $0.503$ & $0.342$ \\

\cline{2-5} & $95 \% \mathrm{CI}$ & $(-3.107,-1.713)$ & $(-3.168,-1.196)$ & $(-2.850,-1.510)$ \\
\hline \multirow{3}{*}{Doubly-robust parametric COCA}& Estimate & $-2.235$ & $-1.833$ & $-2.182$ \\
\cline { 2 - 5 } & $\mathrm{SE}$ & $0.502$ & $0.519$ & $0.415$ \\
\cline { 2 - 5 } & $95 \% \mathrm{CI}$ & $(-3.220,-1.250)$ & $(-2.850,-0.816)$ & $(-2.996,-1.368)$ \\



\hline \multirow{3}{*}{ Standard DiD } & Estimate & $-1.156$ & $-1.041$ & $-1.041$ \\
\cline { 2 - 5 } & SE & $0.199$ & $0.195$ & $0.195$ \\
\cline { 2 - 5 } & $95 \%$ CI & $(-1.546,-0.767)$ & $(-1.424,-0.658)$ & $(-1.424,-0.658)$ \\

\hline PI-SFP (ours) & bound & $[-3.012,-1.201]$ & $[-2.732,-1.232]$ & $[-2.742,-1.203]$ \\

\hline
 
\end{tabular}\caption{Real-world experiment.}\label{real}
\end{table}





\end{document}
