\documentclass{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
%\usepackage{xr} 
\usepackage{xr-hyper}
% \externaldocument{uai2023main}
\externaldocument{shi_139}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example


%\usepackage[colorlinks=true,linkcolor=black, citecolor=blue]{hyperref}
% \usepackage{hyperref}
\usepackage{url}
\usepackage{graphicx}
\usepackage{wrapfig}
\usepackage{subfigure}

\usepackage{amssymb}




\usepackage{tikz}
% Tikz settings optimized for causal graphs.
% Just copy-paste this part
\usetikzlibrary{shapes, decorations,arrows,calc,arrows.meta,fit,positioning}
\tikzset{
    -Latex,auto,node distance =1. cm and 1. cm,semithick,
    state/.style ={ellipse, draw, minimum width = 0.4 cm},
    point/.style = {circle, draw, inner sep=0.04cm,fill,node contents={}},
    directed/.style={Latex-Latex,dashed},
    el/.style = {inner sep=2pt, align=left, sloped}
}

\def\z{{\phi(Z)}}
\def\E{{\mathcal {E}}}
\def\Ex{{\mathbb{E}}}
\def\H{{\mathcal{H}}}
\def\X{{\mathcal{X}}}
\def\Y{{\mathcal{Y}}}
\def\Z{{\mathcal{Z}}}
\def\L{L}
\def\A{{\mathcal{A}}}
\def\B{{\mathcal{B}}}
\def\N{{\mathcal{N}}}
\def\sumni{\sum_{i=1}^{n_1}}
\def\sumnii{\sum_{i=1}^{n_2}}
\def\sumnj{\sum_{j=1}^{n_2}}
\def\summ{\sum_{l=1}^m}
\def\a{{\text{a}}}
% \def\T{\text{T}}
\def\T{\top}
\def\op{O_p}
\def\di{{d_1}}
\def\dii{{d_2}}
\def\para{{||}}
\def\res{{\bot}}
\def\S{\mathbb S}


\newcommand{\R}{{{\mathbb R}}} 
%\newcommand{\E}{{{\mathbb E}}}
\newcommand{\PP}{{{\mathbb P}}} 
\newcommand{\cP}{{{\mathcal P}}} 

\newcommand{\F}{{{\mathcal F}}} 
\newcommand{\cH}{{{\mathcal H}}} 
\newcommand{\cE}{{{\mathcal E}}}

\usepackage{comment}
% \theoremstyle{definition}
\newtheorem{assumption}{Assumption}
% \newtheorem*{theorem*}{Theorem}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}
\newtheorem{definition}{Definition}
\newtheorem{corollary}{Corollary}
% \newtheorem*{corollary*}{Corollary}
\newtheorem{condition}{Condition}
\newtheorem{proof}{Proof}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\usepackage{cleveref}
\usepackage{todonotes}

\renewcommand\thelemma{A\arabic{lemma}}
\setcounter{lemma}{0}
\renewcommand\theremark{A\arabic{remark}}
\setcounter{remark}{0}

\renewcommand\thesection{\Alph{section}}



\title{Learning Nonlinear Causal Effect via Kernel Anchor Regression\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\vspace{-8em}


\section{Proofs and derivations}
%\wk{I copy paste everything here from draft.tex first}
%\ref{uai2023main-thm::s3c}
\subsection{Proof of Theorem~\ref{thm::s3c}}
Before proving Theorem~\ref{thm::s3c}, we introduce the exact bounds on the approximation errors for estimating $E_X^p$ and $E_Y^p$ in the projection stage with disjoint sample sets. Lemmas~\ref{lem::as1} and~\ref{lem:as2} below are adapted from Theorem~2 in \cite{singh2019kernel}.


\begin{lemma}\label{lem::as1}
% $\forall \alpha_1 > 0$, the solution $E_{\alpha_1,X}^{n_1}$ of the regularized empirical objective $\E_{\alpha_1, X}^{n_1}$ exists, is unique, and
% \begin{eqnarray*}
%     E_{\alpha_1,X}^{n_1} &=& (\mathbf{T}_1 + \alpha_1)^{-1} \circ \mathbf{g}_1, \\
%     \mathbf{T}_1 &=& \frac{1}{n_1} \sumni \phi(z_{1,i}) \otimes \phi(z_{1,i}), \\
%     \mathbf{g}_1 &=& \frac{1}{n_1} \sumni \phi(z_{1,i}) \otimes \psi(x_{1,i}).
% \end{eqnarray*}
Under Condition~\ref{cond::s1}, $\forall \delta \in (0,1)$, the following holds w.p. $1 - \delta$:
\begin{eqnarray*}
    \Vert E_{\alpha_1,X}^{n_1} - E_{X}^p \Vert_{\H_\Gamma} \leq
    r_{E_1}(\delta,n_1,c_1) := 
    \frac{ \sqrt{\zeta_1} (c_1 + 1)}{4^{\frac{1}{c_1+1}}} \left( \frac{4\kappa (Q_1 + \kappa \Vert E_{X}^p \Vert_{\H_{\Gamma}}) \ln(2/\delta) }{\sqrt{n_1 \zeta_1}(c_1 -1)} \right)^{\frac{c_1-1}{c_1+1}},\\
    \alpha_1 = \left( \frac{8\kappa (Q_1 + \kappa \Vert E_{X}^p \Vert_{\H_\Gamma}) \ln(2/\delta) }{\sqrt{n_1 \zeta_1}(c_1 -1)} \right)^{\frac{2}{c_1+1}}.
\end{eqnarray*}
\end{lemma}


\begin{lemma}\label{lem:as2}
% $\forall \alpha_2 > 0$, the solution $E_{\alpha_2,Y}^{n_2}$ of the regularized empirical objective $\E_{\alpha_2}^{n_2}$ exists, is unique, and
% \begin{eqnarray*}
%     E_{\alpha_2,Y}^{n_2} &=& (\mathbf{T}_2 + \alpha_2)^{-1} \circ \mathbf{g}_2, \\
%     \mathbf{T}_2 &=& \frac{1}{n_2} \sumni \phi(z_{2,i}) \otimes \phi(z_{2,i}), \\
%     \mathbf{g}_2 &=& \frac{1}{n_2} \sumni \phi(z_{2,i}) y_{2,i}.
% \end{eqnarray*}
Under Condition~\ref{cond::s1} and Condition~\ref{cond::s2}, $\forall \epsilon \in (0,1)$, the following holds w.p. $1 - \epsilon$:
\begin{eqnarray*}
    \Vert E_{\alpha_2,Y}^{n_2} - E_{Y}^p \Vert_{\H_\Theta} \leq
    r_{E_2}(\epsilon,n_2,c_2) := 
    \frac{ \sqrt{\zeta_2} (c_2 + 1)}{4^{\frac{1}{c_2+1}}} \left( \frac{4\kappa (Q_2 + \kappa \Vert E_{Y}^p \Vert_{\H_{\Theta}}) \ln(2/\epsilon) }{\sqrt{n_2 \zeta_2}(c_2 -1)} \right)^{\frac{c_2-1}{c_2+1}},\\
    \alpha_2 = \left( \frac{8\kappa (Q_2 + \kappa \Vert E_{Y}^p \Vert_{\H_\Theta}) \ln(2/\epsilon) }{\sqrt{n_2 \zeta_2}(c_2 -1)} \right)^{\frac{2}{c_2+1}}.
\end{eqnarray*}
\end{lemma}


Recall that we define the population-level risk for the regression stage $\E^\gamma(H)$, population-level risk with regularization $\E^\gamma_\xi(H)$, and the empirical risk $\widehat \E^{\gamma, m}_{\xi}(H)$ with $E_X^p$ and $E_Y^p$ being replaced by $E_{\alpha_1,X}^{n_1}$ and $E_{\alpha_2,Y}^{n_2}$, respectively. 
Denote the optimal operator to $\E^\gamma_\xi(H)$ as $H_\xi^{\gamma} = \argmin_{H} \E_\xi^{\gamma}(H)$.
We now define the empirical risk $\E_\xi^{\gamma,m}(H)$ with the true $E_X^p$ and $E_Y^p$, and the corresponding optimal operator.
\begin{eqnarray*}
    \E_\xi^{\gamma,m}(H) = \frac{1}{m} \summ  \Vert  y_{\gamma,l} - H \psi_{\gamma,l} \Vert_{\Y}^2 + \xi \Vert H \Vert_{\H_\Omega}^2, \quad
    H_\xi^{\gamma,m} = \argmin_{H} \E_\xi^{\gamma,m}(H),
\end{eqnarray*}
where the true transformed inputs and outputs are given by $$\psi_{\gamma,l} = \psi(x_{l}) + (\sqrt{\gamma}-1)  E_{X}^p  \phi(z_{l}) \in \H_\X, \quad y_{\gamma,l} = y_{l} + (\sqrt{\gamma}-1)  E_{Y}^p \phi(z_{l}) \in \Y.$$ 

The closed-form solution of $H_\xi^{\gamma,m}$ is given by Lemma~\ref{lem:as3} below, which is adapted from Theorem~3 in \cite{singh2019kernel}.



\begin{lemma}\label{lem:as3}
$\forall \xi > 0$, the solution $H_{\xi}^{\gamma,m}$ to $\E_\xi^{\gamma,m}$ exists, is unique, and
\begin{eqnarray*}
    &\mathbf{T}= \frac{1}{m} \summ T_{\psi_{\gamma,l}}, \quad
    \mathbf{g} = \frac{1}{m} \summ \Omega_{\psi_{\gamma,l}} y_{\gamma,l},
    &H_\xi^{\gamma,m} = (\mathbf{T} + \xi)^{-1} \circ \mathbf{g}.
\end{eqnarray*}
\end{lemma}

%Before showing the convergence rate of $\widehat H_\xi^{\gamma,m}$, 
We then define the following terms.
%can be used to bound the excess error.
% \begin{definition} \label{def::ABN}
% %(Definition 7 of \cite{singh2019kernel})
% The residual $\A(\xi)$, reconstruction error $\B(\xi)$, and effective dimension $\N(\xi)$ are 
% \begin{eqnarray*}
%     \A(\xi) &=& \Vert \sqrt{T} ( H_\xi^\gamma - H^\gamma ) \Vert_{\H_\Omega}^2,\\
%     \B(\xi) &=& \Vert H_\xi^\gamma - H^\gamma \Vert_{\H_\Omega}^2,\\
%     \N(\xi) &=& Tr[(T+\xi)^{-1} \circ T].
% \end{eqnarray*}
% \end{definition}

\begin{definition}\label{def::const}
%(Definition 8 in \cite{singh2019kernel})
Fix $\eta \in (0,1)$ and define the following constants
\begin{eqnarray*}
    C_{\eta} = 96 \ln^2(6/\eta), \quad
    M = 2(C + \Vert H^\gamma \Vert_{\H_\Omega} \sqrt{B}), \quad
    \Sigma = \frac{M}{2}.
\end{eqnarray*}
\end{definition}

For the excess error of \textbf{KAR} estimator $\widehat H_\xi^{\gamma,m}$, 
%in the final regression stage, 
we can bound it by five terms according to Proposition 32 in \cite{singh2019kernel}. 
%we can bound the excess error of kernel anchor regression estimator by five five terms as shown below. 
\begin{lemma}\label{lem::5term}
%(Proposition 32 in \cite{singh2019kernel})
The excess error can be bounded as follows
\begin{eqnarray*}
    \E^\gamma(\widehat H^{\gamma,m}_\xi) - \E^\gamma(H^\gamma) \leq 5 [ S_{-1} + S_0 + \A(\xi) + S_1 + S_2],
\end{eqnarray*}
where
\begin{eqnarray*}
    S_{-1} &=& \Vert \sqrt{T} \circ (\widehat{\mathbf{T}} + \xi)^{-1} (\widehat{\mathbf{g}} - \mathbf{g}) \Vert_{\H_\Omega}^2,\\
    S_0 &=& \Vert \sqrt{T} \circ (\widehat{\mathbf{T}} + \xi)^{-1} (\mathbf{T} - \widehat{\mathbf{T}}) H^{\gamma,m}_\xi \Vert_{\H_\Omega}^2,\\
    S_1 &=& \Vert \sqrt{T} \circ (\widehat{\mathbf{T}} + \xi)^{-1} (\mathbf{g} - \mathbf{T}H^\gamma) \Vert_{\H_\Omega}^2,\\
    S_2 &=& \Vert \sqrt{T} \circ (\widehat{\mathbf{T}} + \xi)^{-1} (T - \mathbf{T}) (H^\gamma_\xi - H^\gamma) \Vert_{\H_\Omega}^2,\\
    \A(\xi) &=& \Vert \sqrt{T} (H_\xi^\gamma - H^\gamma) \Vert_{\H_\Omega}^2.
\end{eqnarray*}
\end{lemma}

Among the five terms above, only $\widehat{\mathbf{g}} - \mathbf{g}$ in $S_{-1}$ depends on the approximation error of $E_Y^p$. The bounds for the other four terms are the same as in the \textbf{KIV} case. Below we state without proof the bounds on $S_0$, $S_1$, $S_2$ and $\A(\xi)$ according to Theorem~7 in \cite{singh2019kernel}.

\begin{lemma}
\label{lem::thm7}
%(Theorem 7 in \cite{singh2019kernel})
Under Conditions~\ref{cond::s1}--\ref{cond::s3}, if $m$ is large enough and $\xi \leq \Vert T\Vert_{L(\H_\Omega)}$ then $\forall \delta, \eta \in (0,1)$, the following holds w.p. $1-\eta-\delta$:
\begin{eqnarray*}
    S_0 &\leq& \frac{4}{\xi} 4BL^2 r_x^{2\iota} \Vert H^{\gamma, m}_\xi \Vert^2_{\H_\Omega}, \\
    S_1 &\leq& 32\ln^2(6/\eta) \left[ \frac{BM^2}{m^2\xi} + \frac{\Sigma^2}{m} \beta^{1/b_\gamma} \frac{\pi/b_\gamma}{\sin(\pi/b_\gamma)} \xi^{-1/b_\gamma} \right],\\
    S_2 &\leq& 8 \ln^2(6/\eta) \left[ \frac{4B^2\zeta\xi^{c_\gamma-1}}{m^2\xi} + \frac{B\zeta\xi^{c_\gamma}}{m \xi} \right],\\
    \A(\xi) &\leq& \zeta \xi^{c_\gamma}.
    % \Vert H^{\gamma, m}_\xi \Vert_{\H_\Omega}^2 &\leq& \frac{16}{\xi} 6 \ln^2(6/\eta) \left[ \frac{M^2B}{m^2\xi} + \frac{\Sigma^2}{m} \beta^{1/b} \frac{\pi/b}{\sin(\pi\b) \xi^{-1/b} } \right]  \\
    % && + \frac{4}{\xi^2} 6\ln^2(6\eta) \left[ \frac{4B^2 \zeta \xi^{c-1}}{m^2} \right] + 6\zeta \xi^{c-1} + 6\Vert H^\gamma \Vert^2_{\H_\Omega}.
\end{eqnarray*}
\end{lemma}

To extend the convergence rate of the \textbf{KIV} estimator to the \textbf{KAR} estimator, we now derive the bound for $S_{-1}$.
To begin with, the bound of term $\sqrt{T} \circ ( \widehat{\mathbf{T}} + \xi)^{-1}$ in $S_{-1}$ is given by Proposition 39 in \cite{singh2019kernel}.
\begin{lemma}
\label{lem::prop39}
%(Proposition 39 in \cite{singh2019kernel})
If $\Vert \widehat \psi_{\gamma} - \psi_{\gamma} \Vert_{\H_\X} \leq r_x$ w.p. $1-\delta$, 
%$m \geq \max\left\{ \frac{2C_\eta B \N(\xi)}{\xi}, \overline{m}(\delta, c_1) \right\}$, 
$\xi \leq \Vert T \Vert_{\L(\H_\Omega)}$, $m$ is sufficiently large and Condition~\ref{cond::s3} holds, then w.p. $1-\eta/3-\delta$
$$
\Vert \sqrt{T} \circ ( \widehat{\mathbf{T}} + \xi)^{-1} \Vert_{\L(\H_\Omega)} \leq \frac{2}{\sqrt{\xi}}.
$$
\end{lemma}

% \begin{lemma}\label{lem::33}
% (Proposition 33 in \cite{singh2019kernel})
% If $m \geq 2C_\eta B\N(\xi)$ and $\xi \leq \Vert T \Vert_{\L(\H_\Omega)}$, then w.p. $1 - \eta/3$
% $$
%     \Vert (T - \mathbf{T}) \circ (T + \xi)^{-1} \Vert_{\L(\H_\Omega)} \leq 1/2.
% $$
% \end{lemma}

% \begin{lemma}\label{lem::S12}
% (Proposition 34 in \cite{singh2019kernel})
% If $m \geq 2C_\eta B\N(\xi)$, $\xi \leq \Vert T \Vert_{\L(\H_\Omega)}$, then with w.p. $1-2\eta/3$
% \begin{eqnarray*}
%     S_1 &\leq& 32 \ln^2(6/\eta)\left[ \frac{BM^2}{m^2\xi} + \frac{\Sigma^2 \N(\xi)}{m} \right],\\
%     S_2 &\leq& 8 \ln^2(6/\eta)\left[ \frac{4B^2 \B(\xi)} {m^2\xi} + \frac{B \A(\xi)}{m \xi}\right]
% \end{eqnarray*}
% \end{lemma}

% \begin{lemma}\label{lem::S-10}
% (Proposition 35 in \cite{singh2019kernel})
% $S_{-1}$ and $S_0$ may be bounded by
% \begin{eqnarray*}
%     S_{-1} &\leq& \Vert \sqrt{T} \circ (\hat{\mathbf{T}} + \xi)^{-1}\Vert_{\L(\H_\Omega)}^2 \Vert(\hat{\mathbf{g}} - \mathbf{g}) \Vert_{\H_\Omega}^2,\\
%     S_0 &\leq& \Vert \sqrt{T} \circ (\hat{\mathbf{T}} + \xi)^{-1} \Vert_{\L(\H_\Omega)}^2 \Vert \mathbf{T} - \hat{\mathbf{T}}\Vert_{\H_\Omega}^2 \Vert H^m_\xi \Vert_{\H_\Omega}^2.\\
% \end{eqnarray*}
% \end{lemma}

% \begin{lemma}\label{lem::Hmxi}
% (Proposition 36 in \cite{singh2019kernel})
% If $m \geq 2C_\eta B\N(\xi)$ and $\xi \leq \Vert T \Vert_{\L(\H_\Omega)}$, then
% \begin{eqnarray*}
%     & & \Vert H^m_\xi \Vert_{\H_\Omega}^2 \\
%     & \leq & 6 \left( \frac{16}{\xi} \ln^2(6/\eta) \left[ \frac{M^2 B}{m^2 \xi} + \frac{\Sigma^2 \N(\xi)}{m} \right] + \frac{4}{\xi^2} \ln^2(6/\eta) \left[ \frac{4 B^2 \B(\xi)}{m^2} + \frac{B \A(\xi)}{m} + \B(\xi) + \Vert H^\gamma \Vert_{\H_\Omega}^2  \right]
%     \right).
% \end{eqnarray*}
% \end{lemma}

% \begin{lemma}\label{lem::T}
% (Proposition 38 in \cite{singh2019kernel})
% \begin{eqnarray*}
%     & & \Vert \sqrt{T} \circ ( T + \xi)^{-1} \Vert_{\L (\H_\Omega)}^2 \leq \frac{1}{2 \sqrt \xi} .
% \end{eqnarray*}
% \end{lemma}

% \begin{lemma}\label{lem::Tdiff}
% (Proposition 37 in \cite{singh2019kernel})
% If $\Vert \hat x_a - x_a \Vert_{\H_\X} \leq r_x$ w.p. $1 - \delta$, then w.p. $1 - \delta$
% $$
%     \Vert \mathbf{T} - \hat{\mathbf{T}} \Vert_{\L(\H_\Omega)}^2 \leq 4BL^2 r_x^{2 \iota}. 
% $$
% \end{lemma}

% \begin{lemma}
% (Proposition 39 in \cite{singh2019kernel})

% \end{lemma}

With the error propagated from the estimators in the projection stage, we can bound $\widehat \psi_\gamma - \psi_\gamma$ and $\widehat y_\gamma - y_\gamma$ as shown in Lemmas~\ref{lem::xadiff}--\ref{lem::yadiff}.
%Here we give bound on $\hat{\mathbf{g}} - \mathbf{g}$, $\widehat \psi_\gamma - \psi_\gamma$ and $\widehat y_\gamma - y_\gamma$, which are different from \cite{singh2019kernel}.
\begin{lemma}\label{lem::xadiff}
Under Condition~\ref{cond::s1},
$\forall \delta \in (0,1)$, the following statement holds w.p. $1-\delta$: $\forall z \in \Z, x \in \X$,
$$
    \Vert \widehat \psi_\gamma - \psi_\gamma \Vert_{\H_\X} \leq r_x(\gamma, \delta, n_1, c_1)
    := \vert \sqrt{\gamma} - 1 \vert \kappa r_{E_1}(\delta, n_1, c_1).
$$
\end{lemma}
\begin{proof}
By definition, we have
\begin{eqnarray*}
\Vert \widehat \psi_\gamma - \psi_\gamma \Vert_{\H_\X} &=&
\Vert \left( \sqrt{\gamma} - 1 \right) \left( E^{n_1}_{\alpha_1,X} - E_{X}^p  \right) \phi(z) \Vert_{\H_\X}\\
&\leq& \vert \sqrt{\gamma} - 1 \vert \Vert E^{n_1}_{\alpha_1,X} - E_{X}^p \Vert_{\H_\Gamma} \Vert \phi(z) \Vert_{\H_\Z}.
\end{eqnarray*}
This, together with Lemma~\ref{lem::as1} and Condition~\ref{cond::s1}, ensures that w.p. $1-\delta$
$$
    \Vert \widehat \psi_\gamma - \psi_\gamma \Vert_{\H_\X} \leq r_x(\gamma, \delta, n_1, c_1)
    := \vert \sqrt{\gamma} - 1 \vert \kappa r_{E_1}(\delta, n_1, c_1).
$$
\end{proof}
\begin{remark}
Corollary 1 in \cite{singh2019kernel} is a special case of Lemma~\ref{lem::xadiff} with $\gamma=0$.
\end{remark}

\begin{lemma}\label{lem::yadiff}
Under Conditions~\ref{cond::s1}--\ref{cond::s2},
$\forall \epsilon \in (0,1)$, the following statement holds w.p. $1-\epsilon$: $\forall z \in \Z, y \in \Y$,
$$
    \Vert \widehat y_\gamma - y_\gamma \Vert_{\Y} \leq r_y(\gamma, \epsilon, n_2, c_2)
    := \vert \sqrt{\gamma} - 1 \vert \kappa r_{E_2}(\epsilon, n_2, c_2).
$$
\end{lemma}
\begin{proof}
Lemma~\ref{lem::yadiff} is analogous to Lemma~\ref{lem::xadiff} by replacing $\psi_\gamma$ with $y_\gamma$. The proof is thus omitted.
\end{proof}

Combining Lemmas~\ref{lem::prop39}--\ref{lem::yadiff}, we can derive the bound on $\widehat{\mathbf{g}} - \mathbf{g}$ and then the bound on $S_{-1}$.
\begin{lemma}\label{lem::gdiff}
If $\Vert \widehat \psi_\gamma - \psi_\gamma \Vert_{\H_\X} \leq r_x$ w.p. $1 - \delta$ and $\Vert \widehat y_\gamma - y_\gamma \Vert_{\Y} \leq r_y$ w.p. $1 - \epsilon$, then w.p. $1 - \delta - \epsilon$
$$
    \Vert\widehat{\mathbf{g}} - \mathbf{g} \Vert_{\H_\Omega}^2
    \leq 3( L^2 r_x^{2\iota} r_y^2 + B^2 r_y^2 + L^2 r_x^{2\iota} C^2 ).
$$
\end{lemma}
\begin{proof}
By definition, we have
\begin{eqnarray*}
    \widehat{\mathbf{g}} - \mathbf{g} 
    &=& \frac{1}{m} \summ  \left ( \Omega_{\widehat \psi_{\gamma, l}} \widehat y_{\gamma,l} - \Omega_{ \psi_{\gamma, l}}  y_{\gamma,l} \right )\\
    &=& \frac{1}{m} \summ \Big[ \left\{ \Omega_{\widehat \psi_{\gamma, l}} -  \Omega_{ \psi_{\gamma, l}} \right\} \left\{ \widehat y_{\gamma,l} - y_{\gamma,l} \right\} + \Omega_{\psi_{\gamma, l}} \left\{ \widehat y_{\gamma,l} - y_{\gamma,l} \right\} + \left\{ \Omega_{\widehat \psi_{\gamma, l}} -  \Omega_{ \psi_{\gamma, l}} \right\} y_{\gamma,l} \Big].
    % &=& \frac{1}{m} \summ  \left \{ \psi(x_{3,i})  + \left( \sqrt{\gamma}-1 \right) (E_{\alpha_2, X}^{n_2})^*  \phi(z_{3,i}) \right\} \left\{ y_{\a,3,i} + \left( \sqrt{\gamma}-1 \right) (E_{\alpha_1, Y}^{n_1})^* \phi(z_{3,i}) \right\}\\
    % && - \left\{ \psi(x_{3,i}) + \left( \sqrt{\gamma}-1 \right) (E_{\rho, X})^* \phi(z_{3,i}) \right\} \left\{ y_{\a,3,i} + \left( \sqrt{\gamma}-1 \right) (E_{\rho, Y})^* \phi(z_{3,i})\right\}\\
\end{eqnarray*}
We then have
\begin{eqnarray*}
    \Vert \widehat{\mathbf{g}} - \mathbf{g} \Vert_{\H_\Omega}^2 
    &\leq& \frac{3m}{m^2} \summ \Vert \left\{ \Omega_{\widehat \psi_{\gamma, l}} -  \Omega_{ \psi_{\gamma, l}} \right\} \left\{ \widehat y_{\gamma,l} - y_{\gamma,l} \right\} \Vert_{\H_\Omega}^2 + \Vert \Omega_{\psi_{\gamma, l}} \left\{ \widehat y_{\gamma,l} - y_{\gamma,l} \right\} \Vert_{\H_\Omega}^2  \\
    && + \Vert \left\{ \Omega_{\widehat \psi_{\gamma, l}} -  \Omega_{ \psi_{\gamma, l}} \right\} y_{\gamma,l} \Vert_{\H_\Omega}^2 \\
    &\leq& \frac{3}{m} \summ \Vert  \Omega_{\widehat \psi_{\gamma, l}} -  \Omega_{ \psi_{\gamma, l}} \Vert_{\L(\Y, \H_\Omega)}^2  \Vert \widehat y_{\gamma,l} - y_{\gamma,l}  \Vert_{\Y}^2 + \Vert \Omega_{ \psi_{\gamma, l}} \Vert_{\L(\Y, \H_\Omega)}^2 \Vert \widehat y_{\gamma,l} - y_{\gamma,l}  \Vert_{\Y}^2 \\
    && + \Vert  \Omega_{\widehat \psi_{\gamma, l}} -  \Omega_{ \psi_{\gamma, l}} \Vert_{\L(\Y, \H_\Omega)}^2 \Vert y_{\gamma,l} \Vert_{\Y}^2.
\end{eqnarray*}
By the boundedness and the Hölder property in Condition~\ref{cond::s3}, we obtain that w.p. $1-\delta-\epsilon$,
\begin{eqnarray*}
    \Vert \widehat{\mathbf{g}} - \mathbf{g} \Vert_{\H_\Omega}^2 
    &\leq& \frac{3}{m} \summ L^2 \Vert \widehat \psi_{\gamma, l} -  \psi_{\gamma, l}\Vert_{\H_\X}^{2\iota} \Vert \widehat y_{\gamma,l} - y_{\gamma,l}  \Vert_{\Y}^2 + \Vert \Omega_{ \psi_{\gamma, l}} \Vert_{\L(\Y, \H_\Omega)}^2 \Vert \widehat y_{\gamma,l} - y_{\gamma,l}  \Vert_{\Y}^2 \\
    && + L^2 \Vert \widehat \psi_{\gamma, l} -  \psi_{\gamma, l}\Vert_{\H_\X}^{2\iota} \Vert y_{\gamma,l} \Vert_{\Y}^2\\
    &\leq& 3( L^2 r_x^{2\iota} r_y^2 + B^2 r_y^2 + L^2 r_x^{2\iota} C^2).
\end{eqnarray*}
\end{proof}

\begin{lemma}
\label{lem::S-1}
Under Conditions~\ref{cond::s1}--\ref{cond::s3}, w.p. $1-\delta-\epsilon$
$$
    S_{-1} \leq \frac{4}{\xi} 3( L^2 r_x^{2\iota} r_y^2 + B^2 r_y^2 + L^2 r_x^{2\iota} C^2 ).
$$
%  S_{-1} &=& \Vert \sqrt{T} \circ (\hat{\mathbf{T}} + \xi)^{-1} (\hat{\mathbf{g}} - \mathbf{g}) \Vert_{\H_\Omega}^2,\\
\end{lemma}
\begin{proof}
We can derive from the definition of $S_{-1}$ that
$$
    S_{-1} \leq \Vert \sqrt{T} \circ (\widehat{\mathbf{T}} + \xi)^{-1} \Vert_{\L(\H_\Omega)}^2  \Vert \widehat{\mathbf{g}} - \mathbf{g} \Vert_{\H_\Omega}^2.
$$
This, together with Lemma~\ref{lem::prop39} and Lemma~\ref{lem::gdiff}, ensures
$$
    S_{-1} \leq \frac{4}{\xi} 3( L^2 r_x^{2\iota} r_y^2 + B^2 r_y^2 + L^2 r_x^{2\iota} C^2 ).
$$
\end{proof}

We then show the order of the sum $S_{0} + S_1 + S_2 + \A(\xi)$, which is adapted from Theorem 4 in \cite{singh2019kernel}.
\begin{lemma}
\label{lem::4term}
%(Theorem 4 in \cite{singh2019kernel})
Under Conditions~\ref{cond::s1}--\ref{cond::s3}, choose $\alpha_1 = n_1^{-\frac{1}{c_1+1}}$, $n_1 = m^{\frac{d_1(c_1+1)}{\iota(c_1-1)}}$, where $d_1 > 0$. Let 
$$
    f(m) = \frac{1}{m^{2+\di}\xi^3} + \frac{1}{m^{1+\di}\xi^{2+1/b_\gamma}} + \frac{1}{m^\di \xi} + \xi^{c_\gamma} + \frac{1}{m^2\xi} + \frac{1}{m\xi^{1/b_\gamma}},
$$
we then have
$$
    \op(S_0 + \A(\xi) + S_1 + S_2) = O(f(m)).
$$
\begin{itemize}
    \item [\textup(i)] If $\di \leq \frac{b_\gamma({c_\gamma}+1)}{{b_\gamma c_\gamma}+1}$ then $O(f(m)) = O(m^{-\frac{\di {c_\gamma}}{{c_\gamma}+1}})$ with $\xi = m^{-\frac{\di}{{c_\gamma}+1}}$;
    
    \item [\textup(ii)] If $\di > \frac{b_\gamma({c_\gamma}+1)}{{b_\gamma c_\gamma}+1}$ then $O(f(m)) = O(m^{-\frac{b_\gamma {c_\gamma}}{{b_\gamma c_\gamma}+1}})$ with $\xi = m^{-\frac{b_\gamma}{{b_\gamma c_\gamma}+1}}$.
\end{itemize}
\end{lemma}

% \begin{proof}
% [Proof of Theorem~\ref{thm::s3e}]

% The choice of $\alpha_2$ and $n_2$ in the statement of Theorem~\ref{thm::s3e} ensure that
% $$
% r_y^2 = O([(n_2^{-\frac{1}{2}})^{\frac{2}{c_2+1}}]^2) = O(m^{-\dii})
% $$

% Ignoring constants in Lemma~\ref{lem::gdiff}, by Lemma~\ref{lem::4term} we have
% $$
% \E(\hat H^m_\xi) - \E(H^\gamma) = \op(f(m) + \frac{1}{\xi}())
% $$

% Let $\xi = m^{-e}$, we have $e>0$ as $\xi \rightarrow 0$. Note that $f(m) +  m^{-\dii}\xi^{-1}$ is only one term more than $f(m)$ in Lemma~\ref{lem::4term}. Therefore, we only need to include the case where the extra term $m^{-\dii}\xi^{-1}$ has the highest order, which requires
% \begin{eqnarray}
% -\dii + e &\geq& -(2+\di) + 3e,
% \label{eq::1}\\
% -\dii + e &\geq& -(1+\di) + (2+\frac{1}{b}e),
% \label{eq::2}\\
% -\dii + e &\geq& -\di + e,
% \label{eq::3}\\
% -\dii + e &\geq& -ce,
% \label{eq::4}\\
% -\dii + e &\geq& -2+e,
% \label{eq::5}\\
% -\dii + e &\geq& -1 + \frac{1}{b}e.
% \label{eq::6}
% \end{eqnarray}

% Note that $\di, \dii, e >0$, $b >1$ and $c \in (1,2]$, so $  b(1-c) <  2 \Leftrightarrow (b(c+1))/({bc+1}) < 2 $.

% For case (i)(a), $d_1, d_2 \leq \frac{b(c+1)}{bc+1}$ and $d_2 \leq \min (d_1, \frac{b(c+1)(d_1+1)}{bc+2b+1})$,
% \end{proof}

\begin{proof}
[Proof of Theorem~\ref{thm::s3c}]
The choices of $\alpha_1, \alpha_2$ and $n_1, n_2$ in the statement of Theorem~\ref{thm::s3c} ensure that
\begin{eqnarray*}
r_x^{2\iota} = O([(n_1^{-\frac{1}{2}})^{\frac{c_1-1}{c_1+1}}]^{2\iota}) = O(m^{-\di}), \quad
r_y^2 = O([(n_2^{-\frac{1}{2}})^{\frac{c_2-1}{c_2+1}}]^2) = O(m^{-\dii}).
\end{eqnarray*}
Thus, by Lemma~\ref{lem::S-1}, we have $\op(S_{-1}) = \op(1/{\xi}( r_x^{2\iota} r_y^2 + r_y^2 + r_x^{2\iota})) = \op(1/{\xi} \left\{ m^{-\di} + m^{-\dii} + m^{-\di-\dii} \right\} )$. Since $\di, \dii >0$, and $\di \leq \dii$ by Condition~\ref{cond::s1&2}, $m^{-\di}/{\xi}$ then dominates two other terms in $S_{-1}$. 

Note that $f(m)$ in Lemma~\ref{lem::4term} also includes $m^{-\di}/{\xi}$. Therefore, given Condition~\ref{cond::s1&2}, the sum of four terms $S_0 + \A(\xi) + S_1 + S_2$ dominates $S_{-1}$, which suggests that the approximation error of $E_Y^p$ is dominated by that of $E_X^p$. We can then derive the result from Lemma~\ref{lem::4term}.
\end{proof}



% \begin{theorem}\label{thm::s3c}
% Under Condition~\ref{cond::s1}, ~\ref{cond::s2} and ~\ref{cond::s3}, choose $\alpha_1 = n_1^{-\frac{1}{c_1+1}}$, $\alpha_2 = n_2^{-\frac{1}{c_2+1}}$, $n_1 = m^{\frac{d_1(c_1+1)}{\iota(c_1+1)}}$ and $n_2 = m^{\frac{d_2(c_2+1)}{\iota(c_2+1)}}$, where $d_1, d_2 > 0$. We have:
% \begin{itemize}
%     \item [\textup(i)] If $d_1, d_2 \leq \frac{b(c+1)}{bc+1}$,
%     \begin{itemize}
%         \item [\textup{(a)}] if $d_2 \leq \min (d_1, \frac{b(c+1)(d_1+1)}{bc+2b+1})$, then $\E(\hat H_\xi^m) - \E(H^\gamma) = \op(m^{-\frac{d_2c}{c+1}})$ with $\xi = m^{-\frac{d_2}{c+1}}$;
%         \item [\textup{(b)}] otherwise $\E(\hat H_\xi^m) - \E( H^\gamma) = \op(m^{-\frac{d_1c}{c+1}})$ with $\xi = m^{-\frac{d_1}{c+1}}$.
%     \end{itemize}    
    
%     \item [\textup(ii)] If $d_1 \leq \frac{b(c+1)}{bc+1}$ and $d_2 > \frac{b(c+1)}{bc+1}$,
%     \begin{itemize}
%         \item [\textup{(a)}] if $d_2 \leq  \frac{b(c+1)(d_1+1)}{bc+2b+1}$, then $\E(\hat H_\xi^m) - \E( H^\gamma) = \op(m^{-\frac{d_2c}{c+1}})$ with $\xi = m^{-\frac{d_2}{c+1}}$;
%         \item [\textup{(b)}] otherwise $\E(\hat H_\xi^m) - \E( H^\gamma) = \op(m^{-\frac{bc}{c+1}})$ with $\xi = m^{-\frac{b}{bc+1}}$.
%     \end{itemize} 
    
%     \item [\textup(iii)] If $d_1 > \frac{b(c+1)}{bc+1}$ and $d_2 \leq \frac{b(c+1)}{bc+1}$, then $\E(\hat H_\xi^m) - \E(tH^\gamma) = \op(m^{-\frac{d_1c}{c+1}})$ with $\xi = m^{-\frac{d_1}{c+1}}$.
    
%     \item [\textup(iv)] If $d_1, d_2 > \frac{b(c+1)}{bc+1}$,
%     \begin{itemize}
%         \item [\textup{(a)}] if $d_2 \leq \min (2, 1+\frac{(b-1)d_2}{2b}, 1 + \frac{(b-1)d_2-1}{3b-1})$, then $\E(\hat H_\xi^m) - \E( H^\gamma) = \op(m^{-\frac{d_2-b}{b-1}})$ with $\xi = m^{-\frac{b(d_2-1)}{b-1}}$;
%         \item [\textup{(b)}] otherwise $\E(\hat H_\xi^m) - \E( H^\gamma) = \op(m^{-\frac{bc}{c+1}})$ with $\xi = m^{-\frac{b}{bc+1}}$.
%     \end{itemize}    
    
% \end{itemize}
% \end{theorem}

\subsection{Proof of Theorem~\ref{thm::causal}}
\begin{proof}
[Proof of Theorem~\ref{thm::causal}]

Under the kernel structural equation model, simple calculation gives
\begin{align}
C &= B_{CZ}\Phi(Z) +\epsilon_C, \label{eq:sem_c}\\
\Psi(X) &= (B_{XZ} +B_{XC}B_{CZ}) \Phi(Z) + B_{XC}\epsilon_C + \epsilon_X, \label{eq:sem_x}\\
Y &= [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ] \Phi(Z) + (B_{YC} +B_{YX}B_{XC})\epsilon_C + B_{YX}\epsilon_X + \epsilon_Y.\label{eq:sem_y}
\end{align}
We denote by $B_{\square \triangle}$ the adjoint operator of $B_{\triangle\square}$, i.e., $B_{\square \triangle} = B_{\triangle \square}^*$. When no ambiguity arises, we use the transpose matrix notation $B_{\square \triangle} = B_{\triangle \square}^\T$. For instance, $B_{XZ} = B_{ZX}^\T$, $B_{YC} = B_{CY}^\T$. 
% 
Recall that the transformed input and output in \Cref{eq:transform_x} and \Cref{eq:transform_y} have the form
$$
\psi_\gamma(X) = \psi(X) - E^p_{X} \phi(Z) + \sqrt{\gamma} E^p_{X} \phi(Z),
$$
and 
$$
Y_\gamma = Y - E^p_{Y} \phi(Z) + \sqrt{\gamma} E^p_{Y} \phi(Z).
$$
In the SEM case, the projections $E_X^p$ and $E_Y^p$ onto $\phi(Z)$ are given by the (compositions of) operators in \Cref{eq:sem_x} and \Cref{eq:sem_y}, where 
$$E_X^p = (B_{XZ} +B_{XC}B_{CZ}),$$ 
and 
$$
E_Y^p = [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ]
.$$ As such, the transformed input and output have the form
\begin{equation}\label{eq:transform_x_res}
\psi_\gamma(x) = B_{XC}\epsilon_C + \epsilon_X + \sqrt{\gamma} (B_{XZ} +B_{XC}B_{CZ}) \phi(Z),
\end{equation}
and 
\begin{equation}\label{eq:transform_y_res}
y_\gamma = (B_{YC} +B_{YX}B_{XC})\epsilon_C + B_{YX}\epsilon_X + \epsilon_Y + \sqrt{\gamma} [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ] \phi(Z).  
\end{equation}
Define relevant covariance matrix/operators as $\Sigma_C = \Ex[\epsilon_C \epsilon_C^\T]$, $\Sigma_X = \Ex[\epsilon_X \otimes \epsilon_X]$ and $\Sigma_Z = \Ex[\phi(Z) \otimes \phi(Z)]$, where $\otimes$ denotes the tensor outer product. Then the solution for the least square objective on the transformed input output can be written as
$$H^{\gamma} = \Ex[Y_\gamma \psi_\gamma(X)] (\Ex[\psi_\gamma(X) \otimes \psi_\gamma(X)])^{-1}. $$
Plugging in the transformed terms in the form of \Cref{eq:transform_x_res} and \Cref{eq:transform_y_res}, we have
\begin{align*}
    & \Ex[\psi_\gamma(X) \otimes \psi_\gamma(X)] \\
    &= \Ex[ ( B_{XC}\epsilon_C + \epsilon_X + \sqrt{\gamma} (B_{XZ} +B_{XC}B_{CZ}) \phi(Z)) ( B_{XC}\epsilon_C + \epsilon_X + \sqrt{\gamma} (B_{XZ} +B_{XC}B_{CZ}) \phi(Z))^\T] \\
    &= B_{XC} \Ex[\epsilon_C \epsilon_C^\T] B_{CX} + \Ex[\epsilon_X \otimes \epsilon_X] + \gamma (B_{XZ} +B_{XC}B_{CZ}) \Ex[\phi(Z)\otimes \phi(Z)] (B_{ZX} +B_{ZC}B_{CX})\\
    &= B_{XC}\Sigma_C B_{CX} + \Sigma_X + \gamma (B_{XZ} +B_{XC}B_{CZ}) \Sigma_Z (B_{ZX} +B_{ZC}B_{CX}).
\end{align*}
Moreover, $\Ex[Y_\gamma  \psi_\gamma(X)]$ has the form
\begin{align*}
    &(B_{YC} +B_{YX}B_{XC})\Ex[\epsilon_C \epsilon_C^\T]B_{CX} + B_{YX}\Ex[\epsilon_X \otimes \epsilon_X] + \\
    &\gamma [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ] \Ex[\phi(Z) \otimes \phi(Z)](B_{ZX} + B_{ZC}B_{CX})\\
    =& (B_{YC} +B_{YX}B_{XC})\Sigma_C B_{CX} + B_{YX}\Sigma_X + \\
    &\gamma [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ] \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})
\end{align*}
as $\epsilon_C$, $\epsilon_X$ and $\epsilon_Y$ are independent 
% mean zero 
variables, which are also independent of $Z$. Overall, we have
\begin{align*}
    H^{\gamma} =& [(B_{YC} +B_{YX}B_{XC})\Sigma_C B_{CX} + B_{YX}\Sigma_X \\
    &+\gamma [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ] \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})]\\
    &\left[ B_{XC}\Sigma_C B_{CX} + \Sigma_X + \gamma (B_{XZ} +B_{XC}B_{CZ}) \Sigma_Z (B_{ZX} +B_{ZC}B_{CX}) \right]^{-1}
\end{align*}

% By the definition of target KAR estimator $H^\gamma$ and standard regression formula we have
% \begin{eqnarray*}
% H^\gamma &=& \left[ B_{XC}\Sigma_C B_{CX}^\T + \Sigma_{X} + \gamma (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})^\T \right]^{-1} \\
% && [ B_{CX}\Sigma_{C}(B_{CY} + B_{CX}B_{XY})^\T + \Sigma_X B_{XY}^\T \\
% && + \gamma (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZY} + B_{ZC}B_{CY} + B_{ZX}B_{XY}+ B_{ZC} B_{CX}B_{XY})^\T ].
% \end{eqnarray*}

The bias of the target KAR estimator is then given by
\begin{align*}
    &H^{\gamma} - B_{YX} = \\ 
    &\Big[(B_{YC} +B_{YX}B_{XC})\Sigma_C B_{CX} + B_{YX}\Sigma_X +\gamma [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ] \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})\Big]\\
    &\Big[ B_{XC}\Sigma_C B_{CX} + \Sigma_X + \gamma (B_{XZ} +B_{XC}B_{CZ}) \Sigma_Z (B_{ZX} +B_{ZC}B_{CX}) \Big]^{-1} - B_{YX}\\
    & = \Big[(B_{YC} +B_{YX}B_{XC})\Sigma_C B_{CX} + B_{YX}\Sigma_X +\gamma [ B_{YZ} + B_{YC}B_{CZ} + B_{YX}(B_{XZ} + B_{XC}B_{CZ}) ] \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})\\
    &\qquad - B_{YX}(B_{XC}\Sigma_C B_{CX} + \Sigma_X + \gamma (B_{XZ} +B_{XC}B_{CZ}) \Sigma_Z (B_{ZX} +B_{ZC}B_{CX})) \Big]\\
    &\Big[ B_{XC}\Sigma_C B_{CX} + \Sigma_X + \gamma (B_{XZ} +B_{XC}B_{CZ}) \Sigma_Z (B_{ZX} +B_{ZC}B_{CX}) \Big]^{-1}
\end{align*}
Collecting all the common terms we get 
\begin{align*}
    H^{\gamma} - B_{YX} = & \Big[\underset{\Sigma_{YX}^{\res}}{\underbrace{B_{YC}\Sigma_C B_{CX}}} + \gamma \underset{\Sigma_{YX}^{\para}}{\underbrace{(B_{YZ} + B_{YC}B_{CZ})\Sigma_Z(B_{ZX} + B_{ZC}B_{CX})}}\Big] 
    \\
    &\Big[ B_{XC}\Sigma_C B_{CX} + \Sigma_X + \gamma (B_{XZ} +B_{XC}B_{CZ}) \Sigma_Z (B_{ZX} +B_{ZC}B_{CX}) \Big]^{-1}
\end{align*}

% \begin{eqnarray*}
% H^\gamma - B_{XY} &=& \left[ B_{CX}\Sigma_C B_{CX}^\T + \Sigma_{X} + \gamma (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})^\T \right]^{-1} \\
% && \left[B_{CX}\Sigma_{C}B_{CY} + \gamma (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZY} + B_{ZC}B_{CY})^\T \right].
% \end{eqnarray*}


Thus, for all $x \in \X$ and $y\in \Y$, the inner product $y^\T (H^{\gamma} - B_{YX})\psi(x)$ vanishes when one of the following holds: (i) $B_{YC} = 0$ and $\gamma = 0$, or (ii) $B_{YZ} + B_{YC}B_{CZ} = 0$ and $\gamma = \infty$, or (iii) $B_{YC} = 0$, $B_{YZ} + B_{YC}B_{CZ} = 0$ and $\gamma \geq 0$, or (iv) $\Sigma_{YX}^\para = a \Sigma_{YX}^\res$ for some $a > 0$, and $\gamma = \infty$, or (v) $\Sigma_{YX}^\para = - a \Sigma_{YX}^\res$ for some $a > 0$, and $\gamma = 1/a$. In each of these cases, we conclude $H^\gamma = B_{YX}$.


% We the take the derivative of squared bias $\Vert H^\gamma - B_{XY} \Vert_2^2$ 
% \begin{eqnarray*}
% \frac{\partial \Vert H^\gamma - B_{XY} \Vert_2^2}{\partial \gamma} &=& -2 ( B_{CX}\Sigma_{C}B_{CY} )^\T\\
% && \left[ B_{CX}\Sigma_C B_{CX}^\T + \Sigma_{X} + \gamma (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})^\T \right]^{-1} \\
% && (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})^\T \\
% && \left[ B_{CX}\Sigma_C B_{CX}^\T + \Sigma_{X} + \gamma (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZX} + B_{ZC}B_{CX})^\T \right]^{-2}\\
% && \left[B_{CX}\Sigma_{C}B_{CY} + \gamma (B_{ZX} + B_{ZC}B_{CX}) \Sigma_Z (B_{ZY} + B_{ZC}B_{CY})^\T \right].
% \end{eqnarray*}

\end{proof}

% \section{Additional details on the background}


% \textbf{Distributional robustness}. In a linear setting, the goal of distributionally robust prediction can be expressed as the optimization problem:
% \begin{equation}\label{eq:distr_robust}
%     \widehat{b}=\argmin_{b \in \R^d} \max_{p \in \cP}\Ex_{p}[(Y-X^T b)^2]
% \end{equation}
% where $\cP$ is a class of distributions. Choosing different classes $\cP$ results in estimators with different properties. If $\cP$ contains only the training (empirical) distribution, {observational data},  denoted $\widehat{p}$, then the solution of the optimization problem \eqref{eq:distr_robust} becomes the ordinary least squares estimator, denoted by $\widehat{b}^{OLS}$. If the class $\cP$ contains all interventions on components of X (and not Y), then the optimizer of \eqref{eq:distr_robust} is the vector of causal coefficients $\widehat{b}^{causal}$ (cite Rojas Carulla 2018). However this estimator may perform badly under the training distribution, i.e. the gap between $\Ex_{\widehat{p}}[(Y-X^T b^{causal})^2]-\Ex_{\widehat{p}}[(Y-X^T b^{OLS})^2]$ may be large.


% In contrast to robustness, stability can be formulated as learning a predictive model which can achieve \textit{uniformly} small error (no matter how the data distribution changes). 

% Several works exploit the idea that \textit{causal} correlations, in contrast to varying/spurious correlations, are \textit{stable}, or \textit{invariant} properties even when the distribution (e.g., environment) changes.


% \paragraph{Three-stage least square (3SLS)}


%\renewcommand\thelemma{B\arabic{lemma}}
%\setcounter{lemma}{0}

\subsection{Convergence rate for KAR.2 estimator}
In this section, we will further discuss the convergence rate of \textbf{KAR.2} estimator, and show that
% we cannot derive a same or
the rate does not
improve upon the convergence rate 
% compared to 
of \textbf{KAR} estimator. 

In the three-stage KAR procedure, we approximate $E_X^p$ and $E_Y^p$ by $E_{\alpha_1, X}^{n_1}$ and $E_{\alpha_2, Y}^{n_2}$, respectively. In the two-stage KAR procedure, instead, we approximate the two operators by $E_{\alpha, X}^{n}$ and $E_{\alpha, Y}^{n}$, respectively. Note that the estimated operators $E_{\alpha, X}^{n}$ and $E_{\alpha, Y}^{n}$ use the same $\alpha$. The shared $\alpha$ may fail to ensure the optimal approximation error for $E_{\alpha, X}^{n}$ and $E_{\alpha, Y}^{n}$ at the same time.
%We note that both $E_{\alpha, X}^{n}$ and $E_{\alpha, Y}^{n}$ are based on more data than $E_{\alpha_1, X}^{n_1}$ and $E_{\alpha_2, Y}^{n_2}$ as $n = n_1 + n_2$. However, we 
\begin{lemma}\label{lem:b1}
Under Condition~\ref{cond::s1}, $\forall \delta \in (0,1)$, the following holds w.p. $1 - \delta$:
\begin{eqnarray*}
    \Vert E_{\alpha,X}^{n} - E_{X}^p \Vert_{\H_\Gamma} \leq
    r_1(\alpha):=
    \frac{4\kappa (Q_1 + \kappa \Vert E_{X}^p \Vert_{\H_\Gamma})\ln(2/\delta)}{\sqrt{n}\alpha} + \alpha^{\frac{c_1-1}{2}}\sqrt{\zeta_1}.
\end{eqnarray*}
Under Condition~\ref{cond::s1} and Condition~\ref{cond::s2}, $\forall \epsilon \in (0,1)$, the following holds w.p. $1 - \epsilon$:
\begin{eqnarray*}
    \Vert E_{\alpha,Y}^{n} - E_{Y}^p \Vert_{\H_\Theta} \leq
    r_2(\alpha):=
    \frac{4\kappa (Q_2 + \kappa \Vert E_{Y}^p \Vert_{\H_\Theta})\ln(2/\epsilon)}{\sqrt{n}\alpha} + \alpha^{\frac{c_2-1}{2}}\sqrt{\zeta_2}.
\end{eqnarray*}
Approximation error bound $r_1(\alpha)$ for $E_{\alpha,X}^{n}$ achieves its minimum at rate $O(n^{-\frac{c_1-1}{2(c_1+1)}})$ when 
$$
    \alpha = \left(\frac{8\kappa (Q_1 + \kappa \Vert E_{X}^p \Vert_{\H_\Gamma})\ln(2/\delta)}{\sqrt{n\zeta_1}(c_1-1)}  \right)^{\frac{2}{c_1+1}} = O(n^{\frac{-1}{c_1+1}});
$$
and approximation error bound $r_2(\alpha)$ for $E_{\alpha,Y}^{n}$ achieves its minimum at rate $O(n^{-\frac{c_2-1}{2(c_2+1)}})$ when 
$$
    \alpha = \left(\frac{8\kappa (Q_2 + \kappa \Vert E_{Y}^p \Vert_{\H_\Theta})\ln(2/\epsilon)}{\sqrt{n\zeta_2}(c_2-1)}  \right)^{\frac{2}{c_2+1}} = O(n^{\frac{-1}{c_2+1}}).
$$
\end{lemma}
Lemma~\ref{lem:b1} above provides upper bounds on the approximation errors of $E_{\alpha, X}^{n}$ and $E_{\alpha, Y}^{n}$; it is adapted from Theorem 2 in \cite{singh2019kernel}. We can see that if $c_1 \neq c_2$, we cannot attain the optimal convergence rates for $E_{\alpha, X}^{n}$ and $E_{\alpha, Y}^{n}$ at the same time, whereas the projection estimators based on disjoint sample sets can guarantee this by setting different $\alpha_1$ and $\alpha_2$, as shown in Lemmas~\ref{lem::s1} and~\ref{lem:s2}. In other words, in the \textbf{KAR.2} procedure, the error propagated to the final stage, which is caused by using $E_{\alpha, X}^{n}$ and $E_{\alpha, Y}^{n}$, can be of larger order than the error caused by using $E_{\alpha_1, X}^{n_1}$ and $E_{\alpha_2, Y}^{n_2}$ separately in the \textbf{KAR} procedure. Therefore, we cannot ensure the same or an improved convergence rate for the \textbf{KAR.2} estimator compared to the \textbf{KAR} estimator.
% As such, the optimal rate for the two-stage procedure with joint projection is an interesting future work.





\section{Additional simulation details and results}

\renewcommand\thefigure{B\arabic{figure}}
\setcounter{figure}{0}

\subsection{Synthetic example in KIV setting}\label{app:kiv}
In this section, we show the data generating process and implementation details for the example
%used in the KIV \cite{singh2019kernel} 
that follows the simulation case of learning counterfactual functions \cite{chen2018optimal} studied in \cite{singh2019kernel}. The structural model is set as follows,
$$
    Y = C + \ln(|16X - 8| + 1) \operatorname{sgn}(X - 0.5).
$$
The explanatory variables are generated from
\begin{eqnarray*}
\left( \begin{array}{c}
        C  \\
        V \\
        W \end{array} \right) &\sim&
        N \left( \left( \begin{array}{c}
        0  \\
        0 \\
        0 \end{array} \right), \left( \begin{array}{ccc}
        1 & 0.5 & 0  \\
        0.5 & 1 & 0 \\
        0 & 0 & 1 \end{array} \right)
        \right), \\
    X &=& F \left( \frac{W+V}{\sqrt{2}} \right),\\
    Z &=& F(W),
\end{eqnarray*}
where $F$ denotes the c.d.f.\ of the standard normal distribution. This structural model ensures that the anchor $Z$ is a valid instrumental variable, so that KIV is expected to perform well in this case. We run kernel anchor regression with the three-stage algorithm (KAR) and with the two-stage algorithm (KAR.2), each for multiple values of $\gamma$, as well as kernel instrumental variable regression (KIV).
Set $n_1 = 200$, $n_2 = 200$, $m = 600$, $n=n_1+n_2 =400$.
For KAR and KAR.2, we set $\gamma$ to be 0, 0.5, 1, 2, 5, 10, and 100. We set $\alpha_1 = c_\alpha n_1^{-0.5}$, $\alpha_2 =  c_\alpha n_2^{-0.5}$, $\alpha =  c_\alpha n^{-0.5}$, and $\xi = 1 m^{-0.5}$, where $c_\alpha > 0$ is a constant chosen from $\{ 0.01,0.05,0.1,0.5,0.8,1,2,3\}$ for each estimator separately to minimise the corresponding MSE. We use Gaussian kernel for all kernel methods, where the lengthscales are set according to median heuristic \cite{gretton2012kernel}.
% interpoint distance.

For each algorithm, we then implement 50 simulations and calculate MSE with respect to the true causal model $\Ex (Y| do(x))$, which can be computed from the structural model. As shown in Figure~\ref{fig:kiv}, though KIV performs better than most KAR and KAR.2 estimators, KAR and KAR.2 with $\gamma = 2$ defeat KIV in the KIV setting. 
This, together with the fact that KIV outperforms other non-kernel-based approaches as shown in \cite{singh2019kernel}, indicates that KAR also outperforms DeepIV and SmoothIV in this setting.
The parameters $c_\alpha$s are chosen to be 1, 0.1, 3, 0.8, 3, 3, 3, 1, 0.1, 3, 1, 3, 3, 3 and 2 for KAR with $\gamma$ being 0, 0.5, 1, 2, 5, 10, 100, KAR.2 with same $\gamma$ series and KIV, respectively.

% \paragraph{Varying levels of unobserved confounding} We also include the result where 


% \begin{table}[ht]
% \footnotesize
% \centering
% \begin{tabular}{c|ccccccc}
%   \hline
% level & 0.05 & 0.25 & 0.5 & 0.75 & 1.0 & 1.5 & 5.0\\ 
%   \hline
% KAR & -1.27 (0.0705) & -1.47 (0.106) & -1.65 (0.104) & -1.63 (0.124) & -1.54 (0.152) & -1.14 (0.223) & 0.176 (0.185) \\ 
%   KAR-2 & -1.05 (0.0737) & -1.22 (0.106) & -1.43 (0.142) & -1.56 (0.162) & -1.66 (0.105) & -1.37 (0.218) & 0.0656 (0.2) \\ 
%   KIV & -0.181 (0.0708) & -0.228 (0.086) & -0.266 (0.0936) & -0.305 (0.107) & -0.352 (0.151) & -0.423 (0.128) & -0.722 (0.369) \\ 
%   KPA & -0.2 (0.417) & -0.0231 (0.38) & 0.0186 (0.399) & 0.207 (0.39) & 0.254 (0.329) & 0.406 (0.355) & 1.26 (0.333) \\ 
%   KernelReg & -0.672 (0.0108) & -0.76 (0.0221) & -0.888 (0.0357) & -1.02 (0.0588) & -1.18 (0.0786) & -1.53 (0.128) & -0.273 (0.158) \\ 
%   AR & 0.0876 (0.014) & 0.0673 (0.0144) & 0.0491 (0.017) & 0.029 (0.024) & 0.0081 (0.0224) & -0.0396 (0.0239) & -0.386 (0.114) \\ 
%   IV & -0.944 (0.105) & -0.945 (0.0994) & -0.975 (0.0837) & -0.928 (0.13) & -0.857 (0.206) & -0.766 (0.21) & 0.252 (0.207) \\ 
%   PA & 0.261 (0.0113) & 0.253 (0.0157) & 0.247 (0.0151) & 0.232 (0.0175) & 0.222 (0.0207) & 0.206 (0.0256) & 0.0677 (0.0725) \\ 
%   OLS & 0.278 (0.0111) & 0.271 (0.015) & 0.266 (0.0142) & 0.253 (0.0153) & 0.245 (0.0198) & 0.231 (0.023) & 0.114 (0.0624) \\ 
%    \hline
% \end{tabular}
% \end{table}


\begin{figure}[t!]
    \centering
    \includegraphics[width=0.7\textwidth]{fig/Sim_fit_Combined2.pdf}
    % \vspace{-0.3cm}
    \caption{Variant synthetic example: fitted nonlinear (left) and linear (right) methods.}
        % \subfigure[Prediction error with interventions.]{\includegraphics[width=0.5\textwidth]{fig/KAR2.pdf}\label{fig:PE_synthetic}}
        % \subfigure[MSE results of all estimators in the variant case. ]{\includegraphics[width=0.5\textwidth]{fig/MSE2-2.pdf}\label{fig::C2}}
        % \vspace{-0.3cm}
        \label{fig:variant_fitting}
\end{figure}
\begin{figure}[t!]
    \centering
    % \includegraphics[width=0.48\textwidth]{fig/IVcase.pdf}\label{fig::2}
    % \caption{MSE results of KAR, KAR.2 and KIV estimators in KIV setting.}
        \subfigure
        [MSE results 
        % of KAR, KAR.2 and KIV estimators 
        in the KIV setting]
        {\includegraphics[width=0.48\textwidth]{fig/IVcase.pdf}\label{fig:kiv}}
        \subfigure
        [MSE results of all estimators in the variant case. ]
        {\includegraphics[width=0.5\textwidth]{fig/MSE2-2.pdf}\label{fig:variant2}}
    \caption{Experimental results for additional experiments.
    % \small (left) MSE results of KAR, KAR.2 and KIV estimators in KIV setting; (right) MSE results of all estimators in the variant case. 
    }
    \label{fig::a1}
\end{figure}




\subsection{Additional synthetic data examples}\label{app:variant}
% \paragraph{A less smooth variant}
We also consider a variant case where the structural equation is the same as in \Cref{sec:synthetic} in the main text,
\begin{equation*}
    Y = 0.75C - 0.25Z + \ln(|16X - 8| + 1) \operatorname{sgn}(X - 0.5),
\end{equation*}
and the explanatory variables are generated as
\begin{eqnarray*}
% \left( 
\begin{pmatrix}
        C  \\
        V \\
        W \end{pmatrix} 
        % \right) 
        \sim
        N \left( 
        % \left( 
        \begin{pmatrix}
        0  \\
        0 \\
        0 \end{pmatrix}, 
        % \right), 
        % \left( 
        \begin{pmatrix}
        1 & 0.3 & 0.2  \\
        0.3 & 1 & 0 \\
        0.2 & 0 & 1 \end{pmatrix} 
        % \right)
        \right).\\
\end{eqnarray*}
Instead, $X$ and $Z$ are set via the following transformation.
\begin{eqnarray}\label{eq:gen_var}
    X = F \left( \frac{ \left|W\right|  +V}{\sqrt{2}} \right), \quad
    Z = F(\left|W\right|) - 0.5.
\end{eqnarray}
The fitted result of nonlinear and linear methods is shown in Figure~\ref{fig:variant_fitting}.
The MSE averaged over 50 simulations is shown in 
Figure~\ref{fig:variant2}. From the result, we can see that the proposed kernel anchor regression estimators still perform best among all methods in the variant case.

% \begin{figure}[t!]
%     \centering
%     \includegraphics[width=0.48\textwidth]{fig/Sim_MSE_2.pdf}\label{fig::C2}
%     \caption{MSE results of all estimators in the variant case.}
%         % \subfigure[Prediction error with interventions.]{\includegraphics[width=0.5\textwidth]{fig/KAR2.pdf}\label{fig:PE_synthetic}}
%         % \subfigure[MSE results of all estimators in the variant case. ]{\includegraphics[width=0.5\textwidth]{fig/MSE2-2.pdf}\label{fig::C2}}
% \end{figure}
%\fi


% \paragraph{A linear variant}
Moreover, we consider a case where the structural equation is linear,
$$
Y = 0.75C - 0.25Z + 0.5X 
% - 0.5) + 1
+0.75,
$$
where the data-generating process for $X$, $Z$ and $C$ remains the same as Section 5.1 in the main text, 
\begin{eqnarray*}
% \left( 
\begin{pmatrix}
        C  \\
        V \\
        W \end{pmatrix} 
        % \right) 
        \sim
        N \left( 
        % \left( 
        \begin{pmatrix}
        0  \\
        0 \\
        0 \end{pmatrix}, 
        % \right), 
        % \left( 
        \begin{pmatrix}
        1 & 0.3 & 0.2  \\
        0.3 & 1 & 0 \\
        0.2 & 0 & 1 \end{pmatrix} 
        % \right)
        \right),
\end{eqnarray*}
and 
\begin{equation*}
    X = F \left( \frac{W+V}{\sqrt{2}} \right),\quad
    Z = F(W) - 0.5.
\end{equation*}
\begin{figure}[t!]
    \centering
    \includegraphics[width=0.45\textwidth, height=0.2\textwidth]{fig/rebuttal/P1.pdf}   
    \includegraphics[width=0.45\textwidth, height=0.2\textwidth]{fig/rebuttal/P2.pdf}
        % \subfigure
        % [MSE results 
        % % of KAR, KAR.2 and KIV estimators 
        % in the KIV setting]
        % {\includegraphics[width=0.345\textwidth]{fig/rebuttal/P1.pdf}\label{fig:linear_variant_fit}}
        % \subfigure
        % [MSE results of all estimators in the variant case. ]
        % {\includegraphics[width=0.345\textwidth]{fig/rebuttal/P2.pdf}\label{fig:linear_variant2}}
        % % \subfigure
        % % [MSE results of all estimators in the variant case. ]
        % % {\includegraphics[width=0.3\textwidth]{fig/rebuttal/P3.pdf}\label{fig:linear_variant3}}
    \caption{Linear SEM
    % structural equation model 
    example: fitted nonlinear (left) and linear (right) methods.
    % \small (left) MSE results of KAR, KAR.2 and KIV estimators in KIV setting; (right) MSE results of all estimators in the variant case. 
    }
    \label{fig::linear}
\end{figure}


\begin{figure}[t!]
    \centering
    \begin{minipage}[b!]{.5\linewidth}
    \centering
    \includegraphics[width=.75\textwidth, height=0.36\textwidth]{fig/rebuttal/P3.pdf}
\end{minipage}
\caption{MSE for the linear SEM
% % structural equation model 
example.}\label{fig:MSE_linear}
%     \subfigure[MSE for the linear SEM
% % structural equation model 
% example.]{\includegraphics[width=.4\textwidth,height=0.25\textwidth]{fig/rebuttal/P3.pdf}\label{fig:MSE_linear}
% }
%     \subfigure[Cross-validation for Gaussian kernel bandwidth]{\includegraphics[width=0.4\textwidth,height=0.25\textwidth]{fig/rebuttal/CV.pdf}\label{fig:cv}}
\end{figure}

\begin{wrapfigure}{r}{0.pt}
\begin{minipage}[b!]{.4\linewidth}
\vspace{-.5cm}
\includegraphics[width=1\linewidth]{fig/rebuttal/CV.pdf}
\vspace{-2.cm}
\end{minipage}
\end{wrapfigure}

% \begin{wrapfigure}{l}{0.pt}
% % % \vspace{-0.5cm}
% % \begin{minipage}[b!]{.2\linewidth}
% % \vspace{-0.5cm}
% {\includegraphics[width=.45\linewidth,height=0.28\textwidth]{fig/rebuttal/P3.pdf}
% }
% % \vspace{-.6cm}
% \captionof{Fig.4}{MSE for the linear SEM
% % structural equation model 
% example.}\label{fig:MSE_linear}
% % \end{minipage}
% \end{wrapfigure}

We compare KAR with the linear models 
to show the robustness and usefulness of the non-linear anchor regression.
% in the main text.
By cross-validation, we choose $\gamma = 3$ for KAR estimators. 
% The results are shown in Fig.xxx. 
As shown in \Cref{fig::linear},
KAR and KAR.2 are able to learn the linear relationship well, and both methods achieve the lowest MSE among all methods, outperforming the linear methods, as shown in \Cref{fig:MSE_linear}.
% 


% \begin{figure}[t!]
%     \centering
%     \includegraphics[width=0.4\textwidth]{fig/rebuttal/CV.pdf}
%     \caption{Cross-validation for Gaussian kernel bandwidth}
%     \label{fig:cv}
% \end{figure}


\subsection{Bandwidth Choice for Gaussian Kernel}
We repeat the experiment with different
bandwidths for the Gaussian kernels in the setting of \Cref{sec:simulation}, and plot the cross-validation
error in the figure on the right.
% \Cref{fig:cv}. 
The median bandwidth, averaged over 50 trials, is plotted as a red vertical
line, and the average cross-validation error is plotted as a blue horizontal line. The result
shows that the median heuristic bandwidth choice achieves close-to-optimal cross-validation error, which supports the
good results presented in the main text.

% \begin{figure}[t!]
%     \centering
%     \includegraphics[width=0.4\textwidth]{fig/rebuttal/CV.pdf}
%     \caption{Cross-validation for Gaussian kernel bandwidth}
%     \label{fig:cv}
% \end{figure}

\bibliography{shi_139}

\end{document}
