%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsthm}
\usepackage{bm}
\usepackage{notation}
\usepackage{bbm}
\usepackage{amssymb}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\usepackage{alphalph}
\def\theequation{\AlphAlph{\value{equation}}}

%% Self-defined macros
% Limit / convergence notation.
% The operators are built from \lim, \liminf and \limsup (with \limits so the
% subscript is always set below), and the arrow/relation labels use \mathrm,
% so that everything stays upright even inside italic theorem-like
% environments (where \text{...} would inherit the italic shape).
\newcommand{\ulim}[1]{\lim\limits_{#1 \rightarrow \infty}} % limit as #1 -> infinity
\newcommand{\cons}{\xrightarrow{\mathrm{P}}} % consistency (convergence in probability)
\newcommand{\asconv}{\xrightarrow{\mathrm{a.s.}}} % almost sure convergence
\newcommand{\sampleiid}{\stackrel{\mathrm{iid}}{\sim}} % iid sampling symbol
\newcommand{\liminfm}{\liminf\limits_{m \rightarrow \infty}} % lim inf as m -> infinity
\newcommand{\limsupm}{\limsup\limits_{m \rightarrow \infty}} % lim sup as m -> infinity

%%%%%%%%%%%%%%%%%%%%%%% helper code for overleaf to reference main file %%%%%%%%%%%%%%%%%%%%%%%
\makeatletter
% \addFileDependency{<file>}: announce <file> (name including extension) in the
% log so that latexmk can treat it as a dependency of this document.
% Every code line ends with % so that expanding the macro never inserts a
% spurious space token.
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}% latexmk will find this if $recorder=0
  % however, in that case, it will ignore #1 if it is a .aux or
  % .pdf file etc and it exists! If it doesn't exist, it will appear
  % in the list of dependents regardless)
  %
  % Write the following if you want it to appear in \listfiles
  % --- although not really necessary and latexmk doesn't use this
  %
  \@addtofilelist{#1}%
  %
  % latexmk will find this message if #1 doesn't exist (yet)
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: passes <basename> to \externaldocument with
% the label prefix `main-' (so its labels are referenced as \ref{main-<label>}),
% and hands <basename>.tex and <basename>.aux to \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
\externaldocument[main-]{#1}%
\addFileDependency{#1.tex}%
\addFileDependency{#1.aux}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\myexternaldocument{kladny_306}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example: expands to #3, then the optional #1 (default `-'), then #2

% extra proposition
% work around
\newtheorem*{prospec}{Proposition 4.2}
% we will need an additional lemma
\newtheorem{lemma}{Lemma}[section]

\title{Causal Effect Estimation from Observational and Interventional Data \\ Through Matrix Weighted Linear Estimators\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<klaus-rudolf.kladny@tuebingen.mpg.de>?Subject=Causal Effect Estimation from Observational and Interventional Data
Through Optimal Weight Matrices}{Klaus-Rudolf Kladny}{}}
\author[2,3]{Julius von K\"ugelgen}
\author[1,2]{Bernhard Sch\"olkopf}
\author[2]{Michael Muehlebach}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    ETH Z\"urich\\
    Switzerland
}
\affil[2]{%
    Max Planck Institute for Intelligent Systems\\
    T\"ubingen, Germany
}
\affil[3]{%
    Department of Engineering, University of Cambridge, United Kingdom
}

\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\appendix

\section{Proofs}
\iffalse
\begin{proof}
   We denote the decreasingly sorted singular values of some arbitrary matrix $\Mb$ as $s_1(\Mb), s_2(\Mb), ..., s_p(\Mb)$. Then, we have by Weyl's inequality \cite[p. 171]{horn_johnson_1991} that

   \begin{equation*}
       s_p(\Ab + \Bb) \; \geq \; s_p(\Ab) + s_p(\Bb).
   \end{equation*}

   Then we see by the Cauchy-Schwarz inequality that 

   \begin{equation*}
       || (\Ab + \Bb)^{-1} \Bb || \; \leq \; ||(\Ab + \Bb)^{-1} || \; || \Bb || \; \leq \; \frac{s_1(\Bb)}{s_p(\Ab) \; + \; s_1(\Bb)} \; \leq \; 1.
   \end{equation*}
   $|| \Bb \; (\Ab + \Bb)^{-1}|| \; \leq \; 1$ follows by the same argument.
   
\end{proof}
\fi

\subsection{Proposition \ref{main-prop:MSE_greater_zero}}

\begin{proof}
    We begin by observing that we can write $\WP$ as

    \begin{equation} \label{equ:rewrite_pooling_matrix}
        \WP = \left(m^{-1} \XI^{\top} \XI \; + \; \frac{n}{m} n^{-1} \XO^{\top} \XO \right)^{-1} \left( m^{-1} \XI^{\top} \XI \right).
    \end{equation}
    We apply the strong law of large numbers to obtain that 
    \begin{equation*}
        m^{-1} \XI^{\top} \XI \asconv \COV(\XI) \quad \text{and} \quad n^{-1} \XO^{\top} \XO \asconv \COV(\XO).
    \end{equation*}
    Due to the fact that $\ulim{m} \frac{n(m)}{m}=c$ for some $c>0$, we conclude

    \begin{equation*}
        \WP \; \asconv \; \Wb_{\infty} \; := \; \left( \text{\textbf{Cov}}(\XI) \; + \; c \cdot \text{\textbf{Cov}}(\XO)\right)^{-1} \text{\textbf{Cov}}(\XI).
    \end{equation*}

    We observe that 

    \begin{equation*}
        \left( \Ib - \Wb_{\infty}\right) = \left( \COV(\XI) + c \cdot \COV(\XO) \right)^{-1} c \cdot \COV(\XO).
    \end{equation*}
    Since both covariance matrices are positive definite, so is $\COV(\XI) + c \cdot \COV(\XO)$. We conclude that the smallest singular value of $\Ib - \Wb_{\infty}$ is strictly greater than 0. This means
    \begin{equation*}
        \big| \big| \mathbb{E}[\alphaWeight{\Wb_{\infty}}] - \bm{\alpha} \big| \big|_2^2 \; = \; || \left( \mathbf{I}_p - \Wb_{\infty} \right) \bm{\Delta} ||_2^2 \; \geq \; c' || \bm{\Delta} ||_2^2,
    \end{equation*}

    for some fixed constant $c' > 0$. We obtain therefore
    
    \begin{equation*}
        0 < \ulim{m} \big| \big| \mathbb{E}[\alphaWeight{\Wb_{\infty}}] - \bm{\alpha} \big| \big|_2^2 \leq \ulim{m} \; \text{MSE} \, \left( \alphaWeight{\Wb_{\infty}} \right),
    \end{equation*}
    where we invoked Jensen's inequality. We see that $\Wb_{\infty}$ is constant and bounded. We note that almost sure convergence implies convergence in probability. We can thus apply Lemma~\ref{lem:MSEconvergence}, which yields the desired result
    \begin{equation*}
        0 < \ulim{m} \; \text{MSE} \, \left( \alphaWeight{\Wb_{\infty}} \right) \leq \ulim{m} \; \text{MSE} \, \left( \alphaWeight{\WP} \right).
    \end{equation*}
\end{proof}

\subsection{Proposition 4.2}

\begin{prospec} \label{prop:pool_vanishing_MSE}
Let $\lim_{m\to\infty} \frac{n(m)}{m} = 0$. Then, it holds that
\begin{equation*}
    \lim_{m\to\infty} \MSE \left(\alphaP \right) = 0.
\end{equation*}
\end{prospec}

\begin{proof}
    Similar to the proof of Proposition \ref{main-prop:MSE_greater_zero}, we employ the formulation of \eqref{equ:rewrite_pooling_matrix} and consider the term
    \begin{equation*}
         \frac{n}{m} \cdot n^{-1} \XO^{\top} \XO.
    \end{equation*}
    We see that $\ulim{m}\frac{n(m)}{m} = 0$ and by the strong law of large numbers, $n^{-1} \XO^{\top} \XO \asconv \COV(\XO)$. Hence, we obtain that
    \begin{equation*}
        \frac{n}{m} \cdot n^{-1} \XO^{\top} \XO \asconv \mathbf{0}.
    \end{equation*}
    By the continuous mapping theorem, we conclude that
    \begin{equation*}
         \WP \asconv \mathbf{I}_p,
    \end{equation*}
    and by Lemma \ref{lem:upper_bounded}, this implies that
    \begin{equation*}
         \ulim{m} \; \text{MSE} \left( \alphaWeight{\WP} \right) \; \leq \; \ulim{m} \text{MSE} \left( \alphaI \right) \; = \; 0.
    \end{equation*}
\end{proof}

\subsection{Proposition \ref{main-prop:weight_matrix_cons}}
\iffalse
\begin{proof}
    We fix any constant $\bar{\Wb} \in \mathbb{R}^{p \times p}$ for which it holds that $\left( \Ib_p - \bar{\Wb} \right) \bm{\Delta} = \mathbf{0}$ and $\bar{\Wb} \in \mathcal{C}$. Then, we observe that we can write for any constant $\Wb \in \mathbb{R}^{p \times p}$
    \begin{equation*}
        \MSE_{|\Wb} \left(\alphaWeight{\Wb} \right) \; = \; \text{Tr} \left( \left( \Ib_p - \Wb \right) \COV(\alphaO) \left( \Ib_p - \Wb \right)^{\top} + \left( \Ib_p - \Wb \right) \COV(\alphaI) \left( \Ib_p - \Wb \right)^{\top} + \left( \Ib_p - \Wb \right) \bm{\Delta} \bm{\Delta}^{\top} \left( \Ib_p - \Wb \right)^{\top} \right)
    \end{equation*}
    with empirical variant
    \begin{equation*}
        \widehat{\MSE}_{|\Wb} \left(\alphaWeight{\Wb} \right) \; \coloneqq \; \text{Tr} \left( \left( \Ib_p - \Wb \right) \widehat{\COV}(\alphaO) \left( \Ib_p - \Wb \right)^{\top} + \left( \Ib_p - \Wb \right) \widehat{\COV} (\alphaI) \left( \Ib_p - \Wb \right)^{\top} + \left( \Ib_p - \Wb \right) \Deltaest \Deltaest^{\top} \left( \Ib_p - \Wb \right)^{\top} \right).
    \end{equation*}
    We see by construction that
    \begin{equation*}
        \widehat{\MSE}_{|\Wb} \left( \alphaWeight{\Wopthat} \right) \; \leq \; \widehat{\MSE}_{|\Wb} \left( \alphaWeight{\bar{\Wb}} \right),
    \end{equation*}
    almost surely and
    \begin{equation*}
        \ulim{m} \MSE_{|\Wb} \left( \alphaWeight{\bar{\Wb}} \right) =  \ulim{m} \MSE \left( \alphaWeight{\bar{\Wb}} \right) = 0.
    \end{equation*}
    Now, we observe that
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation*}
        \begin{aligned}
        \MSE \left( \alphaWeight{\Wopthat} \right) &= \mathbb{E} \left[ \MSE_{|\Wb} \left( \alphaWeight{\Wopthat} \right) \right] \\
        &= \mathbb{E}\left[ \MSE_{|\Wb} \left( \alphaWeight{\Wopthat} \right) - \widehat{\MSE}_{|\Wb} \left( \alphaWeight{\Wopthat} \right) + \widehat{\MSE}_{|\Wb} \left( \alphaWeight{\Wopthat} \right) \right] \\
        &\leq \mathbb{E} \left[ \MSE_{|\Wb} \left( \alphaWeight{\Wopthat} \right) - \widehat{\MSE}_{|\Wb} \left( \alphaWeight{\Wopthat} \right) \right] + \mathbb{E} \left[ \widehat{\MSE}_{|\Wb} \left( \alphaWeight{\bar{\Wb}} \right) \right]. 
        \end{aligned}
    \end{equation*}
    \endgroup
    $\Wopthat$ and $\bar{\Wb}$ are bounded in norm, almost surely, for all $m \in \mathbb{N}$. We conclude that 
\end{proof}
\fi

\begin{proof}
    We rewrite $\Wopthat$ as follows:

    \begin{align*}
        \Wopthat = &\left( n^{-1} \left( n^{-1} \XO^{\top}\XO \right)^{-1} \hat{\sigma}^2_{Y|X} + \hat{\bm{\Delta}} \hat{\bm{\Delta}}^{\top} + \epsilon \Ib_p \right) \\ 
        &\left( n^{-1} \left( n^{-1} \XO^{\top}\XO \right)^{-1} \hat{\sigma}^2_{Y|X} + m^{-1} \left( m^{-1} \XI^{\top}\XI \right)^{-1} \hat{\sigma}^2_{Y|\text{do}(X)} + \hat{\bm{\Delta}} \hat{\bm{\Delta}}^{\top} + \epsilon \Ib_p \right)^{-1},
    \end{align*}
    where we insert any almost surely converging estimators for $\bm{\Delta}$, $\sigma_{Y|X}^2$ and $\sigma_{Y|\text{do}(X)}^2$ instead of their ground-truth values. By almost sure convergence of linear estimators individually, we see that this holds specifically for $\hat{\bm{\Delta}} = \alphaO - \alphaI$. Also, we can use the strong law of large numbers to conclude almost sure convergence of $\hat{\sigma}^2_{Y|X}$ and $\hat{\sigma}^2_{Y|\text{do}(X)}$.
    
    We now show $\Wopthat \asconv \Ib_p$: First, we see that
    \begin{equation*}
        (cm)^{-1} \left( n^{-1} \XO^{\top}\XO \right)^{-1} \hat{\sigma}^2_{Y|X} \asconv \mathbf{0} \quad \text{and} \quad m^{-1} \left( m^{-1} \XI^{\top}\XI \right)^{-1} \hat{\sigma}^2_{Y|\text{do}(X)} \asconv \mathbf{0},
    \end{equation*}
    since $\left( m^{-1} \XI^{\top}\XI \right)^{-1} \hat{\sigma}^2_{Y|\text{do}(X)}$ and $\left( n^{-1} \XO^{\top}\XO \right)^{-1} \hat{\sigma}^2_{Y|X}$ converge almost surely to constants and $m^{-1}$ vanishes. Hence,
    \begin{equation*}
        \Wopthat \asconv \left( \bm{\Delta} \bm{\Delta}^{\top} + \epsilon \Ib_p \right) \; \left( \bm{\Delta} \bm{\Delta}^{\top} + \epsilon \Ib_p \right)^{-1} = \Ib_p.
    \end{equation*}
\end{proof}

\subsection{Theorem \ref{main-theo:no_bias_in_the_limit}}

\begin{proof}
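By Proposition~\ref{main-prop:weight_matrix_cons}, we have $\Wopthat \asconv \Ib_p$, and $\Ib_p$ is bounded in norm. So we can apply Lemma~\ref{lem:upper_bounded}, with $\Ib_p$ in the role of $\Wgeneric$, to see that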
    \begin{equation*}
        \ulim{m} \; \text{MSE} \, \big(\alphaWeight{\Wopthat}\big) \; \leq \; \ulim{m} \MSE \big( \alphaI \big) \; = \; 0.
    \end{equation*}
\end{proof}

\iffalse
\begin{proof}
    Here, we show that both
    \begin{equation*}
        \Wopt \cons \mathbf{I}_p \quad \text{and} \quad \Wopthat \cons \mathbf{I}_p,
    \end{equation*}
    which then implies $\left[ \Wopthat - \Wopt \right] \cons \mathbf{0}$. We rewrite $\Wopt$ like
    \begin{align*}
        \Wopt = &\left( \COV(\alphaO) \left( \COV(\alphaI) + \bm{\Delta} \bm{\Delta}^{\top} \right)^{-1} + \Ib_p \right)^{-1} \\
        =&\left( \left( \COV(\alphaI)\COV(\alphaO)^{-1} + \bm{\Delta} \bm{\Delta}^{\top}\COV(\alphaO)^{-1} \right)^{-1} + \Ib_p \right)^{-1}.
    \end{align*}
    We now show that
    \begin{equation} \label{eq:part_that_goes_0}
        \left( \COV(\alphaI)\COV(\alphaO)^{-1} + \bm{\Delta} \bm{\Delta}^{\top}\COV(\alphaO)^{-1} \right)^{-1} \cons \mathbf{0}.
    \end{equation}
    To this end, we rewrite the left-hand side of \eqref{eq:part_that_goes_0} like
    \begin{equation*}
        \left( m^{-1} \left( m^{-1}\XI^{\top}\XI \right)^{-1} \left( n^{-1}\XO^{\top}\XO \right) + \bm{\Delta} \bm{\Delta}^{\top} \left( n^{-1}\XO^{\top}\XO \right) \right)^{-1} n^{-1}.
    \end{equation*}
    and see using $n = c \cdot m$ and \citep[pp. 12 -13]{vaart_1998} that
    \begin{equation*}
        \left( m^{-1} \left( m^{-1}\XI^{\top}\XI \right)^{-1} \left( n^{-1}\XO^{\top}\XO \right) \right) \cons \mathbf{0},
    \end{equation*}
    and
    \begin{equation*}
    \bm{\Delta} \bm{\Delta}^{\top} \left( n^{-1}\XO^{\top}\XO \right) \cons \mathbf{C},
    \end{equation*} (SINGULAR)
    for some constant $\mathbf{C} \neq \mathbf{0}$ and so 

    \begin{equation*}
        \left( m^{-1} \left( m^{-1}\XI^{\top}\XI \right)^{-1} \left( n^{-1}\XO^{\top}\XO \right) + \bm{\Delta} \bm{\Delta}^{\top} \left( n^{-1}\XO^{\top}\XO \right) \right)^{-1} \cons \mathbf{C}^{-1}.
    \end{equation*}
    
    By Prohovo's theorem \citep[p. 8]{vaart_1998}, convergence in probability implies boundedness in probability. Consequently, we can use again \citep[pp. 12 -13]{vaart_1998} to get \eqref{eq:part_that_goes_0}. The result follows.

    We easily see that we can apply the same steps for showing that $\Wopthat \cons \mathbf{I}_p$, where we note that we can use any consistent estimates for $\bm{\Delta}$, $\sigma_{Y|X}^2$ and $\sigma_{Y|\text{do}(X)}^2$.
\end{proof}
\fi

\subsection{Proposition \ref{main-prop:weak_weight_consistency}}
\begin{proof}
By Theorem~\ref{main-theo:no_bias_in_the_limit}, it suffices to show that $\Wweak \asconv \Ib_p$. Since the other quantities $\text{\textbf{Cov}}(\alphaI)$, $\text{\textbf{Cov}}(\alphaO)$ for estimating $\Wopt$ remain unchanged compared to $\Wopthat$, it suffices to show that the modified computation of $\Deltaest$, which we call $\hat{\bm{\Delta}}_m^{\ell^2}$, converges almost surely to the true $\bm{\Delta} = \alphaOtrue - \alphaItrue$, where $\alphaItrue$ and $\alphaOtrue$ are short-hand for $\mathbb{E}_{\text{int}}[Y | \Xb = \xb]$ and $\mathbb{E}_{\text{obs}}[Y | \Xb = \xb]$, respectively. We observe that $\hat{\bm{\Delta}}_m^{\ell^2}$ has a closed-form solution
\begin{align} 
    \hat{\bm{\Delta}}_m^{\ell^2} &= -(\XI^{\top}\XI + \lambda_{\ell^2} \mathbf{I}_p)^{-1} \XI^{\top} (\YI - \XI\alphaO) \\
    &= (\XI^{\top}\XI + \lambda_{\ell^2} \mathbf{I}_p)^{-1} \XI^{\top} \XI \alphaO - (\XI^{\top}\XI + \lambda_{\ell^2} \mathbf{I}_p)^{-1} \XI^{\top}\YI,\label{equ:weak_bias_cons}
\end{align}

since $\alphaO$ is again a closed-form solution to an ordinary least squares problem. Considering the second term in \eqref{equ:weak_bias_cons}, we conclude almost sure convergence to $-\alphaItrue$ (up to the sign, it is simply the ridge regression solution on the interventional data, which is well known to converge almost surely to $\alphaItrue$ for fixed $\lambda_{\ell^2}$). The factors of the first term satisfy
\begin{equation*}
    (\XI^{\top}\XI + \lambda_{\ell^2} \mathbf{I}_p)^{-1} \XI^{\top} \XI \asconv \mathbf{I}_p \quad \text{and} \quad \alphaO \asconv \alphaOtrue.
\end{equation*}
This leads to the desired conclusion.
\end{proof}

\section{Additional Lemmas}

\iffalse
\begin{lemma} \label{lem:bounded_in_limit}
For large enough $m \in \mathbb{N}$, the pooling weight matrix $\WP$ is bounded in spectral norm almost surely, i.e.,
\begin{equation*}
    \text{P} \left( \ulim{m} || \WP ||_2 < c \right) = 1,
\end{equation*}
for some $c > 0$. 
\end{lemma}
\begin{proof}
    We see that by the formulation of \eqref{equ:rewrite_pooling_matrix} and Kolmogorov's strong law of large numbers that
    \begin{equation*}
        \WP \asconv \left(  \right)
    \end{equation*}
\end{proof}
\fi
\begin{lemma} \label{lem:MSEconvergence}
    Let $\West - \Wgeneric \cons \mathbf{0}$ \footnote{We note that $\Wgeneric$ may be random.} and let there exist $c > 0$, $m' \in \mathbb{N}$, such that $||\Wgeneric||_2 \leq c, \; \text{for all} \; m \geq m'$, almost surely. Then, it holds that
    \begin{equation*}
    \ulim{m} \text{MSE} \, \big( \alphaWeight{\Wgeneric} \big) \; \leq \; \ulim{m} \; \text{MSE} \, \big(\alphaWeight{\West}\big),
    \end{equation*}
    where $\cons$ denotes convergence in probability.
\end{lemma}

\begin{proof}
    We derive a lower bound on $\text{MSE} \, \big(\alphaWeight{\West}\big)$ by using the formulation
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation} \label{eq:upper_lower_bound}
    \begin{aligned}
        \text{MSE} \, \big(\alphaWeight{\West}\big) \; = \; &\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} \; ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \; + \\ 
        &\mathbb{E} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 > \epsilon \right\} \;  ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right], \quad \forall \epsilon > 0.
    \end{aligned}
    \end{equation}
    \endgroup
    We bound the second summand of \eqref{eq:upper_lower_bound} from below by zero. For the first summand, we use the reverse triangle inequality together with the Cauchy--Schwarz inequality, which yields
   \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation}\label{eq:first_summand_upper_bound}
        \begin{aligned}
        & &&\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \; = \; \mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric} - (\bm{\alpha} - \alphaWeight{\Wgeneric} )||_2^2 \right] \\
        &\geq \; &&\mathbb{E}\left[\mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\}||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2\right] -  2\sqrt{\mathbb{E}\left[\mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric}||_2^2 \right] \; \mathbb{E}\left[||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2\right]}\; + \\
        & &&\mathbb{E}\left[\mathbbm{1}\left\{|| \West - \Wgeneric ||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric}||_2^2\right] \\
        &\geq \; &&\text{MSE} (\alphaWeight{\Wgeneric}) - \mathbb{E}\left[\mathbbm{1}\left\{||\West - \Wgeneric||_2 > \epsilon \right\} ||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2\right] - \\ 
        & &&2\sqrt{\mathbb{E}\left[\mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric}||_2^2 \right] \; \mathbb{E}\left[||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2\right]}.
        \end{aligned}
    \end{equation}
    \endgroup
    For any constant matrices $\mathbf{W}, \Wb' \in \mathbb{R}^{p \times p}$, we rewrite
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation*}
        \begin{aligned}
            \mathbb{E}\left[ ||\alphaWeight{\Wb'} - \alphaWeight{\Wb}||_2^2 \right] \; &= \; &&\mathbb{E}\left[||(\Wb' - \mathbf{W})\alphaI \; + \; (\mathbf{W} - \Wb')\alphaO||_2^2 \right] \\
            &\leq &&2 \left( ||\mathbf{W} - \Wb'||_2^2 \text{Tr}\left( \mathbb{E}\left[\alphaI \widehat{\bm{\alpha}}_{\textsc{i}}^{m \, \top}\right] \right)  \; + \; ||\mathbf{W} - \Wb'||_2^2 \text{Tr}\left( \mathbb{E}\left[\alphaO \widehat{\bm{\alpha}}_{\textsc{o}}^{n \, \top}\right] \right) \right) \\
            &= &&2 ||\mathbf{W} - \Wb'||_2^2 \Bigg[ \left( ||\mathbb{E}\left[\alphaI\right]||_2^2 \; + \; \text{Tr} \left( \text{\textbf{Cov}} \left(\alphaI \right) \right) \right) \; + \; \left( ||\mathbb{E}\left[\alphaO\right]||_2^2 \; + \; \text{Tr} \left( \text{\textbf{Cov}} \left(\alphaO \right) \right) \right) \Bigg],
        \end{aligned}
    \end{equation*}
    \endgroup
    where we have used Young's inequality in the second step. We see that both $||\mathbb{E}\left[\alphaI\right]||_2^2$ and $||\mathbb{E}\left[\alphaO\right]||_2^2$ remain bounded for all $m$, while $\text{Tr} \left( \text{\textbf{Cov}} \left(\alphaO \right) \right)$ and $\text{Tr} \left( \text{\textbf{Cov}} \left(\alphaI \right) \right)$ decrease monotonically in $m$. Hence, we conclude that for any $\epsilon' > 0$, there exists an $\epsilon > 0$ such that
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation} \label{equ:continuity}
    \begin{aligned}
        &\mathbb{E}\left[\big|\big|\alphaWeight{\Wb'} - \alphaWeight{\Wb}\big|\big|_2^2\right] \leq \epsilon', \; \forall m \in \mathbb{N} \; \text{and} \; \forall \mathbf{W}, \Wb' \in \mathbb{R}^{p \times p} \; \text{s.t.} \; ||\mathbf{W} - \Wb'||_2 \leq \epsilon.
    \end{aligned}
    \end{equation}
    \endgroup
    \iffalse
    $\West - \Wgeneric \cons \mathbf{0}$ implies that there exists a sub-sequence $\left\{ \widehat{\mathbf{W}}^{m_k} - \mathbf{W}^{m_k}, k \geq 0 \right\}$ that converges to $\mathbf{0}$, almost surely (e.g., \citep[p.213]{gut2005probability}). Since $||\mathbf{W}^{m}||_2$ is bounded, we hence obtain boundedness of $||\widehat{\mathbf{W}}^{m_k} - \mathbf{W}^{m_k}||_2$ for large enough $m_k$. Consequently, for large enough $m_k$, we can bound 
    We fix an $\epsilon' > 0$ and choose $\epsilon > 0$ according to \eqref{equ:continuity}. As a result, we obtain
    \begin{equation*}
        \mathbb{E}\left[ \mathbbm{1} \left\{ || \West - \Wgeneric ||_2 \leq \epsilon \right\} || \alphaWeight{\West} - \bm{\alpha} ||_2^2 \right] \geq \MSE(\alphaWeight{\Wgeneric}) - \text{P} \left( || \West - \Wgeneric ||_2 > \epsilon \right) c - 2 \sqrt{\epsilon' c}.
    \end{equation*}
    \fi
    Since $|| \Wgeneric ||_2 \leq c$ for all $m \geq m'$, we have that $|| \alphaWeight{\Wgeneric} - \bm{\alpha} ||_2^2$ is also bounded by some constant $c' > 0$, for all $m \geq m'$, almost surely. We now fix an $\epsilon' > 0$ and choose a corresponding $\epsilon$ such that \eqref{equ:continuity} holds. We then conclude from \eqref{eq:first_summand_upper_bound} that

    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation*}
    \begin{aligned}
        \MSE \left( \alphaWeight{\West} \right) \geq \quad & \mathbb{E} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \\
        \geq \quad & \MSE \left( \alphaWeight{\Wgeneric} \right) - 2 \sqrt{\epsilon' \; \mathbb{E}\left[ ||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2 \right]} - \text{P} \left( || \West - \Wgeneric ||_2 > \epsilon \right) c' \\
        \geq \quad & \MSE \left( \alphaWeight{\Wgeneric} \right) - 2\sqrt{\epsilon' c'} - \text{P} \left( || \West - \Wgeneric ||_2 > \epsilon \right) c',
    \end{aligned}
    \end{equation*}
    \endgroup

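    for all $m \geq m'$. Since $\West - \Wgeneric \cons \mathbf{0}$, the probability $\text{P} \left( || \West - \Wgeneric ||_2 > \epsilon \right)$ vanishes as $m \rightarrow \infty$. Thus, we conclude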

    \begin{equation*}
        \ulim{m} \MSE \left( \alphaWeight{\West} \right) \; \geq \; \ulim{m} \; \MSE \left( \alphaWeight{\Wgeneric} \right) - 2\sqrt{\epsilon' c'}.
    \end{equation*}
    
    We can repeat this procedure for any $\epsilon' > 0$ and therefore conclude
    
    \begin{equation*}
        \ulim{m} \MSE \left( \alphaWeight{\West} \right) \; \geq \; \ulim{m} \; \MSE \left( \alphaWeight{\Wgeneric} \right),
    \end{equation*}
    which is the desired result.
\end{proof}

\begin{lemma} \label{lem:upper_bounded}
Let $\West - \Wgeneric \asconv \mathbf{0}$ and let there exist some $c > 0$, $m' \in \mathbb{N}$, such that $||\Wgeneric||_2 \leq c, \forall m \geq m'$, almost surely. Then, it holds that
    \begin{equation*}
    \ulim{m} \; \MSE \, \big(\alphaWeight{\West}\big) \; \leq \; \ulim{m} \MSE \big(\alphaWeight{\Wgeneric} \big).
    \end{equation*}

\end{lemma}
\begin{proof}
We again employ the formulation from \eqref{eq:upper_lower_bound}, but this time to construct an upper bound. For the first term of \eqref{eq:upper_lower_bound}, we see that
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation} \label{eq:upper_term_2}
        \begin{aligned}
        & &&\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} \; ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \; = \; \mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} \; ||\alphaWeight{\West} - \alphaWeight{\Wgeneric} + \alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2 \right] \\
        &\leq \; &&\text{MSE} \, \big( \alphaWeight{\Wgeneric} \big) \; + \; 2\sqrt{\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric}||_2^2 \right] \; \mathbb{E}[||\alphaWeight{\Wgeneric} - \bm{\alpha} ||_2^2]}\; + \\ 
        & &&\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric}||_2^2 \right],
        \end{aligned}
    \end{equation}
    \endgroup
    by the triangle inequality and the Cauchy--Schwarz inequality. Since for $m \geq m'$ it holds that $||\Wgeneric||_2 \leq c$, almost surely, there exists a constant $c' > 0$ such that $\mathbb{E}\left[||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2\right] \; \leq \; c'$, for all $m \geq m'$. This is true because the two estimators $\alphaI$ and $\alphaO$ both have bounded mean squared error for any sample size $m$.

    Analogously to the proof for Lemma \ref{lem:MSEconvergence}, we now fix an $\epsilon' > 0$ and choose a corresponding $\epsilon$ such that \eqref{equ:continuity} holds. For $m \geq m'$, we then conclude from \eqref{eq:upper_term_2} that 
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation}
    \begin{aligned} \label{eq:first_term}
        &\mathbb{E} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \\
        \leq \quad & \MSE \left( \alphaWeight{\Wgeneric} \right) + 2 \sqrt{\epsilon' \; \mathbb{E}\left[ ||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2 \right]} + \epsilon' \\
        \leq \quad & \MSE \left( \alphaWeight{\Wgeneric} \right) + 2\sqrt{\epsilon' c'} + \epsilon'.
    \end{aligned}
    \end{equation}
    \endgroup
    This bounds the first term of \eqref{eq:upper_lower_bound}. For the second term of \eqref{eq:upper_lower_bound}, we use almost sure convergence of $\West - \Wgeneric$. Since $\Wgeneric$ is bounded in the limit, almost surely, so is $\West$. Formally, $|| \West ||_2 \leq c''$ for all $m \geq m''$, for some $c'' > 0$ and some $m'' \in \mathbb{N}$ with $m'' \geq m'$, almost surely.

    We use this to bound $||\alphaWeight{\West} - \bm{\alpha}||_2^2 < c'''$ for all $m \geq m''$, almost surely, for some $c''' > 0$. Now, we apply iterated expectations to the second term of \eqref{eq:upper_lower_bound} to see that for all $m \geq m''$
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation} \label{eq:second_term}
    \begin{aligned}
        \mathbb{E} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 > \epsilon \right\} ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \; 
        = \quad &\mathbb{E}_{\West} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 > \epsilon \right\} \; \mathbb{E}_{\alphaWeight{\West}|\West}\left[||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \right] \\ 
        \leq \quad &\text{P} \left( ||\West - \Wgeneric||_2 > \epsilon \right) c''',
    \end{aligned}
    \end{equation}
    \endgroup
    almost surely. Now, we can combine the inequalities \eqref{eq:first_term} and \eqref{eq:second_term} to obtain
    \begin{equation*}
    \text{MSE} \, \left( \alphaWeight{\West} \right) \; \leq \; \text{MSE} \, \left( \alphaWeight{\Wgeneric} \right) \; + \; 2 \sqrt{\epsilon' c'} \; + \; \epsilon' \; + \; \text{P} \left( || \West - \Wgeneric ||_2 > \epsilon \right) c''',
    \end{equation*}
    for all $m \geq m''$. Almost sure convergence implies consistency of $\West - \Wgeneric$ with respect to $\mathbf{0}$, so we see that $\text{P}\left( || \West - \Wgeneric ||_2 > \epsilon \right)$ vanishes in the limit $m \rightarrow \infty$, for all $\epsilon > 0$. We can repeat this procedure for any $\epsilon' > 0$. This implies the desired result.
\end{proof}

\iffalse
\begin{lemma} \label{lem:MSEconvergence}
    For $\West - \Wgeneric \cons 0$ where there exist some $c > 0$, $m' \in \mathbb{N}$, s.t. $||\Wgeneric||_2 \leq c, \forall m \geq m'$, almost surely, it holds that
    \begin{equation*}
    \ulim{m} \left( \text{MSE} \, \big(\alphaWeight{\West}\big) - \text{MSE} \, \big(\alphaWeight{\Wgeneric} \big) \right) \; = \; 0.
    \end{equation*}
\end{lemma}

\begin{proof}
    We start by deriving upper and lower bounds on 
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation} \label{equ:MSE}
    \begin{aligned}
        \text{MSE} \, \big(\alphaWeight{\West}\big) \; = \; &\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} \; ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \; + \\ 
        &\mathbb{E} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 > \epsilon \right\} \;  ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right], \quad \forall \epsilon > 0.
    \end{aligned}
    \end{equation}
    \endgroup
    We begin by deriving the upper bound. For the left term of \eqref{equ:MSE}, we see that
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation} \label{equ:upper_term}
        \begin{aligned}
        & &&\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} \; ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \; = \; \mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} \; ||\alphaWeight{\West} - \alphaWeight{\Wgeneric} + \alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2 \right] \\
        &\leq \; &&\text{MSE} \, \big( \alphaWeight{\Wgeneric} \big) \; + \; 2\sqrt{\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric}||_2^2 \right] \; \mathbb{E}[||\alphaWeight{\Wgeneric} - \bm{\alpha} ||_2^2]}\; + \\ 
        & &&\mathbb{E}\left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \alphaWeight{\Wgeneric}||_2^2 \right],
        \end{aligned}
    \end{equation}
    \endgroup
    by the triangle inequality and the Cauchy--Schwarz inequality. We note that $||\Wgeneric||_2 \leq 1$, almost surely, and further note that there exists a constant $c > 0$ such that $\mathbb{E}\left[||\alphaWeight{\Wb} - \bm{\alpha}||_2^2\right] \; \leq \; c$, for all $\mathbf{W} \in \mathbb{R}^{p \times p}$ with $||\mathbf{W}||_2 \leq 1$ and all $m \in \mathbb{N}$. This is true because the two estimators $\alphaI$ and $\alphaO$ both have bounded mean squared error for any sample size $m$. For any constant $\mathbf{W}, \Wb' \in \mathbb{R}^{p \times p}$, we rewrite
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation*}
        \begin{aligned}
            \mathbb{E}\left[ ||\alphaWeight{\Wb'} - \alphaWeight{\Wb}||_2^2 \right] \; &= \; &&\mathbb{E}\left[||(\Wb' - \mathbf{W})\alphaI \; + \; (\mathbf{W} - \Wb')\alphaO||_2^2 \right] \\
            &\leq &&2 \left( ||\mathbf{W} - \Wb'||_2^2 \text{Tr}\left( \mathbb{E}\left[\alphaI \widehat{\bm{\alpha}}_{\textsc{i}}^{m \, \top}\right] \right)  \; + \; ||\mathbf{W} - \Wb'||_2^2 \text{Tr}\left( \mathbb{E}\left[\alphaO \widehat{\bm{\alpha}}_{\textsc{o}}^{n \, \top}\right] \right) \right) \\
            &= &&2 ||\mathbf{W} - \Wb'||_2^2 \Bigg[ \left( ||\mathbb{E}\left[\alphaI\right]||_2^2 \; + \; \text{Tr} \left( \text{\textbf{Cov}} \left(\alphaI \right) \right) \right) \; + \; \left( ||\mathbb{E}\left[\alphaO\right]||_2^2 \; + \; \text{Tr} \left( \text{\textbf{Cov}} \left(\alphaO \right) \right) \right) \Bigg],
        \end{aligned}
    \end{equation*}
    \endgroup
    where we have used Young's inequality in the first step. We see that both $||\mathbb{E}\left[\alphaI\right]||_2^2$ and $||\mathbb{E}\left[\alphaO\right]||_2^2$ remain bounded $\forall m$, while $\text{Tr} \left( \text{\textbf{Cov}} \left(\alphaO \right) \right)$ and $\text{Tr} \left( \text{\textbf{Cov}} \left(\alphaI \right) \right)$ decrease monotonically in $m$. Hence, we conclude that for any $\epsilon' > 0$, there exists an $\epsilon > 0$ such that
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation} \label{equ:continuity}
    \begin{aligned}
        &\mathbb{E}\left[\big|\big|\alphaWeight{\Wb'} - \alphaWeight{\Wb}\big|\big|_2^2\right] \leq \epsilon', \; \forall m \in \mathbb{N} \; \text{and} \; \forall \mathbf{W}, \Wb' \in \mathbb{R}^{p \times p} \; \text{s.t.} \; ||\mathbf{W} - \Wb'||_2 \leq \epsilon.
    \end{aligned}
    \end{equation}
    \endgroup
    We now fix an $\epsilon' > 0$ and choose a corresponding $\epsilon$ such that \eqref{equ:continuity} holds. For $m \geq m'$, we then conclude from \eqref{equ:upper_term} that 
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation}
    \begin{aligned}
        &\mathbb{E} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 \leq \epsilon \right\} ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \\
        \leq \quad & \MSE \left( \alphaWeight{\Wgeneric} \right) + 2 \sqrt{\epsilon' \; \mathbb{E}\left[ ||\alphaWeight{\Wgeneric} - \bm{\alpha}||_2^2 \right]} + \epsilon' \\
        \leq \quad & \MSE \left( \alphaWeight{\Wgeneric} \right) + 2\sqrt{\epsilon' c} + \epsilon'.
    \end{aligned}
    \end{equation}
    \endgroup
    This bounds the first term of \eqref{equ:MSE}. For the second term of \eqref{equ:MSE}, it follows by iterated expectations that
    \begingroup
    \addtolength{\jot}{.5em}
    \begin{equation*}
    \begin{aligned}
        \mathbb{E} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 > \epsilon \right\} ||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \; 
        = \quad &\mathbb{E}_{\West} \left[ \mathbbm{1}\left\{||\West - \Wgeneric||_2 > \epsilon \right\} \; \mathbb{E}_{\alphaWeight{\West}|\West}\left[||\alphaWeight{\West} - \bm{\alpha}||_2^2 \right] \right] \\ 
        \leq \quad &\text{P} \left( ||\West - \Wgeneric||_2 > \epsilon \right) c,
    \end{aligned}
    \end{equation*}
    \endgroup
    where we use that $|| \West ||_2 \leq 1$, almost surely, $\forall m \in \mathbb{N}$. Combining the previous inequalities yields
    \begin{equation*}
        \text{MSE} \, \left( \alphaWeight{\West} \right) \; \leq \; \text{MSE} \, \left( \alphaWeight{\Wgeneric} \right) \; + \; 2 \sqrt{\epsilon' c} \; + \; \epsilon' \; + \; \text{P} \left( || \West - \Wgeneric ||_2 > \epsilon \right) c,
    \end{equation*}
    where by consistency of $\West$, $\text{P} \left( || \West - \Wgeneric ||_2 > \epsilon \right)$ vanishes in the sample limit. We can repeat this procedure for any $\epsilon' > 0$ and therefore conclude

    \begin{equation*}
        \ulim{m} \MSE \left( \alphaWeight{\West} \right) \leq \ulim{m} \MSE \left( \alphaWeight{\Wgeneric} \right).
    \end{equation*}
    
    For the lower bound, we bound $\text{MSE}\left(\alphaWeight{\West}\right)$ using \eqref{equ:MSE} from below 
\end{proof}

\begin{lemma} \label{lem:MSEconvergence}
    For $\West - \Wgeneric \cons 0$ with $||\West||$ bounded for large enough $m$, almost surely\footnote{We note that $\Wgeneric$ may be random.}, it holds that
    \begin{equation*}
    \ulim{m} \left( \text{MSE} \, \big(\alphaWeight{\West}\big) - \text{MSE} \, \big(\alphaWeight{\Wgeneric} \big) \right) \; = \; 0.
    \end{equation*}
\end{lemma}
\fi
\iffalse
\subsection{Proposition \ref{main-prop:soft_param_share}}

\begin{proof}
    We see directly that $\mathcal{L}^{\Wopt}(\Ab)$ is convex in $\Ab$, because it is a sum of convex functions. This means we can solve for its minimizer by setting its gradient to $\mathbf{0}$. Furthermore, we see that the solution for each $a^{(k)}$ is decoupled from other feature dimensions:

    \begin{equation*}
        \begin{aligned}
        \left( \nabla_{\Ab} \frac{1}{m} \big| \big| \YI - \XI \Ab \big| \big|_2^2 \right)^{(k)} \; \propto \; \left( \frac{1}{m} \XI^{\top} \YI \; + \text{diag}\left(\tilde{\sigma}^{(l) \; 2}\right) \; \Ab \right)^{(k)} \; = \; \frac{1}{m} \XI^{(k) \; \top} \YI \; + \; \tilde{\sigma}^{(k) \; 2} \; a^{(k)}.
        \end{aligned}
    \end{equation*} 
     This means we can solve for the minimizer component-wise. We then get as the closed-form solution
    \begingroup 
    \addtolength{\jot}{.5em}
    \begin{equation*}
        \begin{aligned}
            &\frac{\partial}{\partial \, a^{(k)}} \; \mathcal{L}^{\Wopt}(\Ab) \; \overset{!}{=} \; 0\\
            \implies \quad & a^{* \; (k)} \; = \; w^{* \; (k)} \hat{\alpha}^{I \; (k)} \; + \;  \left( 1 - w^{* \; (k)} \right) \hat{\alpha}^{O \; (k)},
        \end{aligned}
    \end{equation*} 
    \endgroup

    by noting that

    \begin{equation*}
        \begin{aligned}
            \hat{\alpha}^{I \; (k)} \; = \; \left(\XI^{(k) \; \top} \XI^{(k)}\right)^{-1} \XI^{(k) \; \top} \YI \; = \; \left( m \tilde{\sigma}^{(k) \; 2}\right)^{-1} \XI^{(k) \; \top} \YI.
        \end{aligned}
    \end{equation*} 

    Consequently, we get indeed $\Ab^{*} \; = \; \hat{\bm{\alpha}}^{\Wopt}$.
\end{proof}
\fi


\section{Detailed Derivation of Optimal Weighting Schemes}

In general, we observe that

\begingroup
    \addtolength{\jot}{.5em}
    \begin{equation*}
        \begin{aligned}
    \Bias(\alphaWeight{\Wb}) &= \Wb \alphab + (\Ib - \Wb)(\alphab + \bm{\Delta}) - \alphab = (\Ib - \Wb) \bm{\Delta}, \\
    \COV(\alphaWeight{\Wb}) &= \Wb \COV(\alphaI) \Wb^{\top} + (\Ib - \Wb) \COV(\alphaO) (\Ib - \Wb)^{\top}.
    \end{aligned}
\end{equation*}
\endgroup

\subsection{Optimal Scalar Weight}

Here, we have
\begin{align*}
&\frac{\partial}{\partial w} \MSE\left(\alphaWeight{w\Ib_p}\right) \\
= \quad &\frac{\partial}{\partial w} \Big| \Big| \Bias\left(\alphaWeight{w\Ib_p}\right) \Big| \Big|_2^2 \; + \; \frac{\partial}{\partial w}\text{Tr} \left( \COV \left(\alphaWeight{w\Ib_p}\right) \right) \\
= \quad & -2(1 - w) ||\bm{\Delta}||_2^2 + 2w \text{Tr}\left( \COV(\alphaI) \right) - 2(1 - w) \text{Tr}\left( \COV(\alphaO) \right) \overset{!}{=} 0.
\end{align*}

By rearranging, we get

\begin{equation*}
\wopt = \frac{\text{Tr}(\mathbf{Cov}(\alphaO)) + \norm{\bm{\Delta}}_2^2}{\text{Tr}(\mathbf{Cov}(\alphaI)) + \text{Tr}(\mathbf{Cov}(\alphaO)) + \norm{\bm{\Delta}}_2^2}.
\end{equation*}

\subsection{Optimal Diagonal Weight Matrix}

Here, we see that the objective decouples into a sum over the individual dimensions

\begin{equation*}
    \MSE\left(\alphaWeight{\text{diag}(w^{(1)}, \dots, w^{(p)})}\right) = \sum_{k = 1}^p \left[ \left( 1 - w^{(k)} \right)^2 \bm{\Delta}^{(k) \, 2} \; + \; w^{(k) \, 2} \mathbf{Cov}^{(k, k)}(\alphaI) + \left( 1 - w^{(k)} \right)^2 \mathbf{Cov}^{(k, k)}(\alphaO) \right].
\end{equation*}

Thus, we optimize for each dimension $k$ separately and obtain

\begin{equation*}
    w_*^{m (k)} = \frac{\mathrm{Cov}^{(k, k)}(\alphaO) + \Delta^{(k) \, 2}}{\mathrm{Cov}^{(k, k)}(\alphaI) + \mathrm{Cov}^{(k, k)}(\alphaO) + \Delta^{(k) \, 2}}.
\end{equation*}

\subsection{Optimal Weight Matrix}

Using $\frac{\partial}{\partial \Wb} \text{Tr}(\Wb \Ab \Wb^{\top}) = 2 \Wb \Ab$, since $\Ab$ is symmetric, we observe that
\begin{align*}
    &\frac{\partial}{\partial \Wb} \MSE\left(\alphaWeight{\Wb}\right) \\
    = \quad &2\Wb \left( \COV(\alphaI) + \COV(\alphaO) + \bm{\Delta} \bm{\Delta}^{\top} \right) - 2 \left( \bm{\Delta} \bm{\Delta}^{\top} + \COV(\alphaO) \right) \\
    \overset{!}{=} \quad &\mathbf{0}.
\end{align*}
We see that the minimum is attained for
\begin{equation*}
\Wopt = \left( \COV(\alphaO) + \bm{\Delta} \bm{\Delta}^{\top} \right) \left( \COV(\alphaI) + \COV(\alphaO) + \bm{\Delta} \bm{\Delta}^{\top} \right)^{-1}.
\end{equation*}
%\section{Additional Experiments}
%We conduct further experiments using the general parameter configuration as specified in 
%\textbf{Varying Amount of Parameters.} Here, we 

%Our methods make no assumptions about dependencies of the entries of $\Xb$. 
\section{Non-Zero-Mean Exogenous Variables}

All results established here can readily be extended to settings where any of the exogenous variables have non-zero mean, i.e., $\bm{\mu}_{\Nb_\Xb}$, $\bm{\mu}_{\tilde{\Nb}_\Xb} \coloneqq \mathbb{E}[\tilde{\Nb}_\Xb]$, $\bm{\mu}_{\Nb_\Zb}$, $\mu_{N_Y}$ (see~\eqref{main-equ:multivarSCMconfounder}--\eqref{main-eq:outcome}) may be non-zero. In order to extend the practical estimators introduced here, one needs to consider the following two pre-processing steps:

First, we center both treatment distributions separately, without scaling:

\begin{align}
    &\xb'_i \; \leftarrow \; \xb_i - n^{-1}\sum_{j = 1}^{n} \xb_j, \quad &&\forall i \in \{1, \dots, n\} ,\label{eq:mean_center} \\
    &\xb'_i \; \leftarrow \; \xb_i - m^{-1}\sum_{j = n+1}^{n+m} \xb_j, \quad &&\forall i \in \{n+1, \dots, n+m\}.
\end{align}
In this manner, both treatment variables become zero-mean. 

Furthermore, we add a dummy dimension with value one to all treatment vectors:

\begin{equation*}
    \xb''_i \; \leftarrow \; (\xb'_i, \; 1), \quad \forall i \in \{1, \dots, n+m\}.
\end{equation*}

This naturally adds one more dimension also to $\bm{\alpha}$, which corresponds to the intercept term. We then use the constructed $\xb''_i$ to compute the weight matrices proposed in this work.

Finally, we see that the intercept term must be identical for both distributions, interventional and observational:

\begin{equation*} 
    \mathbb{E}[Y \; | \; \Xb' = \xb'] \; = \; \gammab^{\top} \EE[\Zb \; | \; \Xb'= \xb'] + \alphab^\top \xb' + \mu_{N_Y}.
\end{equation*}

We then have in the observational setting (data points $1, \dots, n$) that

\begin{align*}
    \bm{\gamma}^{\top} \EE[\Zb \; | \; \Xb'= \xb'] &= \bm{\gamma}^\top \bm{\mu}_{\Nb_\Zb} + \bm{\gamma}^\top \Sigmab_{\Nb_\mathbf{Z}} \mathbf{B}^{\top} (\Sigmab_{\mathbf{N}_{\Xb}} + \mathbf{B} \Sigmab_{\Nb_\mathbf{Z}} \mathbf{B}^{\top})^{-1} (\xb' - \mathbb{E}[\Xb']) \\
     &= \bm{\gamma}^{\top} \bm{\mu}_{\Nb_\Zb} + \bm{\Delta}^{\top} \xb',
\end{align*}

where $\mathbb{E}[\Xb'] = \mathbf{0}$ due to~\eqref{eq:mean_center}.

For the interventional data, we have independence between $\Xb'$ and $\Zb$ by definition and so we trivially get

\begin{equation*}
    \bm{\gamma}^{\top} \EE[\Zb \; | \; \Xb'= \xb'] = \bm{\gamma}^{\top} \bm{\mu}_{\Nb_\Zb}.
\end{equation*}

Thus, the intercept is $\bm{\gamma}^{\top} \bm{\mu}_{\Nb_\Zb} + \mu_{N_Y}$ for both distributions and we fix $\hat{\Delta}^{(p+1)} = 0$.

\section{Sample Imbalance}

We see that the ground truth covariance matrices of $\alphaI$ and $\alphaO$ adapt to changes in the sample sizes, keeping the distributions of all variables fixed. For instance, we see that

\begin{equation*}
    \COV(\alphaI) = (\XI^{\top} \XI)^{-1} \sigma_{Y|\text{do}(X)}^2 = m^{-1} (m^{-1} \XI^{\top} \XI)^{-1} \sigma_{Y|\text{do}(X)}^2.
\end{equation*}

The term $(m^{-1} \XI^{\top} \XI)^{-1} \sigma_{Y|\text{do}(X)}^2$ is bounded in probability, for large enough $m$. This implies that $\COV(\alphaI) \cons \mathbf{0}$. Thus, when keeping $n$ fixed, we obtain $\Wopt \cons \Ib_p$, for $m \rightarrow \infty$. 

On the other hand, if we keep $m$ fixed and consider the limit $n \rightarrow \infty$ instead, we observe that

\begin{equation*}
    \Wopt \cons \Deltab \Deltab^{\top} (\COV(\alphaI) + \Deltab \Deltab^{\top})^{-1}.
\end{equation*}

We note that we do not have $\Wopt \cons \mathbf{0}$ here in general, because the bias in $\alphaO$ remains, independent of the sample size $n$.

\end{document}
