\documentclass[accepted]{uai2022}

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES
\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}: write "(<file>)" to the terminal/log, add <file>
% to the file list maintained by LaTeX (\@addtofilelist), and emit a notice
% when <file> does not exist.
% NOTE(review): the "(<file>)" log format is presumably what build tools scan
% to detect dependencies -- confirm against the tool in use.
% Line ends are commented out so the macro expands without spurious spaces.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: make the labels of <basename> available via
% xr's \externaldocument and register <basename>.tex and <basename>.aux as
% file dependencies.
% Line ends are commented out so the macro expands without spurious spaces.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

% put all the external documents here!
\myexternaldocument{cheng_695}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    % \bibliographystyle{plainnat}
    \bibliographystyle{abbrvnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: expands to <b><sep><a>, i.e. the two mandatory
% arguments in reverse order around the optional separator (default: "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
% \usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
% \usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
% \usepackage{nicefrac}       % compact symbols for 1/2, etc.
% \usepackage{microtype}      % microtypography
\usepackage{xspace}
\usepackage{xcolor}         % colors

% new packages
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bbm}
% \usepackage{algorithm}
% \usepackage{algpseudocode}
\usepackage[ruled,vlined]{algorithm2e}
\usepackage{thmtools}
\usepackage{thm-restate}
\usepackage[capitalise]{cleveref}
\usepackage{bm}

% Calligraphic shorthands (math mode only): \f<L> expands to \mathcal{<L>}.
% In this file \fH denotes the class of predictors h and \fD appears as the
% subscript of an expectation; \fX and \fY are not used here.
\newcommand{\fX}{\mathcal{X}}
\newcommand{\fY}{\mathcal{Y}}
\newcommand{\fD}{\mathcal{D}}
\newcommand{\fH}{\mathcal{H}}


% Blackboard-bold number sets (math mode only): reals, naturals, integers.
\newcommand{\sR}{\mathbb{R}}
\newcommand{\sN}{\mathbb{N}}
\newcommand{\sZ}{\mathbb{Z}}

% other
% Abbreviation "i.i.d." for running text (defined without \xspace, so an
% explicit space must follow it at the use site).
\newcommand{\iid}{i.i.d.}
% \1{<event>}: indicator function, a blackboard-bold 1 applied to <event>.
\newcommand{\1}[1]{\mathbbm{1}(#1)}
% Expectation symbol; subscripts and brackets are supplied at the use site.
\newcommand{\Exp}{\mathbb{E}}
% Upright named operators, declared with \DeclareMathOperator so they receive
% operator spacing (same mechanism as \argmax/\argmin in this preamble).
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\Cov}{Cov}
\DeclareMathOperator{\Res}{Res}
% Utility function symbol f_u (math mode only).
\newcommand{\util}{f_u}
% Text abbreviation; \xspace inserts the following space when appropriate.
\newcommand{\wrt}{w.r.t.\xspace}
% Complexity-class names. Each macro opens and closes math mode itself, so
% they are meant to be used in text mode, not inside $...$.
\newcommand{\nphard}{$\mathcal{NP}$-hard\xspace}
\newcommand{\npcomplete}{$\mathcal{NP}$-complete\xspace}
\newcommand{\sharpphard}{$\mathcal{\sharp P}$-hard\xspace}
% Terminology macros for running text: each expands to a fixed phrase followed
% by \xspace, so the wording can be changed in one place.
\newcommand{\mm}{multimodal\xspace}
\newcommand{\mmcap}{Multimodal\xspace}
\newcommand{\polytime}{polynomial time\xspace}
\newcommand{\greedyset}{greedily-obtained set\xspace}
\newcommand{\greedyvalue}{greedily-obtained value\xspace}
\newcommand{\optimalset}{optimal set\xspace}
\newcommand{\optimalvalue}{optimal value\xspace}
% Names used in the experiments (text mode, followed by \xspace).
\newcommand{\pmnist}{Patch-MNIST\xspace}
\newcommand{\mnist}{MNIST\xspace}
\newcommand{\pems}{PEMS-SF\xspace}
\newcommand{\mosi}{CMU-MOSI\xspace}

% algorithm2e: define the \KwInput and \KwOutput keyword commands, which label
% their argument with "Input" and "Output" respectively.
\SetKwInput{KwInput}{Input}                % Set the Input
\SetKwInput{KwOutput}{Output}              % set the Output

% End-of-proof mark: a filled black square.
\renewcommand{\qedsymbol}{$\blacksquare$}
% "defined as" relations: \defeq typesets := and \eqdef typesets =: (math mode).
\newcommand{\defeq}{\vcentcolon=}
\newcommand{\eqdef}{=\vcentcolon}
% Short name for \varepsilon.
\newcommand{\eps}{\varepsilon}
% Real numbers (same output as \sR).
\newcommand{\R}{\mathbb{R}}

% "such that" separator. The abbreviation is wrapped in \text so that it is
% typeset upright as text when the macro is used inside math mode (bare
% letters in math mode would be set as italic variables); in text mode \text
% simply boxes the abbreviation.
\newcommand{\suchthat}{\xspace\ \text{s.t.}\ \xspace}
% \KLD{<P>}{<Q>}: Kullback-Leibler divergence D_KL(<P> || <Q>).
\newcommand{\KLD}[2]{D_{\mathrm{KL}}(#1~\|~#2)}
% arg max / arg min operators; the starred form places limits underneath in
% display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% Theorem-like environments declared through thmtools so they can be wrapped
% in thm-restate's restatable environment. Each declaration is given
% numberwithin=section and no shared-counter option.
\declaretheorem[name=Theorem,numberwithin=section]{thm}
\declaretheorem[name=Corollary,numberwithin=section]{cor}
\declaretheorem[name=Proposition,numberwithin=section]{prop}
\declaretheorem[name=Lemma,numberwithin=section]{lem}
\declaretheorem[name=Definition,numberwithin=section]{defi}
\declaretheorem[name=Assumption,numberwithin=section]{as}

\declaretheorem[name=Algorithm,numberwithin=section]{algo}

% \theoremstyle only affects the \newtheorem declarations that follow it, so
% the definition style applies to the unnumbered remark environment below.
\theoremstyle{definition}
\newtheorem*{remark}{Remark}

% Editing helpers.
% \comment{<text>} discards its argument (nothing is typeset).
\newcommand{\comment}[1]{}
% \fix{<text>} typesets <text> in red.
\newcommand{\fix}[1]{\textcolor{red}{#1}}
% Per-author annotations: coloured text prefixed with the author's name in
% square brackets (\han and \gargi additionally use bold).
\newcommand{\han}[1]{\textcolor{blue}{\textbf{[Han]: #1}}}
\newcommand{\sam}[1]{\textcolor{red}{[Sam]: #1}}
\newcommand{\gargi}[1]{\textcolor{magenta}{\textbf{[Gargi]: #1}}}
\newcommand{\yifei}[1]{\textcolor{cyan}{[Yifei]: #1}}

\title{Greedy Modality Selection via Approximate Submodular Maximization (Supplementary material)}


\begin{document}

% Add authors
% \author[1]{anonymous}
\author[1]{\href{mailto:<rcheng12@illinois.edu>}{Runxiang Cheng$^{*}$}{}}
\author[1]{\href{mailto:<gargib2@illinois.edu>}{Gargi Balasubramaniam$^{*}$}{}}
\author[1]{\href{mailto:<yifeihe3@illinois.edu>}{Yifei He\thanks{Equal contribution.}}{}}
\author[2]{\href{mailto:<yaohungt@cs.cmu.edu>}{Yao-Hung Hubert Tsai}{}}
\author[1]{\href{mailto:<hanzhao@illinois.edu>}{Han Zhao}{}}
% Add affiliations after the authors
\affil[1]{%
    University of Illinois Urbana-Champaign, Illinois, USA
}
\affil[2]{%
    Carnegie Mellon University, Pennsylvania, USA
}

\onecolumn
\maketitle

\section{Preliminary for Missing Proofs}
% \label{appendix:prelimForProofs}

\begin{restatable}{prop}{crossEntropyInf}
\label{prop:crossEntropyInf}
Let $X$, $Y \in \{0, 1\}$ be random variables, $\fH$ be the class of functions of $X$ such that $\forall h \in \fH, h(X) \in [0, 1]$, and $\ell(\cdot, \cdot)$ be the cross-entropy loss. We have:
% Let $X$, $Y \in \{0, 1\}$ be random variables,  $f$ be any function of $X$ such that $f(X) \in \{0, \}$, and $\ell(\cdot, \cdot)$ be the cross-entropy loss, we have:
\begin{equation}
    \inf_{h\in \fH} \Exp[\ell(Y, h(X))] = H(Y\mid X)
\end{equation}
\end{restatable}
\begin{proof}
Let $x, \hat y$ be instantiations of $X, \hat Y$ respectively, where $\hat Y \coloneqq h(X)$. Here, $\1{\cdot}$ denotes the indicator function, and $\KLD{\cdot}{\cdot}$ denotes the Kullback--Leibler divergence.
\begin{align}
    \Exp_{\fD}[\ell(Y, h(X))] ={}& \Exp_{X, Y}[ - \1{Y=1}\log\hat{Y} - \1{Y=0}\log(1-\hat{Y})] \\
    ={}& - \Exp_X[\Exp_{Y\mid x}[\1{Y=1}\log\hat{y} + \1{Y=0}\log(1-\hat{y})]] \\
    ={}& - \Exp_X[\Pr(Y=1\mid x)\log\hat{y} + \Pr(Y=0\mid x)\log(1-\hat{y})] \\
    ={}& \Exp_X[\Pr(Y=1\mid x)\log\frac{1}{\hat{y}} + \Pr(Y=0\mid x)\log\frac{1}{1-\hat{y}}] \\
    ={}& \Exp_X[\Pr(Y=1\mid x)\log\frac{\Pr(Y=1\mid x)}{\hat{y}} + \Pr(Y=0\mid x)\log\frac{\Pr(Y=0\mid x)}{1-\hat{y}}] \\ 
    & + \Exp_X[- \Pr(Y=1\mid x)\log\Pr(Y=1\mid x) - \Pr(Y=0\mid x)\log\Pr(Y=0\mid x)] \\
    ={}& \Exp_X[\KLD{\Pr(Y\mid x)}{h(x)}] + \Exp_X[H(Y\mid x)] \\
    ={}& \KLD{\Pr(Y\mid X)}{h(X)} + H(Y\mid X)
\end{align}
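Since the Kullback--Leibler divergence is non-negative and equals zero when $h(X) = \Pr(Y\mid X)$, while $H(Y\mid X) \geq 0$ is unrelated to $h(X)$, $\Exp_{\fD}[\ell(Y, h(X))]$ attains its minimum value $H(Y\mid X)$ when $h(X) = \Pr(Y\mid X)$.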
\end{proof}


\section{Missing Proofs}
% \label{appendix:proofs}


\begin{restatable}{prop}{utilityCE}
Given $Y \in \{0, 1\}$ and $\ell(Y, \hat Y) \coloneqq -\1{Y=1}\log\hat{Y} - \1{Y=0}\log(1-\hat{Y})$, $\util(S) = I(S; Y)$.
\end{restatable}
\begin{proof}
By \cref{defi:utility} and \cref{prop:crossEntropyInf}, we have:
\begin{align}
    \util(S) ={}& \inf_{c} \Exp[\ell(Y, c)]-\inf_{h\in\fH} \Exp[\ell(Y, h(S))] \\
    ={}& H(Y\mid c) - H(Y\mid S) \\
    ={}& H(Y) - H(Y\mid S) \\
    ={}& I(S; Y)
\end{align}
\end{proof}


% \montonicCE*
\begin{restatable}{prop}{montonicCE}
$\forall M \subseteq N \subseteq V$, $I(N; Y) - I(M; Y) = I(N \setminus M; Y \mid M) \geq 0$.
\end{restatable}
\begin{proof}
Let $N \coloneqq \{X_1, \dots, X_n\}$, $M \coloneqq \{X_1, \dots, X_m\}$, $n \geq m$. By the chain rule of mutual information:
\begin{align}
    I(N; Y) - I(M; Y) ={}& \sum_{i=1}^n I(X_i; Y \mid X_{i-1}, \dots, X_1) - \sum_{i=1}^m I(X_i; Y \mid X_{i-1}, \dots, X_1) \\
    ={}& \sum_{i=m+1}^n I(X_i;Y \mid X_{i-1}, \dots, X_1) \\
    ={}& I(N \setminus M; Y\mid M) \\
    \geq {}& 0
\end{align}
\end{proof}

% \eSubmodularityMI*
\begin{restatable}{prop}{eSubmodularityMI}
Under \cref{as:eCondIndep}, $ I(S; Y)$ is $\epsilon$-approximately submodular, i.e., $\forall A \subseteq B \subseteq V$, $e \in V \setminus B$, $I(A \cup \{e\}; Y) - I(A; Y) + \epsilon \geq I(B \cup \{e\}; Y) - I(B; Y)$.
\end{restatable}
\begin{proof}
For subset $A$, we have:
\begin{align}
    I(A\cup \{e\}; Y) - I(A; Y) ={}& I(\{e\}; Y\mid A) \\
    ={}& I(\{e\}; Y, A) - I(\{e\}; A) \\
    ={}& I(\{e\}; Y) + I(\{e\}; A \mid Y) - I(\{e\}; A)
\end{align}
Similarly, $I(B\cup \{e\}; Y) - I(B; Y) = I(\{e\}; Y) + I(\{e\}; B \mid Y) - I(\{e\}; B)$.  Given \cref{as:eCondIndep} holds, we denote $I(\{e\}; A \mid Y) = \epsilon_A$ and $I(\{e\}; B \mid Y) = \epsilon_B$ where $\epsilon_A, \epsilon_B \leq \epsilon$. In the worst case where $\epsilon_A = 0$, absolute submodularity is still satisfied if $\epsilon_B \leq I(\{e\}; B) - I(\{e\}; A)$, i.e.,
\begin{align}
    I(B\cup \{e\}; Y) - I(B; Y) ={}& I(\{e\}; Y) + I(\{e\}; B \mid Y) - I(\{e\}; B) \\
    ={}& I(\{e\}; Y) - I(\{e\}; B) + \epsilon_B \\
    \leq{}& I(\{e\}; Y) - I(\{e\}; B) + I(\{e\}; B) - I(\{e\}; A) = I(A\cup \{e\}; Y) - I(A; Y)
\end{align}

But if $\epsilon_B > I(\{e\}; B) - I(\{e\}; A)$, the submodularity above will not hold. However, because $\epsilon_B \leq \epsilon$, we can define approximate submodularity characterized by the constant $\epsilon \geq 0$. Specifically:
\begin{align}
    I(B\cup \{e\}; Y) - I(B; Y) ={}& I(\{e\}; Y) + I(\{e\}; B \mid Y) - I(\{e\}; B) \\
    ={}& I(\{e\}; Y) - I(\{e\}; B) + \epsilon_B \\
    \leq{}& I(\{e\}; Y) - I(\{e\}; B) + \epsilon \\
    \leq{}& I(\{e\}; Y) - I(\{e\}; A) + \epsilon \\
    \leq{}& I(\{e\}; Y) - I(\{e\}; A) + \epsilon_A + \epsilon \\
    % \leq{}&  I(\{e\}; Y) + I(\{e\}; A \mid Y) - I(\{e\}; A) + \epsilon \\
    ={}& I(A\cup \{e\}; Y) - I(A; Y) + \epsilon
\end{align}
% It follows that $I(X_{B\cup \{e\}}; Y) - I(X_B; Y) \leq I(X_{A\cup \{e\}}; Y) - I(X_A; Y) + \epsilon$, which we denoted as $\epsilon$-approximate submodularity. Without loss of generality, $I(\rvmoset{s}; Y)$ is $\epsilon$-approximately submodular when $I(\rvmoset{S}; \rvmoset{S'} \mid Y) \leq \epsilon$ for some $\epsilon > 0$, $\forall S, S' \subseteq V$ and $S \cap S' = \emptyset$.
\end{proof}

% \greedyBoundMI*
\begin{restatable}{thm}{greedyBoundMI}
Under \cref{as:eCondIndep}, let  $q \in \sZ^+$, and $S_p$ be the solution from \cref{algo:greedyOrginal} at iteration $p$, we have:
\begin{equation}
    I(S_p; Y) \geq (1-e^{-\frac{p}{q}})\max_{S: |S|\leq q}I(S; Y) - q\epsilon
\end{equation}
\end{restatable}
\begin{proof}
Let $S^* \coloneqq \argmax_{S: |S|\leq q}I(S; Y)$ be the optimal subset with cardinality at most $q$. By \cref{prop:montonicCE}, $|S^*| = q$. We order $S^*$ as $ \{X_1^*, \dots, X_q^*\}$. Then for all positive integers $i \leq p$,
\begin{align}
    I(S^*; Y) \leq {}& I(S^*\cup S_i; Y) \label{proof:greedyBoundMI:1} \\
    ={}& I(S_i; Y) + \sum^q_{j=1} I(X_j^*; Y\mid S_i \cup \{X_{j-1}^*, \dots, X_1^*\}) \label{proof:greedyBoundMI:2} \\
    ={}& I(S_i; Y) + \sum^q_{j=1} (I(\{X_j^*, \dots, X_1^*\}\cup S_i; Y) - I(\{X_{j-1}^*, \dots, X_1^*\}\cup S_i; Y)) \label{proof:greedyBoundMI:3} \\ 
    \leq{}& I(S_i; Y) + \sum^q_{j=1} (I(\{X_j^*\}\cup S_i; Y) - I(S_i; Y) + \epsilon) \label{proof:greedyBoundMI:4} \\
    \leq{}& I(S_i; Y) + \sum^q_{j=1} (I(S_{i+1}; Y) - I(S_i; Y) + \epsilon) \label{proof:greedyBoundMI:5} \\
    ={}& I(S_i; Y) + q(I(S_{i+1}; Y) - I(S_i; Y) + \epsilon) \label{proof:greedyBoundMI:6}
\end{align}
\cref{proof:greedyBoundMI:1} is from \cref{prop:montonicCE}, \cref{proof:greedyBoundMI:2} and \cref{proof:greedyBoundMI:3} are by the chain rule of mutual information, \cref{proof:greedyBoundMI:4} is from \cref{prop:eSubmodularityMI}, \cref{proof:greedyBoundMI:5} is by the definition of \cref{algo:greedyOrginal} that $I(S_{i+1}; Y) - I(S_i; Y)$ is maximized in each iteration $i$. Let $\delta_i \coloneqq I(S^*; Y) - I(S_i; Y)$, we can rewrite \cref{proof:greedyBoundMI:6} into $\delta_i \leq q(\delta_i - \delta_{i+1} + \epsilon)$, which can be rearranged into $\delta_{i+1} \leq (1-\frac{1}{q})\delta_i + \epsilon$.

Let $\delta_0 = I(S^*; Y) - I(S_0; Y)$. Since $S_0 = \emptyset$, we have $\delta_0 = I(S^*; Y)$. By the previous results, we can upper bound the quantity $\delta_p = I(S^*; Y) - I(S_p; Y)$ as follows:
\begin{align}
    \delta_p \leq{}& (1-\frac{1}{q})\delta_{p-1} + \epsilon \\
    \leq{}& (1-\frac{1}{q})((1-\frac{1}{q})\delta_{p-2} + \epsilon) + \epsilon \\
    \leq{}& (1-\frac{1}{q})^p\delta_0 + (1+(1-\frac{1}{q})+\dots+(1-\frac{1}{q})^{p-1})\epsilon \label{proof:greedyBoundMI:7} \\
    ={}& (1-\frac{1}{q})^p\delta_0 + (\frac{1 - (1-\frac{1}{q})^{p-1+1}}{1 - (1-\frac{1}{q})})\epsilon \\
    ={}& (1-\frac{1}{q})^p\delta_0 + (q - q(1-\frac{1}{q})^p)\epsilon \label{proof:greedyBoundMI:8} \\
    \leq{}& (1-\frac{1}{q})^p\delta_0 + q\epsilon \\
    \leq{}& e^{-\frac{p}{q}} \delta_0 + q\epsilon \label{proof:greedyBoundMI:9}
\end{align}

\cref{proof:greedyBoundMI:7} to \cref{proof:greedyBoundMI:8} is through the summation of the geometric series $1+(1-\frac{1}{q})+\dots+(1-\frac{1}{q})^{p-1}$. \cref{proof:greedyBoundMI:9} is by the inequality $1-x\leq e^{-x}$ for all $x \in \sR$. Substituting the definitions of $\delta_p$ and $\delta_0$ into \cref{proof:greedyBoundMI:9} completes the proof.
\end{proof}

% \greedyLossBound*
\begin{restatable}{cor}{greedyLossBound}
Assume the conditions in \cref{thm:greedyBoundMI} hold. There exists an optimal predictor $h^*(S_p) = \Pr(Y\mid S_p)$ such that
\begin{align}
    \Exp[\ell_{01}(Y, h^*(S_p))] \leq{}& \Exp[\ell_{ce}(Y, h^*(S_p))] \nonumber \\
    \leq{}& H(Y) - (1-e^{-\frac{p}{q}})I(S^*; Y) + q\epsilon
\end{align}
\end{restatable}
\begin{proof}
Denote the quantity $ (1-e^{-\frac{p}{q}})\max_{S: |S|\leq q}I(S; Y) - q\epsilon$ from \cref{thm:greedyBoundMI} as letter $b$. By the definition of mutual information, we have $H(Y\mid S_p) \leq H(Y) - b$. Following \cref{prop:crossEntropyInf}, $\inf_{h: S_p\to [0, 1]} \Exp[\ell_{ce}(Y, h(S_p))] \leq H(Y) - b$. In other words, $\exists h^* = \Pr(Y\mid S_p) \suchthat \Exp[\ell_{ce}(Y, h^*(S_p))] \leq H(Y) - b$.

When the predictor is probabilistic (i.e., the predicted label is $0$ if and only if $h(X) \leq 0.5$), $\ell_{01}(Y, \hat Y) = \1{Y\neq \hat Y}$ naturally extends to $Y\1{\hat Y \leq 0.5} + (1-Y)\1{\hat Y > 0.5}$, which is upper bounded by $\ell_{ce}(Y, \hat Y)$ for all $(Y, \hat Y)$. Therefore, for the same $h^*$ as above, we have:
\begin{equation}
    \Exp[\ell_{01}(Y, h^*(S_p))] \leq \Exp[\ell_{ce}(Y, h^*(S_p))] \leq H(Y) - b
\end{equation}
\end{proof}

% \greedyLossDiff*
\begin{restatable}{cor}{greedyLossDiff}
Assume the conditions in \cref{thm:greedyBoundMI} hold. There exist optimal predictors $h_1^* = \Pr(Y\mid S_p)$, $h_2^* = \Pr(Y\mid S^*)$ such that
\begin{align}
    \Exp[\ell_{ce}(Y, h_1^*(S_p))] - \Exp[\ell_{ce}(Y, h_2^*(S^*))] \nonumber \\ \leq e^{-\frac{p}{q}}I(S^*; Y) +q\epsilon
\end{align}
\end{restatable}
\begin{proof}
Following \cref{thm:greedyBoundMI} and denoting $\argmax_{S: |S|\leq q}I(S; Y)$ as $S^*$, we have:
\begin{align}
    & I(S_p; Y) \geq (1-e^{-\frac{p}{q}})\max_{S: |S|\leq q}I(S; Y) - q\epsilon \\
    &\implies H(Y) - H(Y\mid S_p) \geq (1-e^{-\frac{p}{q}})(H(Y) - H(Y\mid S^*)) - q\epsilon \\
    &\implies H(Y\mid S_p)  - H(Y\mid S^*) \leq  e^{-\frac{p}{q}} (H(Y) - H(Y\mid S^*)) + q\epsilon \\
    &\implies H(Y\mid S_p)  - H(Y\mid S^*) \leq  e^{-\frac{p}{q}} (I(S^*; Y)) + q\epsilon
\end{align}
Using \cref{prop:crossEntropyInf} completes the proof.
\end{proof}

% \subadditiveMI*
\begin{restatable}{prop}{subadditiveMI}
Under \cref{as:eCondIndep}, $I(S; Y)$ is $\epsilon$-approximately sub-additive, i.e., for any $S, S'\subseteq V$, $I(S\cup S'; Y) \leq I(S; Y) + I(S'; Y) + \epsilon$.
\end{restatable}
\begin{proof}
\begin{align}
    I(S\cup S'; Y) ={}& I(S; Y) + I(S';Y\mid S) \\
    ={}& I(S; Y) + I(S, Y; S') - I(S;S') \\
    ={}& I(S; Y) + I(S'; Y) + I(S; S'\mid Y) - I(S; S') \label{proof:subadditiveMI:1} \\
    \leq{}& I(S; Y) + I(S'; Y) + \epsilon \label{proof:subadditiveMI:2}
\end{align}
\cref{proof:subadditiveMI:1} to \cref{proof:subadditiveMI:2} because $I(S; S'\mid Y) \leq \epsilon$ by \cref{as:eCondIndep}, and $I(S; S')$ is always non-negative.
\end{proof}

% \superadditiveMI*
\begin{restatable}{prop}{superadditiveMI}
Under \cref{as:eMarginalIndep}, $I(S; Y)$ is $\epsilon$-approximately super-additive, i.e., for any $S, S'\subseteq V$, $I(S\cup S'; Y) \geq I(S; Y) + I(S'; Y) - \epsilon$.
\end{restatable}
\begin{proof}
Similarly to the proof of \cref{prop:subadditiveMI}, we have: 
\begin{align}
    I(S\cup S'; Y) ={}& I(S; Y) + I(S'; Y) + I(S; S'\mid Y) - I(S; S') \label{proof:superadditiveMI:1} \\
    \geq{}& I(S; Y) + I(S'; Y) - \epsilon \label{proof:superadditiveMI:2}
\end{align}
\cref{proof:superadditiveMI:1} to \cref{proof:superadditiveMI:2} because $ I(S; S')\leq \epsilon$ by \cref{as:eMarginalIndep}, and $I(S; S'\mid Y)$ is non-negative.
\end{proof}

% \shapleyMI*
\begin{restatable}{prop}{shapleyMI}
If conditions in \cref{prop:subadditiveMI} and \cref{prop:superadditiveMI} hold, we have $ I(X_i; Y) - \epsilon \leq \phi_{I, X_i} \leq I(X_i; Y) + \epsilon$ for any $X_i \in V$.
\end{restatable}
\begin{proof}
By \cref{prop:subadditiveMI} and \cref{prop:superadditiveMI}, for any $X_i \in V$ and $S \subseteq V$, we have:
\begin{equation}
\label{eq:shapleyMI:1}
    I(X_i; Y) - \epsilon \leq I(S\cup \{X_i\}; Y) - I(S; Y) \leq I(X_i; Y) + \epsilon
\end{equation}

Let's first apply the right inequality in \cref{eq:shapleyMI:1} to \cref{defi:shapley}. Because  $I(X_i; Y) + \epsilon$ is independent of $S$, we can simplify the calculation of the upper bound of $\phi_{I, X_i}$ as follows. 
\begin{align}
    \phi_{I, X_i} &= \sum_{S \subseteq V \setminus \{X_i\}} \frac{|S|!(|V|-|S|-1)!}{|V|!} (I(S\cup \{X_i\}; Y) - I(S; Y)) \\
    &\leq \sum_{S \subseteq V \setminus \{X_i\}} \frac{|S|!(|V|-|S|-1)!}{|V|!} (I(X_i; Y) + \epsilon) \\
    &= \sum_{|S| = 0}^{|V|-1} \binom{|V|-1}{|S|} \frac{|S|!(|V|-|S|-1)!}{|V|!} (I(X_i; Y) + \epsilon) \\
    &= \sum_{|S| = 0}^{|V|-1} \frac{(|V|-1)!}{|S|!(|V|-1-|S|)!} \frac{|S|!(|V|-|S|-1)!}{|V|!} (I(X_i; Y) + \epsilon) \\
    &= \sum_{|S| = 0}^{|V|-1} \frac{1}{|V|} (I(X_i; Y) + \epsilon) \\
    &= I(X_i; Y) + \epsilon
\end{align}

Applying the same procedure to the left inequality in \cref{eq:shapleyMI:1} to \cref{defi:shapley}, we have $\phi_{I, X_i} \geq I(X_i; Y) - \epsilon$. Combining both results completes the proof.
\end{proof}

% \efficientMCI*
\begin{restatable}{prop}{efficientMCI}
Under \cref{as:eCondIndep}, $\forall X_i \in V$, we have $I(X_i; Y) \leq \phi_{I, X_i}^{mci} \leq I(X_i; Y) + \epsilon$.
\end{restatable}
\begin{proof}
By \cref{prop:eSubmodularityMI}, $I(\cdot; Y)$ would be approximately submodular under \cref{as:eCondIndep}, thus:
\begin{align}
    I(X_i; Y) + \epsilon ={}& I(\emptyset \cup X_i; Y) - I(\emptyset; Y) + \epsilon \\
    \geq{}& \max_{S\subseteq V} I(S \cup X_i; Y) - I(S; Y) = \phi_{I, X_i}^{mci} \label{proof:efficientMCI:1}
\end{align}

On the other hand, if $\argmax_{S\subseteq V} I(S \cup X_i; Y) - I(S; Y) = \emptyset$, we have $\phi_{I, X_i}^{mci} = I(\emptyset \cup X_i; Y) - I(\emptyset; Y) = I(X_i; Y)$. If $\argmax_{S\subseteq V} I(S \cup X_i; Y) - I(S; Y)$ is some non-empty subset $A$, we have $\phi_{I, X_i}^{mci} = I(A \cup X_i; Y) - I(A; Y) \geq I(\emptyset \cup X_i; Y) - I(\emptyset; Y)$. In this case, $\phi_{I, X_i}^{mci} \geq I(X_i; Y)$. Combining both inequalities completes the proof.
\end{proof}



\end{document}