\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
%\usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams

%% custom packages
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks                           
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

% packages we used last year
\usepackage{subfig}      % subfigures
\usepackage{amsthm}         % theorems
\usepackage{amsmath}        % align
\usepackage{amssymb}
\usepackage[capitalise]{cleveref}       % cref
\usepackage{graphicx}       % images
\usepackage{wrapfig}        % wrapping image
\usepackage{makecell}
\usepackage{multirow}
\usepackage{floatrow}
\usepackage{authblk}
\floatsetup[figure]{style=plain,subcapbesideposition=center}
% \CoAuthorMark prints a footnote mark that carries the current value of the
% footnote counter without stepping it, so that a further author can reuse the
% mark produced by an earlier \thanks note (see the author block below).
\newcommand\CoAuthorMark{\footnotemark[\arabic{footnote}]}

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES
\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}
%   Announces <file> as a dependency of this document:
%     * \typeout writes "(<file>)" to the terminal and log (presumably so that
%       build tools scanning the log, e.g. latexmk, register the file);
%     * \@addtofilelist appends <file> to the list printed by \listfiles;
%     * if <file> cannot be found, a "No file <file>." note is written.
%   The trailing % signs stop the line ends inside the definition from
%   turning into space tokens.
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Imports the labels of the external document <basename> through xr's
%   \externaldocument and registers both <basename>.tex and <basename>.aux
%   as file dependencies via \addFileDependency.
%   The trailing % signs stop the line ends inside the definition from
%   turning into space tokens.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

% put all the external documents here!
\myexternaldocument{choi_500}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Numbered environments: definitions use their own counter, whereas theorems,
% lemmas and corollaries share the "thm" counter.
\newtheorem{defi}{Definition}
\newtheorem{thm}{Theorem}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{col}[thm]{Corollary}
% Unnumbered variants (starred form provided by amsthm).
\newtheorem*{nothm}{Theorem}
\newtheorem*{nolem}{Lemma}
\newtheorem*{nocol}{Corollary}

\title{Combating the Instability of Mutual Information-based Losses \\via Regularization (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<juice500@sogang.ac.kr>}{Kwanghee Choi\thanks{These authors contributed equally to this work.}}}
\author[2]{\href{mailto:<siyeong.lee@naverlabs.com>}{Siyeong Lee\protect\CoAuthorMark}}
% Add affiliations after the authors
\affil[1]{%
Sogang University
}
\affil[2]{%
NAVER LABS
}
\begin{document}\onecolumn
\maketitle \appendix
\section{Proofs}
In this section, we provide proofs of all theoretical results mentioned in the manuscript. 

\subsection{Proof of the \texorpdfstring{\textbf{$D_\text{ReDV}$}}{ReDV} representation}\label{supp:ReDV}
In this subsection, we consider two probability distributions $\mathbb{P}$ and $\mathbb{Q}$, with $\mathbb{P}$ absolutely continuous with respect to $\mathbb{Q}$.
In addition, assume that both distributions are absolutely continuous with respect to Lebesgue measure $\mu$ on some compact domain $\Omega$. 

We first show that there exists a family of optimal functions for the DV representation \citep{donsker1975asymptotic}.

\begin{lem} All functions of the form $T=\log\frac{d\mathbb{P}}{d\mathbb{Q}}+C^*$ with $C^* \in \mathbb{R}$ are optimal for the DV representation $D_{DV}$.
\end{lem}  

\begin{proof}
To show this lemma, we borrow the proof of the dual representation for the KL divergence \citep{belghazi2018mutual}.

%For a given function $T$, consider the Gibbs distribution $G$ defined by $d \mathbb{G} = \frac{1}{Z} e^T d\mathbb{Q}$, where $Z=\mathbb{E}_{\mathbb{Q}}(e^T)$.

For a function $T$, let $\Delta_{T}$ be the gap 
\begin{equation}
    \Delta_{T} := D_{KL}(\mathbb{P} || \mathbb{Q}) - \left(\mathbb{E}_{\mathbb{P}}(T) - \log \mathbb{E}_{\mathbb{Q}}(e^T)\right).
\end{equation}

By Theorem 1 of MINE \citep{donsker1975asymptotic}, we already know that there exists an optimal function $T^*=\log \frac{d\mathbb{P}}{d\mathbb{Q}} + C$ for some $C \in \mathbb{R}$ such that $\Delta_{T^*} = 0$.

Consider a function $T = \log\frac{d\mathbb{P}}{d\mathbb{Q}}+C^*$ for $C^* \in \mathbb{R}$. 
The function $T$ can be rewritten as $(T^*-C) + C^*$.

Since
\begin{align}
\mathbb{E}_\mathbb{P}(T)&=\mathbb{E}_\mathbb{P}(T^*-C+C^*) \\
&=\mathbb{E}_\mathbb{P}(T^*)-C+C^*,
\end{align} and
\begin{align}
\log(\mathbb{E}_\mathbb{Q}(e^{T}))&=\log(\mathbb{E}_\mathbb{Q}(e^{T^*-C+C^*})) \\
&=\log(e^{C^*-C}\mathbb{E}_\mathbb{Q}(e^{T^*})) \\
&=(C^*-C)+\log(\mathbb{E}_\mathbb{Q}(e^{T^*})),
\end{align}
we have
\begin{equation}
\mathbb{E}_\mathbb{P}(T) - \log(\mathbb{E}_\mathbb{Q}(e^{T}))=\mathbb{E}_\mathbb{P}(T^*) - \log(\mathbb{E}_\mathbb{Q}(e^{T^*})).
\end{equation}

Therefore, for the function $T$,
\begin{align}
\Delta_{T} = D_{KL}(\mathbb{P} || \mathbb{Q}) - \left(\mathbb{E}_{\mathbb{P}}(T) - \log \mathbb{E}_{\mathbb{Q}}(e^T)\right) = D_{KL}(\mathbb{P} || \mathbb{Q}) - \left(\mathbb{E}_{\mathbb{P}}(T^*) - \log \mathbb{E}_{\mathbb{Q}}(e^{T^{*}})\right) = \Delta_{T^*} = 0.
\end{align}

As a result, every function of the form $T = \log \frac{d \mathbb{P}}{d \mathbb{Q}} + C^*$ with a constant $C^* \in \mathbb{R}$ is optimal.
\end{proof}

\begin{nothm} {(Theorem 1 restated)} Let $d$ be a distance function on $\mathbb{R}$. For any constant $C^* \in \mathbb{R}$ and any class of functions $\mathcal{T}$ mapping from $\Omega$ to $\mathbb{R}$, we have a novel dual representation of the KL divergence

\begin{align}
D_{\text{ReDV}} := \sup_{T \in \mathcal{T}} \mathbb{E}_\mathbb{P}(T) - \log(\mathbb{E}_\mathbb{Q}(e^T)) - d(\log(\mathbb{E}_\mathbb{Q}(e^T)), C^*) = D_{KL} (\mathbb{P} || \mathbb{Q}) .
\end{align}
\end{nothm}
\begin{proof}
%As $d$ is a distance function, $d(\log(\mathbb{E}_\mathbb{Q}[e^T]), C') \geq 0$.
i) For any $T$,
\begin{align}
    \mathbb{E}_\mathbb{P}(T) - \log(\mathbb{E}_\mathbb{Q}(e^T)) - d(\log(\mathbb{E}_\mathbb{Q}(e^T)) , C^*) \leq \mathbb{E}_\mathbb{P}(T) - \log(\mathbb{E}_\mathbb{Q}(e^T)).
\end{align}
Therefore, $\sup_{T:\Omega \to \mathbb{R}} \mathbb{E}_\mathbb{P}(T) - \log(\mathbb{E}_\mathbb{Q}(e^T)) - d(\log(\mathbb{E}_\mathbb{Q}(e^T)), C^*) \leq D_{KL}(\mathbb{P} || \mathbb{Q})$.

ii) By the lemma above, there exists $T^*=\log\frac{d\mathbb{P}}{d\mathbb{Q}}+C^*$ such that
\begin{align}
    D_{KL}(\mathbb{P} || \mathbb{Q})=\mathbb{E}_\mathbb{P}(T^*) - \log(\mathbb{E}_\mathbb{Q}(e^{T^*}))
\end{align}
and
\begin{align}
    \log(\mathbb{E}_\mathbb{Q}(e^{T^*})) = \log(\mathbb{E}_\mathbb{Q}(e^{C^*}\frac{d\mathbb{P}}{d\mathbb{Q}})) = \log(\int e^{C^*}\frac{d\mathbb{P}}{d\mathbb{Q}} d\mathbb{Q}) = C^*.
\end{align}
Therefore,
\begin{align}
\sup_{T:\Omega \to \mathbb{R}} \mathbb{E}_\mathbb{P}(T) - \log(\mathbb{E}_\mathbb{Q}(e^T)) - d(\log(\mathbb{E}_\mathbb{Q}(e^T)), C^*) & \geq \mathbb{E}_\mathbb{P}(T^*) - \log(\mathbb{E}_\mathbb{Q}(e^{T^*}))-d(\log(\mathbb{E}_\mathbb{Q}(e^{T^*})), C^*) \\ &= D_{KL}(\mathbb{P} || \mathbb{Q}).
\end{align}
Combining i) and ii) finishes the proof.
\end{proof}

\subsection{Extension to NWJ representation}\label{supp:ReNWJ}
In this subsection, we show that our regularizer can also be applied to the NWJ representation \citep{nguyen2010estimating}.

\begin{nothm} Let $d$ be a distance function on $\mathbb{R}$.
We have another dual representation such that
\begin{equation}
D_{\text{ReNWJ}}(\mathbb{P} || \mathbb{Q}) := \sup_{T:\Omega \to \mathbb{R}} \mathbb{E}_\mathbb{P}(T) - \mathbb{E}_\mathbb{Q}(e^{T-1}) - d(\mathbb{E}_\mathbb{Q}(e^{T-1}), 1) = D_{KL} (\mathbb{P} || \mathbb{Q}).
\end{equation}
\end{nothm}
\begin{proof}
As $d$ is a distance function, $d(\mathbb{E}_\mathbb{Q}(e^{T-1}), 1) \geq 0$.

i) For any $T$,
\begin{equation}
    \mathbb{E}_\mathbb{P}(T) - \mathbb{E}_\mathbb{Q}(e^{T-1}) - d(\mathbb{E}_\mathbb{Q}(e^{T-1}), 1) \leq \mathbb{E}_\mathbb{P}(T) - \mathbb{E}_\mathbb{Q}(e^{T-1}).
\end{equation}
Therefore, $\sup_{T:\Omega \to \mathbb{R}} \mathbb{E}_\mathbb{P}(T) - \mathbb{E}_\mathbb{Q}(e^{T-1}) - d(\mathbb{E}_\mathbb{Q}(e^{T-1}), 1) \leq D_{KL}(\mathbb{P} || \mathbb{Q})$.

ii) By \cite{poole2019variational}, there exists $T^*=\log\frac{d\mathbb{P}}{d\mathbb{Q}}+1$ such that
\begin{equation}
    D_{KL}(\mathbb{P} || \mathbb{Q})=\mathbb{E}_\mathbb{P}(T^*) - \mathbb{E}_\mathbb{Q}(e^{T^*-1}).
\end{equation}

Indeed,
\begin{equation}
    \mathbb{E}_\mathbb{P}(T^*) = \mathbb{E}_\mathbb{P}(1 + \log(\frac{d\mathbb{P}}{d\mathbb{Q}})) = 1 + D_{KL}(\mathbb{P} || \mathbb{Q}),
\end{equation}
and
\begin{equation}
    \mathbb{E}_\mathbb{Q}(e^{T^*-1}) = \mathbb{E}_\mathbb{Q}(\frac{d\mathbb{P}}{d\mathbb{Q}}) = 1.
\end{equation}

Therefore,
\begin{align}
\sup_{T:\Omega \to \mathbb{R}} \mathbb{E}_\mathbb{P}(T) - \mathbb{E}_\mathbb{Q}(e^{T-1}) - d(\mathbb{E}_\mathbb{Q}(e^{T-1}), 1) & \geq \mathbb{E}_\mathbb{P}(T^*) - \mathbb{E}_\mathbb{Q}(e^{T^* -1}) -d(\mathbb{E}_\mathbb{Q}(e^{T^*-1}), 1) \\ &= D_{KL}(\mathbb{P} || \mathbb{Q}).
\end{align}
Combining i) and ii) finishes the proof.
\end{proof}

\subsection{Mathematical properties of \texorpdfstring{$I_\text{ReMINE}$}{ReMINE}}\label{supp:proof:ReMINE}
This subsection presents the proof of the consistency and the sample complexity of $I_\text{ReMINE}$.
To show these properties, we assume that the input space of the functions below is a compact domain, and all measures are absolutely continuous with respect to the Lebesgue measure. 
We will restrict to families of feedforward functions with continuous activations, with a single output neuron.
To avoid unnecessarily heavy notation, we denote by $\mathbb{P}=\mathbb{P}_{XY}$ and $\mathbb{Q}=\mathbb{P}_{X} \otimes \mathbb{P}_{Y}$ the joint distribution and the product of marginals, respectively, unless otherwise specified.

First, we define the strong consistency of the MI estimator. As mentioned by \citet{belghazi2018mutual}, this property is related to the \textit{approximation} problem, which addresses the size of the family of functions $T_{\theta}$, and the \textit{estimation} problem, which addresses whether it is a reliable estimator.

\begin{defi}
The MI estimator $\hat{I}(X,Y)_n$ is strongly consistent if for all $\epsilon >0$, there exist a positive integer $N$ and a choice of statistics network such that $\forall n \geq N, |I(X, Y) - \hat{I}(X, Y)_n| \leq \epsilon$ almost surely, where the probability is over a set of samples.
\end{defi}

\paragraph{Consistency proof}
 \begin{lem}\label{supp:lem:ReMINE:approximation} (Approximation) Let $\eta > 0$. There exists a neural network function $T_{\theta}$ with parameters $\theta \in \Theta$ such that 
\begin{equation}\label{supp:lem:ReMINE:approximation:eq} 
|\hat{I}_{\text{ReMINE}}(X, Y) - I_{\text{ReMINE}}(X, Y)| \leq \eta,
\end{equation}
where
\begin{equation}
\hat{I}_\text{ReMINE}(X, Y)=\sup_{\theta \in \Theta} \mathbb{E}_\mathbb{P}(T_\theta) - \log(\mathbb{E}_\mathbb{Q}(e^{T_\theta})) - d(\log(\mathbb{E}_\mathbb{Q}(e^{T_\theta})), C^*) .
\end{equation}
\end{lem}

\begin{proof} 
Without loss of generality, we set $T^*=\log\frac{d\mathbb{P}}{d\mathbb{Q}}$. By construction, $T^*$ satisfies: %$C^* = 0$
\begin{equation}
    \mathbb{E}_{\mathbb{P}}(T^*) = I(X, Y), \quad \mathbb{E}_{\mathbb{Q}}(e^{T^*}) = 1, \quad \log(\mathbb{E}_{\mathbb{Q}}(e^{T^*})) = 0.
\end{equation}
    
For a function $T$,
\begin{align}
    I_{\text{ReMINE}}&(X, Y) - \hat{I}_{\text{ReMINE}}(X, Y)\\
    &\leq \mathbb{E}_{\mathbb{P}}(T^* -T) + \log(\mathbb{E}_{\mathbb{Q}}(e^T)) + d(\log(\mathbb{E}_{\mathbb{Q}}(e^T)), C^*) - d(\log(\mathbb{E}_{\mathbb{Q}}(e^{T^*})), C^*) \\
    &\leq \mathbb{E}_{\mathbb{P}}(T^* -T) + \log(\mathbb{E}_{\mathbb{Q}}(e^T)) + d(\log(\mathbb{E}_{\mathbb{Q}}(e^T)), \log(\mathbb{E}_{\mathbb{Q}}(e^{T^*}))) \\
    &\leq \mathbb{E}_{\mathbb{P}}(T^* -T)
    + \mathbb{E}_{\mathbb{Q}}(e^{T}-e^{T^*}) + d(\mathbb{E}_{\mathbb{Q}}(e^{T})-1, 0) \label{supp:lem:ReMINE:approximation:proof:eq}
\end{align}
where we used the inequality $\log x \leq x - 1$ and $d(\cdot, \cdot)$ is the distance function induced by a norm on $\mathbb{R}$ (e.g., absolute or square error).
Fix $\eta > 0$. By the universal approximation theorem, we may choose a feedforward network function $T_{\theta} \leq M$ such that
\begin{equation} \label{supp:lem:ReMINE:approximation:proof:eq-term1}
\mathbb{E}_{\mathbb{P}}|T^* - T_\theta| \leq \frac{\eta}{3}, \quad  \mathbb{E}_{\mathbb{Q}}|T^* - T_\theta| \leq \frac{\eta}{3}e^{-M}, \quad \text{and }\  d(\mathbb{E}_{\mathbb{Q}}|T_{\theta}-T^*|, 0) \leq \frac{\eta}{3\cdot d(e^M, 0)}
\end{equation}
Since $\exp$ is Lipschitz continuous with constant $e^M$ on $(-\infty, M]$, we have

\begin{equation} \label{supp:lem:ReMINE:approximation:proof:eq-term2}
\mathbb{E}_{\mathbb{Q}}|e^{T^*} - e^{T_\theta}| \leq e^{M}\mathbb{E}_{\mathbb{Q}}|T^* - T_\theta| \leq \frac{\eta}{3},
\end{equation}
and
\begin{align} \label{supp:lem:ReMINE:approximation:proof:eq-term3}
d(\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}})-1, 0) = d(\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}})-\mathbb{E}_\mathbb{Q}(e^{T^*}), 0) & \leq d(\mathbb{E}_{\mathbb{Q}}|e^{T_{\theta}}-e^{T^*}|, 0) \\
& \leq d(e^M\mathbb{E}_{\mathbb{Q}}|T_{\theta}-T^*|, 0) \leq d(e^M, 0)\cdot d(\mathbb{E}_{\mathbb{Q}}|T_{\theta}-T^*|, 0) \leq  \frac{\eta}{3}.
\end{align}  
 
From \cref{supp:lem:ReMINE:approximation:proof:eq}, \cref{supp:lem:ReMINE:approximation:proof:eq-term1}, \cref{supp:lem:ReMINE:approximation:proof:eq-term2}, \cref{supp:lem:ReMINE:approximation:proof:eq-term3} and the triangular inequality, we then obtain:
\begin{equation}
|\hat{I}_{\text{ReMINE}}(X, Y) - I_{\text{ReMINE}}(X, Y)| < \eta.
\end{equation}
\end{proof}

\begin{lem}\label{supp:lem:ReMINE:estimation} (Estimation) Let $\eta > 0$. Given a neural network function $T_\theta$ with parameters $\theta \in \Theta$, there exists $N \in \mathbb{N}$ such that
\begin{equation}\label{supp:lem:ReMINE:estimation:proof:eq}
\forall n \geq N, \mathcal{P}(|\hat{I}_{\text{ReMINE}}(X,Y)_n - \hat{I}_{\text{ReMINE}}(X,Y)| \leq \eta) = 1,
\end{equation}
where $\hat{I}_{\text{ReMINE}}(X,Y)_n$ is the ReMINE representation which is empirically obtained by $n$ samples.
\end{lem}

\begin{proof}
We start by using the triangular inequality to write, 
\begin{multline}
    |\hat{I}_{\text{ReMINE}}(X,Y)_n - \sup_{\theta \in \Theta}\hat{I}_{\text{ReMINE}}(T_{\theta})| \leq \sup_{\theta \in \Theta}|\mathbb{E}_{\mathbb{P}}(T_{\theta}) - \mathbb{E}_{\mathbb{P}_n}(T_{\theta})| + \sup_{\theta \in \Theta}|\log\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}) - \log\mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})| \\ + \sup_{\theta \in \Theta}d(|\log\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}})- \log\mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})|, 0).
\end{multline}
Since the function $T_\theta$ is uniformly bounded by a constant $M$, $\log$ is Lipschitz continuous with constant $e^M$ on the interval $[e^{-M}, e^M]$, and we have 
\begin{equation}
|\log\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}) - \log\mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})| \leq e^M |\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}) - \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})|
\end{equation}
and
\begin{equation}
    d(|\log\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}})- \log\mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})|, 0) \leq d(e^M, 0) \cdot  d(|\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}})- \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})|, 0).
\end{equation}
Since $\Theta$ is compact and the feedforward network function is continuous, $T_\theta$ and $e^{T_\theta}$ satisfy the uniform law of
large numbers \citep{belghazi2018mutual}. Given $\eta > 0$, we can thus choose $N\in \mathbb{N}$ such that $\forall n \geq N$ and with probability $1$,
\begin{align}
\sup_{\theta \in \Theta}|\mathbb{E}_{\mathbb{P}}(T_{\theta}) - \mathbb{E}_{\mathbb{P}_n}(T_{\theta})| &\leq \frac{\eta}{3},\\ 
\sup_{\theta \in \Theta}|\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}) - \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})| &\leq e^{-M} \frac{\eta}{3},\\ 
\sup_{\theta \in \Theta}d(|\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}})- \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})|, 0) &\leq \frac{1}{ d(e^M, 0)}\frac{\eta}{3}.
\end{align}
Hence, this leads to 
\begin{equation}
|\hat{I}_{\text{ReMINE}}(X,Y)_n - \hat{I}_{\text{ReMINE}}(X,Y)|\leq \frac{\eta}{3} + \frac{\eta}{3} + \frac{\eta}{3} = \eta.
\end{equation}
\end{proof}

\begin{nothm} ReMINE is strongly consistent. 
\end{nothm}
\begin{proof}
Let $\epsilon > 0$. We apply \cref{supp:lem:ReMINE:approximation} and \cref{supp:lem:ReMINE:estimation} to find a neural network function $T_\theta$ and $N \in \mathbb{N}$ such that \cref{supp:lem:ReMINE:approximation:eq} and \cref{supp:lem:ReMINE:estimation:proof:eq} hold with $\eta = \epsilon/2$. By the triangular inequality, for all $n \geq N$ and with probability one, we have:
\begin{multline}
|I(X, Y) - \hat{I}_{\text{ReMINE}}(X,Y)_n| = |I_{\text{ReMINE}}(X, Y) - \hat{I}_{\text{ReMINE}}(X,Y)_n|  \quad(\because \text{Theorem~1})\\
\leq |I_{\text{ReMINE}}(X, Y) - \hat{I}_\text{ReMINE}(X,Y)| + |\hat{I}_{\text{ReMINE}}(X,Y)_n - \hat{I}_\text{ReMINE}(X,Y)| \leq \epsilon
\end{multline}
which proves the consistency.
\end{proof}

\paragraph{Sample complexity proof}
\begin{nothm} Assume that the functions $T_\theta$ are $M$-bounded and $\mathcal{L}$-Lipschitz with respect to the parameter $\theta$. The domain $\Theta \subset \mathbb{R}^d$ is bounded, so that $||\theta|| \leq K$ for some constant $K$. When using $k$ mini-batches to estimate MI, we have
\begin{equation}
\mathcal{P}(|\hat{I}_{\text{ReMINE}}(X,Y) - I(X, Y)| \leq \epsilon) \geq 1 - \delta
\end{equation}
whenever the number of samples $n$ for each batch satisfies 
\begin{equation}
n \geq \frac{2M^2(d\log(24K\mathcal{L}\sqrt{d}/\epsilon) + 2dM + \log(2/\delta))}{\epsilon^2k}.
\end{equation}
\end{nothm}

\begin{proof}
As the optimal $T^*$ of $I_{\text{ReMINE}}$ is also the solution of $I_{\text{MINE}}$, we can follow the proof of Theorem 6 of \citet{belghazi2018mutual}.
In contrast to MINE \citep{belghazi2018mutual}, we start from $\mathcal{P}(|\mathbb{E}_{\mathbb{Q}}[f]-\mathbb{E}_{\hat{\mathbb{Q}}}[f]| > \epsilon/6) \leq 2\exp(\frac{-\epsilon^2nk}{2M^2})$ by the Hoeffding inequality, because we use $n \cdot k$ samples and our loss function consists of three terms including the regularization term.
\end{proof}

\subsection{Mathematical properties of \texorpdfstring{$I_\text{ReNWJ}$}{ReNWJ estimator}}%\label{supp:proof:ReNWJ}
\paragraph{Consistency Proof} We show the proof of the consistency for the ReNWJ-based estimator. As in the proof of ReMINE consistency, we assume that the input space of the functions below is a compact domain, and all measures are absolutely continuous with respect to the Lebesgue measure. We will also restrict to families of feedforward functions with continuous activations, with a single output neuron. We provide a proof for the case where $d(\cdot, \cdot)$ is the log-Euclidean distance in this subsection.

\begin{lem}\label{supp:lem:ReNWJ:approximation} (Approximation) Let $\eta > 0$. There exists a neural network function $T_{\theta}$ with parameters $\theta \in \Theta$ such that 
\begin{equation}\label{supp:lem:ReNWJ:approximation:eq}
|\hat{I}_{\text{ReNWJ}}(X, Y) - I_{\text{ReNWJ}}(X, Y)| \leq \eta
\end{equation}
where
\begin{equation}
\hat{I}_\text{ReNWJ}(X, Y)= \sup_{\theta \in \Theta} \mathbb{E}_\mathbb{P}(T_\theta) - \mathbb{E}_\mathbb{Q}(e^{T_\theta - 1}) - d(\mathbb{E}_\mathbb{Q}(e^{T_\theta-1}), 1).
\end{equation}
\end{lem}

\begin{proof} 
Without loss of generality, we set $T^*=\log\frac{d\mathbb{P}}{d\mathbb{Q}} + 1$. By construction, $T^*$ satisfies
\begin{equation}
    \mathbb{E}_{\mathbb{P}}(T^*) = 1 + I(X, Y), \quad \mathbb{E}_{\mathbb{Q}}(e^{T^*-1}) = 1.
\end{equation}
    
For a function $T$,
\begin{align}
    I_{\text{ReNWJ}}&(X, Y) - \hat{I}_{\text{ReNWJ}}(X, Y) \\
    &\leq \mathbb{E}_{\mathbb{P}}(T^* -T) + \mathbb{E}_{\mathbb{Q}}(e^{T-1}) - \mathbb{E}_{\mathbb{Q}}(e^{T^* - 1}) + d(\mathbb{E}_{\mathbb{Q}}(e^{T-1}), 1) - d(\mathbb{E}_{\mathbb{Q}}(e^{T^*-1}), 1)\\
    &\leq \mathbb{E}_{\mathbb{P}}(T^* -T) + \mathbb{E}_{\mathbb{Q}}(e^{T-1} - e^{T^*-1}) + d(\mathbb{E}_{\mathbb{Q}}(e^{T-1}), \mathbb{E}_{\mathbb{Q}}(e^{T^*-1})) \\
    &\leq \mathbb{E}_{\mathbb{P}}(T^* -T)
    + e^{-1}\mathbb{E}_{\mathbb{Q}}(e^{T}-e^{T^*}) + d(\mathbb{E}_{\mathbb{Q}}(e^{T-1}), 1) \label{supp:lem:ReNWJ:approximation:proof:eq}
\end{align}
where $d(\cdot, \cdot)$ is the log-Euclidean distance on $\mathbb{R}$.
Fix $\eta > 0$. By the universal approximation theorem, we may choose a feedforward network function $T_{\theta} \leq M$ with $M>1$ such that
\begin{equation}
\mathbb{E}_{\mathbb{P}}|T^* - T_\theta| \leq \frac{\eta}{3}, \quad  \mathbb{E}_{\mathbb{Q}}|T^* - T_\theta| \leq \frac{\eta}{3}e^{1-M}, \quad \text{and }\ d(\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}), e) \leq \frac{\eta}{3}.
\end{equation}
Since $\exp$ is Lipschitz continuous with constant $e^M$ on $(-\infty, M]$, we have

\begin{equation}\label{supp:lem:ReNWJ:approximation:proof:eq-term2}
\mathbb{E}_{\mathbb{Q}}|e^{T^*} - e^{T_\theta}| \leq e^{M}\mathbb{E}_{\mathbb{Q}}|T^* - T_\theta| \leq \frac{\eta}{3}e.
\end{equation}
And
\begin{equation}\label{supp:lem:ReNWJ:approximation:proof:eq-term3}
d(\mathbb{E}_{\mathbb{Q}}(e^{T-1}), 1) = d(\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}), \mathbb{E}_{\mathbb{Q}}(e^{T^*}))  \leq d(\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}), e)
\leq \frac{\eta}{3}.
\end{equation}

From \cref{supp:lem:ReNWJ:approximation:proof:eq}, \cref{supp:lem:ReNWJ:approximation:proof:eq-term2}, \cref{supp:lem:ReNWJ:approximation:proof:eq-term3} and the triangular inequality, we then obtain
\begin{equation}
|\hat{I}_{\text{ReNWJ}}(X, Y) - I_{\text{ReNWJ}}(X, Y)| < \eta.
\end{equation}
\end{proof}

\begin{lem}\label{supp:lem:ReNWJ:estimation} (Estimation) Let $\eta > 0$. Given a neural network function $T_\theta$ with parameters $\theta \in \Theta$, there exists $N \in \mathbb{N}$ such that
\begin{equation}\label{supp:lem:ReNWJ:estimation:proof:eq}
\forall n \geq N, \mathcal{P}(|\hat{I}_{\text{ReNWJ}}(X,Y)_n - \hat{I}_{\text{ReNWJ}}(X,Y)| \leq \eta) = 1,
\end{equation}
where $\hat{I}_{\text{ReNWJ}}(X,Y)_n$ is the ReNWJ representation which is empirically obtained by $n$ samples.
\end{lem}

\begin{proof}
We start by using the triangular inequality to write, 
\begin{multline}
    |\hat{I}_{\text{ReNWJ}}(X,Y)_n - \sup_{\theta \in \Theta}\hat{I}_{\text{ReNWJ}}(T_{\theta})| \leq \sup_{\theta \in \Theta}|\mathbb{E}_{\mathbb{P}}(T_{\theta}) - \mathbb{E}_{\mathbb{P}_n}(T_{\theta})| + \sup_{\theta \in \Theta}|\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}-1}) - \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}-1})| \\ + \sup_{\theta \in \Theta}d(\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}-1}),  \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}-1})).
\end{multline}

Since $\Theta$ is compact and the feedforward network $T_\theta$ is continuous and uniformly bounded by a constant $M$, $T_\theta$ and $e^{T_\theta}$ satisfy the uniform law of
large numbers \citep{belghazi2018mutual}. Given $\eta > 0$, we can thus choose $N\in \mathbb{N}$ such that $\forall n \geq N$ and with probability $1$,
\begin{align}
\sup_{\theta \in \Theta}|\mathbb{E}_{\mathbb{P}}(T_{\theta}) - \mathbb{E}_{\mathbb{P}_n}(T_{\theta})| &\leq \frac{\eta}{3}, \label{supp:lem:ReNWJ:estimation:proof:eq-term1}\\ 
\sup_{\theta \in \Theta}e^{-1}|\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}}) - \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})| &\leq  \frac{\eta}{3}e^{-M}, \label{supp:lem:ReNWJ:estimation:proof:eq-term2}\\ 
\sup_{\theta \in \Theta}d(\frac{\mathbb{E}_{\mathbb{Q}}(e^{T_{\theta}})}{ \mathbb{E}_{\mathbb{Q}_n}(e^{T_{\theta}})}, 1) &\leq \frac{\eta}{3}. \label{supp:lem:ReNWJ:estimation:proof:eq-term3}
\end{align}
Hence, this leads to 
\begin{equation}
|\hat{I}_{\text{ReNWJ}}(X,Y)_n - \hat{I}_{\text{ReNWJ}}(X,Y)|\leq \frac{\eta}{3} + \frac{\eta}{3} + \frac{\eta}{3} = \eta.
\end{equation}
\end{proof}

\begin{nothm} The ReNWJ estimator is strongly consistent. 
\end{nothm}
\begin{proof}
Let $\epsilon > 0$. We apply \cref{supp:lem:ReNWJ:approximation} and \cref{supp:lem:ReNWJ:estimation} to find a neural network function $T_\theta$ and $N \in \mathbb{N}$ such that \cref{supp:lem:ReNWJ:approximation:eq} and \cref{supp:lem:ReNWJ:estimation:proof:eq} hold with $\eta = \epsilon/2$. By the triangular inequality, for all $n \geq N$ and with probability one, we have
\begin{multline}
|I(X, Y) - \hat{I}_{\text{ReNWJ}}(X,Y)_n| = |I_{\text{ReNWJ}}(X, Y) - \hat{I}_{\text{ReNWJ}}(X,Y)_n|  \quad(\because \text{ReNWJ representation})\\
\leq |I_{\text{ReNWJ}}(X, Y) - \hat{I}_\text{ReNWJ}(X,Y)| + |\hat{I}_{\text{ReNWJ}}(X,Y)_n - \hat{I}_\text{ReNWJ}(X,Y)| \leq \epsilon
\end{multline}
which proves the consistency.
\end{proof}

\paragraph{Sample complexity proof}
\begin{nothm} Assume that the function $T_\theta$ satisfies $1 \leq |T_\theta| < M$ and is $\mathcal{L}$-Lipschitz with respect to the parameter $\theta$. The domain $\Theta \subset \mathbb{R}^d$ is bounded, so that $||\theta|| \leq K$ for some constant $K$. When using $k$ mini-batches to estimate MI and $d(x, 1) \leq |x - 1|$, we have
\begin{equation}
\mathcal{P}(|\hat{I}_{\text{ReNWJ}}(X, Y) - I(X, Y)| \leq \epsilon) \geq 1 - \delta
\end{equation}
whenever the number of samples $n$ for each batch satisfies 
\begin{equation}\label{supp:lem:ReNWJ:sample:eq}
n \geq \frac{2M^2(d\log(24K\mathcal{L}\sqrt{d}/\epsilon) + 2dM + \log(2/\delta))}{\epsilon^2k}.
\end{equation}
\end{nothm}

\begin{proof}
By taking the assumptions of \cref{supp:lem:ReNWJ:estimation}, we begin with \cref{supp:lem:ReNWJ:estimation:proof:eq-term1}, \cref{supp:lem:ReNWJ:estimation:proof:eq-term2} and \cref{supp:lem:ReNWJ:estimation:proof:eq-term3}. By the Hoeffding inequality, for every function $f$,
\begin{equation}\label{supp:lem:ReNWJ:sample:proof:eq-result}
\mathcal{P}(|\mathbb{E}_{\mathbb{Q}}[f]-\mathbb{E}_{\hat{\mathbb{Q}}}[f]| > \epsilon/6) \leq 2\exp(\frac{-\epsilon^2(n \cdot k)}{2M^2}).
\end{equation}

To extend this inequality to a uniform inequality over all functions $T_\theta$ and $e^{T_\theta}$, we choose a minimal cover of the domain $\Theta \subset \mathbb{R}^d$ by a finite set of small balls of radius $\eta$, $\Theta \subset \cup_{j}B_{\eta}(\theta_j)$, and the union bound. The minimal cardinality of such covering is bounded by the covering number $N_\eta(\Theta)$ of $\Theta$,
\begin{equation}
    N_\eta(\Theta) \leq \left(\frac{2K\sqrt{d}}{\eta}\right)^d.
\end{equation}

Successively applying a union bound in \cref{supp:lem:ReNWJ:sample:proof:eq-result} with the set of functions $\{T_{\theta_j}\}_j$, and $\{e^{T_{\theta_j}}\}_j$, we have
\begin{equation}\label{supp:lem:ReNWJ:sample:proof:eq-term1}
    \mathcal{P}\left(\max_{j}|\mathbb{E}_\mathbb{Q}(T_{\theta_j}) - \mathbb{E}_{\hat{\mathbb{Q}}}(T_{\theta_j})| \geq \frac{\epsilon}{6} \right) \leq 2N_{\eta}(\Theta)\exp(-\frac{\epsilon^2(n \cdot k)}{2M^2}),
\end{equation}
\begin{equation}\label{supp:lem:ReNWJ:sample:proof:eq-term2}
    \mathcal{P}\left(\max_{j}|\mathbb{E}_\mathbb{Q}(e^{T_{\theta_j}}) - \mathbb{E}_{\hat{\mathbb{Q}}}(e^{T_{\theta_j}})| \geq \frac{\epsilon}{6} \right) \leq 2N_{\eta}(\Theta)\exp(-\frac{\epsilon^2(n \cdot k)}{2M^2}).
\end{equation}

We now choose the ball radius to be $\eta = \frac{\epsilon}{12\mathcal{L}}e^{-2M}$. Solving for $n$ the inequality
\begin{equation}
    2N_\eta(\Theta)\exp(-\frac{\epsilon^2 (n \cdot k)}{2M^2})\leq \delta,
\end{equation}

we deduce from \cref{supp:lem:ReNWJ:sample:proof:eq-term1} that, whenever \cref{supp:lem:ReNWJ:sample:eq} holds, with probability at least $1-\delta$, for all $\theta \in \Theta$,
\begin{align}\begin{split}
|\mathbb{E}_\mathbb{Q}(T_{\theta}) - \mathbb{E}_\mathbb{\hat{Q}}(T_{\theta})| & \leq |\mathbb{E}_\mathbb{Q}(T_{\theta}) - \mathbb{E}_\mathbb{Q}(T_{\theta_j})| + |\mathbb{E}_\mathbb{Q}(T_{\theta_j}) - \mathbb{E}_\mathbb{\hat{Q}}(T_{\theta_j})| + 
|\mathbb{E}_\mathbb{\hat{Q}}(T_{\theta_j}) - \mathbb{E}_\mathbb{\hat{Q}}(T_{\theta})|  \\
& \leq \frac{\epsilon}{12}e^{-2M} + \frac{\epsilon}{6} +\frac{\epsilon}{12}e^{-2M} < \frac{\epsilon}{3}.
\end{split}\end{align}

Similarly, using \cref{supp:lem:ReNWJ:sample:proof:eq-term2}, we get that with probability at least $1-\delta$,
\begin{equation}
|\mathbb{E}_\mathbb{Q}(e^{T_{\theta}-1})  - \mathbb{E}_\mathbb{\hat{Q}}(e^{T_{\theta}-1}) | \leq \frac{\epsilon}{3} < e \cdot \frac{\epsilon}{3}.
\end{equation}

Hence,
\begin{multline}
|\hat{I}_\text{ReNWJ}(X, Y) - I(X,Y)| \leq |\mathbb{E}_\mathbb{Q}(T_{\theta_{j}}) - \mathbb{E}_\mathbb{\hat{Q}}(T_{\theta_{j}})| + | \mathbb{E}_\mathbb{Q}(e^{T_{\theta_{j}}-1}) - \mathbb{E}_\mathbb{\hat{Q}}(e^{T_{\theta_{j}}-1}) | + d(\mathbb{E}_\mathbb{Q}(e^{T_{\theta_{j}}}), \mathbb{E}_\mathbb{\hat{Q}}(e^{T_{\theta_{j}}})) \\
\leq |\mathbb{E}_\mathbb{Q}(T_{\theta_{j}}) - \mathbb{E}_\mathbb{\hat{Q}}(T_{\theta_{j}})| + e^{-1}| \mathbb{E}_\mathbb{Q}(e^{T_{\theta_{j}}}) - \mathbb{E}_\mathbb{\hat{Q}}(e^{T_{\theta_{j}}}) | + | \mathbb{E}_\mathbb{Q}(e^{T_{\theta_{j}}}) - \mathbb{E}_\mathbb{\hat{Q}}(e^{T_{\theta_{j}}}) |\leq \epsilon.
\end{multline}
\end{proof}

\subsection{The property of MI estimators}
\paragraph{The variance of the exponential value of the statistic network's output according to the bias of optimal functions on the distribution $\mathbb{Q}$.}

\begin{nothm}
Let $\mathbb{Q}^{(n)}$ be the empirical distribution of $n$ i.i.d. samples from $\mathbb{Q}$. For the optimal $T_{1} = \log\frac{d\mathbb{P}}{d\mathbb{Q}} + C_{1}$ and $T_{2} = \log\frac{d\mathbb{P}}{d\mathbb{Q}} + C_{2}$ where $C_1 \geq C_2$,
\begin{equation}
\text{Var}_{\mathbb{Q}}(\mathbb{E}_{\mathbb{Q}^{(n)}}(e^{T_1})) \geq \text{Var}_{\mathbb{Q}}(\mathbb{E}_{\mathbb{Q}^{(n)}}(e^{T_2})).
\end{equation}
\end{nothm}

\begin{proof}
Consider that 
\begin{equation}
\text{Var}_{\mathbb{Q}}(e^{T_1}) = e^{2C_1}\left(\mathbb{E}_{\mathbb{Q}}((\frac{d \mathbb{P}}{d \mathbb{Q}})^2) - (\mathbb{E}_{\mathbb{Q}}(\frac{d \mathbb{P}}{d \mathbb{Q}}))^2 \right), 
\end{equation}
and
\begin{equation}
\text{Var}_{\mathbb{Q}}(e^{T_2}) = e^{2C_2}\left(\mathbb{E}_{\mathbb{Q}}((\frac{d \mathbb{P}}{d \mathbb{Q}})^2) - (\mathbb{E}_{\mathbb{Q}}(\frac{d \mathbb{P}}{d \mathbb{Q}}))^2 \right).
\end{equation}
 
By \citet{Song2020Understanding}, the variance of the mean of $n$ i.i.d. random variables then gives us
\begin{equation}
    \text{Var}_{\mathbb{Q}}(\mathbb{E}_{\mathbb{Q}^{(n)}}(e^{T_1})) = \frac{\text{Var}_{\mathbb{Q}}(e^{T_1})}{n} \text{, }    \text{Var}_{\mathbb{Q}}(\mathbb{E}_{\mathbb{Q}^{(n)}}(e^{T_2})) = \frac{\text{Var}_{\mathbb{Q}}(e^{T_2})}{n}.
\end{equation} 

Since $e^x \geq 1$ for all $x \geq 0$, 

\begin{equation}
    \frac{\text{Var}_{\mathbb{Q}}(\mathbb{E}_{\mathbb{Q}^{(n)}}(e^{T_1}))}{\text{Var}_{\mathbb{Q}}(\mathbb{E}_{\mathbb{Q}^{(n)}}(e^{T_2}))} = \frac{\frac{\text{Var}_{\mathbb{Q}}(e^{T_1})}{n}}{\frac{\text{Var}_{\mathbb{Q}}(e^{T_2})}{n}} = e^{2(C_1 - C_2)} \geq 1.
\end{equation} 

Therefore, the variance of the empirical mean of $e^{T_1}$ is greater than or equal to that of $e^{T_2}$ on $\mathbb{Q}$.
\end{proof}

\paragraph{Proof of estimation bias caused by drifting} \begin{nothm}
When used on DV representation, the two averaging strategies below produce a biased MI estimate if the drifting problem occurs.
    \begin{enumerate}
        \item Macro-averaging (similar to that of \citet{poole2019variational}): Establish a single estimate through the average of estimated MI from each batch.
        \item Micro-averaging: Calculate the DV representation using the average of all the individual network outputs.
    \end{enumerate}
\end{nothm}

\begin{proof}
We start from the definition of $I_{\text{DV}}$, where
\begin{equation}
    I_{\text{DV}}(X,Y) = \mathbb{E}_{\mathbb{P}}(T(x, y)) - \log(\mathbb{E}_{\mathbb{Q}}(e^{T(x, y)}))
\end{equation}
becomes the objective function to estimate MI, i.e. MINE.

Let $T_{ij}^{(J)}$ and $T_{ij}^{(M)}$ denote the $ij$-th network outputs for samples from $\mathbb{P}_m$ and $\mathbb{Q}_n$, respectively, where $i$ is the index of the batch and $j$ is the index of the sample inside the batch. Let $T^*_{ij}$ denote the non-drifting output and $C_i$ the drifting constant of the $i$-th batch.
Then, $T_{ij} = T^*_{ij} + C_i$.

When the number of batches is $B$ and each batch size is $N$, 
\begin{enumerate}
    \item Macro averaging: 
\begin{align}
  &\frac{1}{B} \Sigma_i {[
    \frac{1}{N} \Sigma_j T_{ij}^{(J)}
    - \log (\frac{1}{N} \Sigma_j e^{{T_{ij}^{(M)}}})
]} \\
 = &\frac{1}{B} \Sigma_i {[
    \frac{1}{N} \Sigma_j (T_{ij}^{(J*)} + C_i)
    - \log (\frac{1}{N} \Sigma_j e^{{T_{ij}^{(M*)} + C_i}})
]} \\
 = & \frac{1}{B} \Sigma_i {[
    \frac{1}{N} \Sigma_j (T_{ij}^{(J*)} + C_i)
    - \log (\frac{1}{N} e^{C_i} \Sigma_j e^{{T_{ij}^{(M*)}}})
]} \\
 = & \frac{1}{B} \Sigma_i {[
    \frac{1}{N} \Sigma_j T_{ij}^{(J*)}
    - \log (e^{-C_i} \frac{1}{N} e^{C_i} \Sigma_j e^{T_{ij}^{(M*)}})
]} \\
= & \frac{1}{B} \Sigma_i {[
    \frac{1}{N} \Sigma_j T_{ij}^{(J*)}
    - \log (\frac{1}{N} \Sigma_j e^{T_{ij}^{(M*)}})
]} \\
= & \frac{1}{NB} \Sigma_{ij} T_{ij}^{(J*)}
- \frac{1}{B} \Sigma_{i}{[
    \log (\frac{1}{N} \Sigma_j e^{T_{ij}^{(M*)}})
]} \\
\neq & \frac{1}{NB} \Sigma_{ij} T_{ij}^{(J*)}
    - \log (\frac{1}{NB} \Sigma_{ij} e^{T_{ij}^{(M*)}})
\end{align}

    \item Micro averaging: 
\begin{align}
&\frac{1}{NB} \Sigma_{ij} T_{ij}^{(J)}
    - \log (\frac{1}{NB} \Sigma_{ij} e^{T_{ij}^{(M)}}) \\
= &\frac{1}{NB} \Sigma_{ij} (
        T_{ij}^{(J*)} + C_i
    )
    - \log (
        \frac{1}{NB} \Sigma_{ij} e^{
            (T_{ij}^{(M*)} + C_i )
        })
    \\
= &\frac{1}{NB} \Sigma_{ij} T_{ij}^{(J*)}
    - \log [
        e^{-\frac{1}{B} \Sigma_i C_i}
        \frac{1}{NB} \Sigma_{ij} e^{
            (T_{ij}^{(M*)} + C_i)
        }
        ] \\
\neq & \frac{1}{NB} \Sigma_{ij} T_{ij}^{(J*)}
    - \log (\frac{1}{NB} \Sigma_{ij} e^{T_{ij}^{(M*)}})
\end{align}
\end{enumerate}
\end{proof}
We emphasize that we have to stop the drifting via the regularization term of ReDV.

\paragraph{Wrong estimation derived from biased values}
According to the theorem above, the MI estimate derived from the average of the values estimated from the mini-batch in DV representation-based estimators will lead to erroneous results. 
However, the micro-averaging strategy is often used to measure the performance of MI estimators (MINE or InfoNCE), as shown in Fig. 6 of \cite{pmlr-v119-cheng20b}.

\subsection{The proof for the validity of our benchmark}
We assume that the dataset used for our benchmark satisfies the single label assumption where there exists exactly one label for every sample inside the dataset.
Note that the assumption implies that $p(y|x) = 1$.
In other words, we assume statistical dependence between $X$ and $Y$ \citep{tishby2015deep}. 

\begin{nothm} (Supervised Learning Benchmark) Consider a dataset $D = (X, Y)$ where $Y$ is the label for sample $X$, and $H(Y)$ is the entropy of $Y$. 
\begin{equation}
    I(X, Y) = H(Y)
\end{equation}
\end{nothm}

\begin{proof}
\begin{align}
I(X;Y) &= \int_{X, Y} P(X, Y) \log {\frac{P(X, Y)}{P(X)P(Y)}} \\
&=\int_x \int_y P(x, y) \log {\frac{P(y|x)}{P(y)}} dy dx \\
&=\int_x \int_y P(x)P(y|x) \log {\frac{P(y|x)}{P(y)}} dy dx \\
&=\int_x P(x) \left(\int_y P(y|x) \log {\frac{P(y|x)}{P(y)}} dy \right) dx \\
&=\int_{R} P(x^*) \log \frac{1}{P(y^*)} \, dx^* \quad \text{(where $R$ is the region where $y^*$ is a correct label for the given $x^*$)} \\
&=\sum_c \int_{R_c} P(x^*, c) \log\frac{1}{P(c)} \, dx^* \quad \text{(where $R$ is partitioned by the label $c$ to yield $R_c$)} \\
& =\sum_c \log\frac{1}{P(c)} \int_{R_c} P(x^*, c) \, dx^* \quad \text{($\because P(c)$ is constant inside $R_c$)} \\
&=\sum_c\log\frac{1}{P(c)} P(c) \quad \text{($\because \int_{R_c} P(x^*, c) \, dx^* = P(c)$, i.e., marginalization)} \\
&=H(Y)
\end{align}
\end{proof}

\begin{nothm} (Contrastive Learning Benchmark) Consider a dataset $D = (X, Y)$. Let $X_1$ be a sample drawn from the dataset with the label $Y$ and $X_2$ be another sample drawn from the subset of $D$ where all the samples inside the subset are with the same label $Y$. Assume that $D$ also satisfies the single label assumption.
\begin{equation}
I(X_1, X_2) = I(X_1, Y) = I(X_2, Y) = H(Y)
\end{equation}
\end{nothm}

\begin{proof}
\begin{align*}
P(X_1, X_2) &= \sum_{y_i} P(X_1, X_2, Y) \quad (\because \text{marginalization}) \\
&=\sum_{y_i} P(X_1)P(Y|X_1)P(X_2|Y, X_1) \quad (\because \text{factorization})\\
&=\sum_{y_i} P(Y)P(X_1|Y)P(X_2|Y) \quad (\because \text{$X_1$ and $X_2$ are independent for given $Y$}) \\
&=\sum_{i}P(y_i)P(X_1|y_i)P(X_2|y_i) \\
\end{align*}
\begin{align*}
P(X_1) &= \sum_{y_i} P(X_1, Y)= \sum_{y_i} P(Y)P(X_1|Y) = \sum_{i} P(y_i)P(X_1|y_i) \\
P(X_2) &= \sum_{y_i} P(X_2, Y)= \sum_{y_i} P(Y)P(X_2|Y) = \sum_{i} P(y_i)P(X_2|y_i) \\
\end{align*}
\begin{align*}
\frac{P(X_1, X_2)}{P(X_1)P(X_2)} &=\frac{\sum_{i}P(y_i)P(X_1|y_i)P(X_2|y_i)}{\sum_{i} P(y_i)P(X_1|y_i)\sum_{i} P(y_i)P(X_2|y_i)} \\
&=\frac{\sum_{i}P(y_i)P(X_1|y_i)P(X_2|y_i)}{\sum_{i}P(y_i)^2P(X_1|y_i)P(X_2|y_i)} \quad \text{ ($\because X_1$ and $X_2$ have the same label)}
\end{align*}

Let $R_i$ be the region of $X_1$ in which the $i$-th class label $y_i$ is the correct label for the given $X_1$.
\begin{align*}
I(X_1, X_2) &= \int_{X_1, X_2} P(X_1, X_2) \log {\frac{P(X_1, X_2)}{P(X_1)P(X_2)}} \\
&= \int_{X_1, X_2}\left(\sum_{i}P(y_i)P(X_1|y_i)P(X_2|y_i)\right)\log\frac{\sum_{i}P(y_i)P(X_1|y_i)P(X_2|y_i)}{\sum_{i}P(y_i)^2P(X_1|y_i)P(X_2|y_i)} \\
&=\sum_{i}P(y_i)\int_{X_2}P(X_2|y_i)\left(\int_{R_i}P(X_1|y_i)\log\frac{P(y_i)P(X_1|y_i)P(X_2|y_i)}{P(y_i)^2P(X_1|y_i)P(X_2|y_i)}dx_1\right)dx_2\\
&=\sum_{i}P(y_i)\int_{X_2}P(X_2|y_i)\left(\int_{R_i}P(X_1|y_i)\log\frac{1}{P(y_i)}dx_1\right)dx_2\\
&=\sum_{i}P(y_i)\log\frac{1}{P(y_i)}\int_{X_2}P(X_2|y_i)\left(\int_{R_i}P(X_1|y_i)dx_1\right)dx_2\\
&=\sum_{i}P(y_i)\log\frac{1}{P(y_i)}\\
&=H(Y)
\end{align*}
\end{proof}

\section{Directly Utilizing the Statistics Network Outputs for Out-of-distribution Task}
\begin{figure}[ht]
    \centering
    \subfloat[]{\includegraphics[width=0.3\columnwidth]{asset/fig-ood-seen.png}} 
    \subfloat[]{\includegraphics[width=0.3\columnwidth]{asset/fig-ood-unseen.png}}
    \caption{
        Histogram of the exponential of the outputs $e^{T(x, y)}$ of the network trained with SLB CIFAR10. 
        Training samples and unseen samples are fed to (a) and (b), respectively.
    }
\label{supp:figs:supp_ood}
\end{figure}

We observe the SLB CIFAR10-trained network outputs when seen or unseen samples are fed to the statistics network $T$ in \cref{supp:figs:supp_ood}.
Note that we can take $e^{T(x, y)} = \frac{d\mathbb{P}_{XY}}{d\mathbb{P}_{X} \otimes \mathbb{P}_{Y}}$ for granted, thanks to regularization.
\cref{supp:figs:supp_ood} (a) shows the distribution of $e^{T(x, y)}$ for the training set samples $(x, y) \sim \mathbb{P}_{X_{\text{Train}}Y_{\text{Train}}}$.
As 90\% of $(x, y) \sim \mathbb{P}_X \otimes \mathbb{P}_Y$ is wrongly labeled, the majority yields $e^{T(x, y)} = 0$.
The likelihood ratio for the $(x, y) \sim \mathbb{P}_{XY}$ is $10$, and all the samples are centered around the ideal value as expected.
CIFAR10 test set samples $(x, y) \sim \mathbb{P}_{X_{\text{Test}}Y_{\text{Test}}}$ also yield similar results, except that some of the samples are wrongly positioned, which reflects the test error of $T$.
Surprisingly, when we feed MNIST \citep{lecun1998gradient} training samples $(x, y) \sim \mathbb{P}_{X_{\text{MNIST}}Y_{\text{MNIST}}}$, the model successfully classifies nearly all the samples to be less likely to occur in $\mathbb{P}_{X_{\text{Train}}Y_{\text{Train}}}$.
This implies that exploiting the network outputs with the viewpoints of MI may show usefulness in out-of-distribution detection.

%%figure
\begin{figure}[ht] %%% t: top, b: bottom, h: here
\centering
\subfloat{\includegraphics[width=0.3\columnwidth]{asset/mine-imagenet-plot.pdf}}
\subfloat{\includegraphics[width=0.3\columnwidth]{asset/remine-imagenet-plot.pdf}}
\caption{
    Training $T_\theta$ using $I_\text{MINE}$ and $I_\text{ReMINE}$ with batch size $100$ for $20$ epochs.
    We break down the MI loss into two components.
    We split both losses into the first term \textcolor{blue}{\textbf{$\mathbb{E}_{\mathbb{P}_{X X}}(T)$}} and the second term \textcolor{orange}{\textbf{$\log\mathbb{E}_{\mathbb{P}_X \otimes \mathbb{P}_X}(e^T)$}}.
}
\label{figs:imagenet_drifting}
\end{figure}

\section{Experiments on ImageNet}\label{supp:imagenet}
%table
\begin{table*}[ht]
\centering
% \small

\begin{tabular}{c|c|cc|cc}
\Xhline{1.2pt}
\multirow{2}{*}{Task}  & \multirow{2}{*}{Loss} & \multicolumn{2}{c|}{MI Estimation} & \multicolumn{2}{c}{Test Accuracy} \\ 
& & Original & Regularized & Original & Regularized \\
\hline

\multirow{3}{*}{Supervised Learning Benchmark} 
& CE & - & - & 0.0795 & - \\
& MINE & 6.147 & 6.110 & 0.1056 & 0.1081 \\
& NWJ & 6.072 & 6.075 & 0.1020 & 0.1005 \\
\hline

\multirow{2}{*}{Contrastive Learning Benchmark} & MINE & 1.095 & 1.140 & 0.0103 & 0.0098 \\
& NWJ & 0.000 & 1.008 & 0.0010 & 0.0072 \\
\hline\Xhline{1.2pt}

\end{tabular}
\caption{Our supervised and contrastive learning benchmark results on the ImageNet dataset.
We provide the MI estimation and test accuracy, where we clip the negative MI estimations to 0.
We compare the performance of original and regularized loss.
We also add the accuracy of standard cross-entropy loss (CE) for comparison.
Similar to \cref{subsec:comparison}, we choose the regularization weight $\lambda \in \{0.1, 0.01, 0.001\}$ that shows the best MI estimation results.
}
\label{table:supp_imagenet_benchmark}
\end{table*}
We test on the ImageNet dataset with $1000$ classes, where we use the batch size of $100$.
We set the batch size to be relatively small to observe how different losses behave, whereas several contrastive learning works such as \cite{chen2020simple, he2020momentum} use large batch sizes to avoid instability.
We train for $20$ epochs to observe the early stages of training.

First, we can observe in \cref{figs:imagenet_drifting} that the regularizer successfully solves the drifting problem of $I_\text{MINE}$.
Also, \cref{table:supp_imagenet_benchmark} shows that $I_\text{NWJ}$ fails in the contrastive learning benchmark.
$I_\text{NWJ}$ explodes within a few steps of training, where the regularizer successfully avoids the problem to yield a feasible output.
Note that we did not observe the losses till convergence; we have to train much longer to obtain a more accurate performance of MI estimation and test accuracy.
However, we can see that in the supervised learning benchmark, which is the relatively easier benchmark, all the losses are already close to the optimal MI even in the earlier epochs.
We can also observe a similar trade-off between the MI estimation and test accuracy in \cref{table:supp_imagenet_benchmark}.
Future work on large-scale datasets is needed to observe the behaviors further.

\section{Experimental Details}
In this section, we provide the experiment details in the manuscript with the accompanying code \url{https://github.com/Siyeong-Lee/Deconstructing-MINE}.

\subsection{Hardware Specification}
We use a single NVIDIA DGX A100 machine with 8 GPUs for all the experiments.
All the experiments except for our benchmark experiments take less than 10 minutes and a single GPU to compute.
It takes less than 2 days to compute all the benchmark experiments: 4 settings, 12 losses, and 5 seeds running on 8 GPUs and 4 processes per GPU.

\subsection{Detailed Settings for One-hot Dataset Experiments} \label{subsec:supp_onehot}
We describe the detailed settings for \cref{figs:onehot_toy_stable_loss}, \cref{figs:onehot_toy_stable_scatter}, \cref{figs:onehot_toy_unstable}, \cref{figs:remine_different_c}, and \cref{figs:onehot_toy_remine}.
We choose $N=16$ for the one-hot discrete dataset $X \sim U(1, N)$.
We use a simple statistics network $T$ with a concatenated vector of dimension $N\times2=32$ as input.
We pass the input through two fully connected layers with ReLU activation by widths: $32$--$256$--$1$.
The last layer outputs a single scalar with no bias or activation.
We use stochastic gradient descent (SGD) with learning rate $0.1$ to optimize the statistics network unless specified.

\subsection{Detailed Settings for Our Benchmark} \label{subsec:supp_our_benchmark}
We describe the detailed settings for \cref{table:sota_benchmark} and \cref{figs:our_benchmark_ablation}.
We use ResNet-18 \citep{he2016deep} as the backbone network and use Adam optimizer with the default learning rate $0.001$, $\beta_1=0.9$ and $\beta_2=0.999$.
We use batch size $100$ for CIFAR100 and $10$ for CIFAR10.
We train for different epochs per each benchmark: $40$ epochs (SLB CIFAR10), $100$ epochs (SLB CIFAR100), $100$ epochs (CLB CIFAR10), and $150$ epochs (CLB CIFAR100).
We choose a sufficient number of epochs for all the losses to fully converge for each of the benchmarks.
We rerun the same experiment $5$ times with different seeds.

\subsection{Detailed Settings for the 20D Correlated Gaussian Task}
We describe the detailed settings for \cref{figs:gaussian_benchmark}. We sampled $(x, y)$ from a $d$-dimensional correlated Gaussian dataset where $X \sim N(\mathbf{0}, \mathbf{I}_d)$ and $Y \sim N(\rho X, (1-\rho^2)\mathbf{I}_d)$ given the correlation parameter $0 \leq \rho < 1$, which is taken from \citet{belghazi2018mutual}.
The true MI for the dataset is $I(X,Y)=-{\frac{d}{2}} \log(1-\rho^2)$.
For the statistics network architecture, we consider the architecture similar to \cref{subsec:supp_onehot} where we concatenate the inputs $(x,y)$ to pass through three fully connected layers with ReLU activation (excluding the output layer) by widths $40$--$256$--$256$--$1$, same as the network used in \citet{poole2019variational}.
We used the same optimizer as in \cref{subsec:supp_our_benchmark}.

\end{document}