% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} 

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % typesetting numbers and units
\usepackage{booktabs} % good-looking tables
\usepackage{tikz} % drawings and diagrams

%% Self-defined macros (and packages)
\usepackage{soul}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{xcolor}
\usepackage{amsmath}
\newcommand{\mathcolorbox}[2]{\colorbox{#1}{$\displaystyle #2$}}
% \newcommand{\omar}[1]{\textcolor{blue}{#1}}
% \newcommand{\alex}[1]{\textcolor{green}{#1}}
% \newcommand{\aapo}[1]{\textcolor{red}{#1}} 

% \usepackage[backref=page]{hyperref}
\input{math_commands.tex}  % optional math commands from github.com/goodfeli/dlbook_notation


\usepackage{amsthm}
\newtheoremstyle{mythmstyle}% name
{\topsep}%Space above
{-2\topsep}%Space below
{\itshape}%Body font
{0pt}%Indent amount
{\bfseries}% Theorem head font
{}%Punctuation after theorem head
{ }%Space after theorem head
{}% theorem head specification

\theoremstyle{mythmstyle}
\newtheorem{theorem}{Theorem}
\newtheorem{conjecture}{Conjecture}
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}
% \newcommand{\sectionbreak}{\clearpage}


\title{The Optimal Noise in Noise-Contrastive Learning Is \\ Not What You Think \\ (Supplementary Material)}

% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Omar Chehab}{}}
\author[1]{Alexandre Gramfort}
\author[2]{Aapo Hyv\"arinen}

% Add affiliations
\affil[1]{%
    Université Paris-Saclay, Inria, CEA, 
    Palaiseau, France
}
\affil[2]{%
    Department of Computer Science, University of Helsinki, Finland 
}
  

\begin{document}

\onecolumn
\setcounter{conjecture}{0}
\setcounter{theorem}{0}

\maketitle

\section{Visualizations of the MSE landscape}
\label{sec:mselandscape}

\begin{figure}[!ht] %!ht
\centering
\includegraphics[width=\columnwidth]{img/mse_vs_noiseval.pdf}
\caption{MSE vs. the noise parameter. Top left panel for model (i), Gaussian mean; Top right panel for model (ii), Gaussian variance; Bottom left for model (iii), Gaussian correlation.
}
\label{fig:msevsnoiseval}
\end{figure} 

We provide visualizations of the MSE landscape of the NCE estimator, when the noise is constrained within a parametric family containing the data.

We draw attention to the two local minima symmetrically placed to the left and to the right of the Gaussian mean. This corroborates the indeterminacies observed in this paper (Conjecture on limit of zero noise), as to where the optimal noise should place its mass for this estimation problem. 



\newpage

\section{Intractability of the 1D Gaussian case}
\label{sec:intractability}

Suppose the data distribution $p_d$ is a one-dimensional standardized zero-mean Gaussian. The model and noise distributions are of the same family, parameterized by mean and/or variance (we write these together in one model):

$$p_{\theta}(x) = \frac{1}{\sqrt{2\pi\alpha}}e^{-\frac{1}{2}\frac{(x-\mu)^2}{\alpha}} , \quad  p_{n}(x) = \frac{1}{\sqrt{2\pi\beta}}e^{-\frac{1}{2}\frac{(x-\pi)^2}{\beta}} \qquad x \in \sR$$

We can write out the relevant functions, evaluated at $\alpha=1,\mu=0$ as
     the 2D score:
    $$\vg(x) = 
        \begin{pmatrix}
        \partial_\mu \log p_\theta \\
        \partial_\alpha \log p_\theta
    \end{pmatrix} \bigg|_{\mu=0,\alpha=1}
=
    \begin{pmatrix}
        x \\
        \frac{1}{2}(x^2 - 1)
    \end{pmatrix}$$

    and its ``pointwise covariance'':
    $\vg(x)\vg(x)^\top = 
    \begin{pmatrix}
        x^2 & \frac{1}{2}(x^3 - x) \\
        \frac{1}{2}(x^3 - x) & \frac{1}{4}(x^4 - 2x^2 + 1)
    \end{pmatrix}$


In the following, we consider estimation of the variance only, i.e., only the second term in $\vm$ and the second diagonal term in the Fisher information matrix $I$.
Now we can compute the generalized score mean $m$ and mean of square $I$ as they intervene in the MSE formula for Noise-Contrastive Estimation:

\begin{align*}
    m & = 
    \int g(x) (1 - D(x)) p(x)dx
  %  =
%    \begin{pmatrix}
%        m_1 \\
%        m_2
 %   \end{pmatrix}
\end{align*}

which gives

\begin{equation*}
 %   \begin{cases}
 %       m_1 & = 0 \quad \text{(the integrand is an odd function)} \\
        m  = 
        -\frac{1}{2\sqrt{2\pi}} 
        \int 
        \left( e^{\frac{-x^2}{2}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\beta} e^{\frac{-x^2}{2}({1} - \frac{1}{\beta})}} \right)dx 
         + 
        \frac{1}{2\sqrt{2\pi}} 
        \int 
        x^2
        \left( e^{\frac{-x^2}{2}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\beta} e^{\frac{-x^2}{2}(1 - \frac{1}{\beta})}} \right)dx   
  %  \end{cases}
\end{equation*}

and 

\begin{align*}
    I & = 
    \int g(x)^2 (1 - D(x)) p(x)dx
 %   =
 %   \begin{pmatrix}
  %      I_{11} & I_{12} \\
%        I_{21} & 
%        I_{22} %\\
%    \end{pmatrix}
\end{align*}

which gives

\begin{multline*}
 %   \begin{cases}
 %       I_{12} & = I_{21} = 0 \quad %\text{(the integrand is an odd %function)} \\
  %      I_{11} & =
   %     \frac{1}{\sqrt{2\pi}} 
    %    \int 
     %   x^2
%        \left( e^{\frac{-x^2}{2}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\beta} e^{\frac{-x^2}{2}(1 - \frac{1}{\beta})}} \right)dx \\ 
        I  = 
        \frac{1}{4\sqrt{2\pi}} 
        \int 
        x^4
        \left( e^{\frac{-x^2}{2}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\beta} e^{\frac{-x^2}{2}(1 - \frac{1}{\beta})}} \right)dx 
         - 
        \frac{1}{2\sqrt{2\pi}} 
        \int 
        x^2
        \left( e^{\frac{-x^2}{2}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\beta} e^{\frac{-x^2}{2}(1 - \frac{1}{\beta})}} \right)dx \\
         +
        \frac{1}{4\sqrt{2\pi}} 
        \int 
        \left( e^{\frac{-x^2}{2}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\beta} e^{\frac{-x^2}{2}(1 - \frac{1}{\beta})}} \right)dx   
 %   \end{cases}
\end{multline*}


We see that even in a simple 1D Gaussian setting, evaluating the asymptotic MSE of the Noise-Contrastive Estimator is intractable in closed form, given the integrals in $m$ and $I$, where the integrand includes the product of a Gaussian density with the logistic function composed with the Gaussian density, further multiplied by monomials. While here we considered the case of variance, the intractability is seen even in the case of the mean.
%if we constrain the analysis to one of them, which would leave us with the diagonal entries $I_{11}$ or $I_{22}$
Optimizing the asymptotic MSE 
with respect to $\beta$ and $\pi$ (noise distribution) or $\nu$ (identifiable to the noise proportion)
yields similarly intractable integrals.
%,
%ven when imposing, for example, unit-variance on the data distribution $\alpha=1$.

\if0
\aapo{\bf OLD TEXT:}
In this case, we can write out:

\begin{itemize}

    \item the 2D score:
    $g(x) = 
    \begin{pmatrix}
        \frac{1}{\alpha}x \\
        -\frac{1}{2\alpha} + \frac{1}{2\alpha^2}x^2
    \end{pmatrix}$

    \item its pointwise covariance:
    $g(x)g(x)^\top = 
    \begin{pmatrix}
        \frac{1}{\alpha^2}x^2 & -\frac{1}{2\alpha^2}x + \frac{1}{2\alpha^3}x^3 \\
        -\frac{1}{2\alpha^2}x + \frac{1}{2\alpha^3}x^3 & -\frac{1}{4\alpha^4}x^4 - \frac{1}{2\alpha^3}x^2 + \frac{1}{4\alpha^2}
    \end{pmatrix}$

    \item the optimal discriminator:
    $1 - D(x) = 
    \frac{\nu p_n(x)}{\nu p_n(x) + p_d(x)}
    =
    \frac{1}{1 + \frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}} e^{\frac{-x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta})}}
    % \sigma_{\frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}}}(\frac{x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta}))
    $
\end{itemize}

Now we can compute the (generalized) score mean $\vm$ and covariance $\mI$ as they intervene in the MSE formula for Noise-Contrastive Estimation:

\begin{align*}
    \vm & = 
    \int \vg(\vx) (1 - D(\vx)) p(\vx)d\vx
    =
    \begin{pmatrix}
        m_1 \\
        m_2
    \end{pmatrix}
\end{align*}

where

\begin{align*}
    \begin{cases}
        m_1 & = 0 \quad \text{(the integrand is an odd function)} \\
        m_2 & = 
        -\frac{1}{2\alpha\sqrt{2\pi\alpha}} 
        \int 
        \left( e^{\frac{-x^2}{2\alpha}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}} e^{\frac{-x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta})}} \right)dx \\
        & + 
        \frac{1}{2\alpha^2\sqrt{2\pi\alpha}} 
        \int 
        x^2
        \left( e^{\frac{-x^2}{2\alpha}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}} e^{\frac{-x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta})}} \right)dx   
    \end{cases}
\end{align*}

and 

\begin{align*}
    \mI & = 
    \int \vg(\vx)\vg(\vx)^\top (1 - D(\vx)) p(\vx)d\vx
    =
    \begin{pmatrix}
        I_{11} & I_{12} \\
        I_{21} & I_{22} \\
    \end{pmatrix}
\end{align*}

where 

\begin{align*}
    \begin{cases}
        I_{12} & = I_{21} = 0 \quad \text{(the integrand is an odd function)} \\
        I_{11} & =
        \frac{1}{\alpha^2\sqrt{2\pi\alpha}} 
        \int 
        x^2
        \left( e^{\frac{-x^2}{2\alpha}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}} e^{\frac{-x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta})}} \right)dx \\ 
        I_{22} & = 
        \frac{1}{4\alpha^4\sqrt{2\pi\alpha}} 
        \int 
        x^4
        \left( e^{\frac{-x^4}{2\alpha}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}} e^{\frac{-x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta})}} \right)dx \\
        & - 
        \frac{1}{2\alpha^3\sqrt{2\pi\alpha}} 
        \int 
        x^2
        \left( e^{\frac{-x^2}{2\alpha}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}} e^{\frac{-x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta})}} \right)dx \\
        & +
        \frac{1}{4\alpha^2\sqrt{2\pi\alpha}} 
        \int 
        \left( e^{\frac{-x^2}{2\alpha}} \frac{1}{1 + \frac{1}{\nu}\sqrt{\frac{\beta}{\alpha}} e^{\frac{-x^2}{2}(\frac{1}{\alpha} - \frac{1}{\beta})}} \right)dx   
    \end{cases}
\end{align*}

Even in a simple 1D Gaussian setting, 
evaluating the asymptotic MSE of the Noise-Contrastive Estimator is untractable in closed-form,
given the integrals in $m_2, I_{11}, I_{22}$. While here we considered the case of two parameters, the intractability is seen even if we constrain the analysis to one of them, which would leave us with the diagonal entries $I_{11}$ and $I_{22}$

Optimizing the asymptotic MSE 
with respect to $\beta$ (noise distribution) or $\nu$ (identifiable to the noise proportion)
yields similarly intractable integrals,
even when imposing, for example, unit-variance on the data distribution $\alpha=1$.

\fi

\newpage

\section{Optimal Noise Proportion when the Noise Distribution matches the Data Distribution: Proof}
\label{sec:optimalnoiseprop}

We wish to minimize the MSE given by
%
\begin{align*}
    \mathrm{MSE}_{\mathrm{NCE}}(T, \nu, p_n) =
    \frac{\nu + 1}{T} \mathrm{tr}
    (
        \mI^{-1} & - \frac{\nu + 1}{\nu} (\mI^{-1} \vm \vm^\top \mI^{-1})
    )
\end{align*}
%
when $p_n = p_d$. In that case, 
%
\begin{align*}
    D(\vx)
    =
    \frac{p_d}{p_d + \nu p_n}(\vx)
    =
    \frac{p_d}{p_d + \nu p_d}(\vx)
    =
    \frac{1}{1 + \nu}
\end{align*}
%
and the integrals involved become
%
\begin{align*}
    \vm 
    & = 
    \int \vg(\vx) (1 - D(\vx)) p(\vx)d\vx \\
    & = 
    \frac{\nu}{1 + \nu} \int \vg(\vx) p(\vx)d\vx \\
    & = 
    0
\end{align*}
%
given the score has zero mean, and
%
\begin{align*}
    \mI 
    & = 
    \int \vg(\vx)\vg(\vx)^\top (1 - D(\vx)) p(\vx)d\vx \\
    & = 
    \frac{\nu}{1 + \nu} \int \vg(\vx)\vg(\vx)^\top p(\vx)d\vx \\
    & = 
    \frac{\nu}{1 + \nu} \mI_F \enspace .
\end{align*}
%
The objective function thus reduces to
%
\begin{align*}
    \mathrm{MSE}_{\mathrm{NCE}}(T, \nu, p_n) 
    & =
    \frac{\nu + 1}{T} \mathrm{tr}(\mI^{-1} )
    =
    \frac{(\nu + 1)^2}{\nu T} \mathrm{tr}(\mI_F^{-1} )
    \propto
    \frac{(\nu + 1)^2}{\nu} \enspace .
\end{align*}
%
The derivative with respect to $\nu$ is proportional to $\frac{1}{\nu^2} - 1$ and is null when $\nu=1$ so when the noise proportion is $50\%$. 

Note that in the case where $p_n = p_d$, we can compare the $\mathrm{MSE}$ achieved by NCE (using $T_d$ data samples and $T_n$ noise samples) with the $\mathrm{MSE}$ achieved by MLE (using $T_d$ data samples):
%
\begin{align*}
    \frac{\mathrm{MSE}_{\mathrm{NCE}}(T, \nu, p_n)}{\mathrm{MSE}_{\mathrm{MLE}}(T_d)}
    & =
    \frac{
    \frac{(\nu + 1)^2}{\nu T} \mathrm{tr}(\mI_F^{-1} )
    }
    {
    \frac{1}{T_d} \mathrm{tr}(\mI_F^{-1} )
    }
    =    
    \frac{
    \frac{(\nu + 1)^2}{\nu T} \mathrm{tr}(\mI_F^{-1} )
    }
    {
    \frac{\nu + 1}{T} \mathrm{tr}(\mI_F^{-1} )
    }
    =    
    1
    +
    \frac{1}{\nu}
\end{align*}
%
which is known from~\citep{gutmann2012nce,pihlaja2010nce}.

\newpage

\section{Optimal Noise for Estimating a Parameter: Proofs}
\label{sec:parameterproofs}

We here prove the theorem and conjecture for the optimal noise distribution in three limit cases $\nu \rightarrow 0$ (all data samples), $\nu \rightarrow \infty$ (all noise samples), and $\frac{p_d}{p_n}(.) = 1 + \eps(.)$ as $\epsilon(.) \rightarrow 0$ (noise distribution is an infinitesimal perturbation of the data distribution).

The goal is to optimize the $\mathrm{MSE}_{\mathrm{NCE}}(T, \nu, p_n)$ with respect to the noise distribution $p_n$, where
%
\begin{align}
    \mathrm{MSE}_{\mathrm{NCE}}(T, \nu, p_n) & =
    \frac{\nu + 1}{T} \mathrm{tr}
    (\mI^{-1} - \frac{\nu + 1}{\nu} (\mI^{-1} \vm \vm^\top \mI^{-1}))
\end{align}
%
where the integrals
%
\begin{align*}
    \vm & = 
    \int \vg(\vx) (1 - D(\vx)) p(\vx)d\vx \\
    \mI & = 
    \int \vg(\vx)\vg(\vx)^\top (1 - D(\vx)) p(\vx)d\vx
\end{align*}
%
depend non-linearly on $p_n$ via the optimal discriminator:
%
\begin{align*}
    1-D(\vx) = \frac{\nu p_n(\vx)}{p_d(\vx) + \nu p_n(\vx)}
\end{align*}

The general proof structure is:

\begin{itemize}
    \item Perform a Taylor expansion of $1-D(\vx)$ in the $\nu \rightarrow 0$ or $\nu \rightarrow \infty$ limit
    \item Plug into the integrals $\vm$, $\mI$ and evaluate them (up to a certain order)
    \item Perform a Taylor expansion of  $\mI^{-1}$ (up to a certain order)
    \item Evaluate the $\mathrm{MSE}_{\mathrm{NCE}}$ (up to a certain order)
    \item Optimize the $\mathrm{MSE}_{\mathrm{NCE}}$ w.r.t. $p_n$
    \item Compute the MSE gaps at optimality
\end{itemize} 

\newpage

\begin{theorem}
    In either of the following two limits: 
    \begin{enumerate}
        \item[(i)] the noise distribution is an (infinitesimal) perturbation of the data distribution $\frac{p_d}{p_n} = 1 + \eps(x)$;
        \item[(ii)] in the limit of all noise samples $\nu \rightarrow \infty$;
    \end{enumerate}
    the noise distribution minimizing asymptotic MSE is
%  
    \begin{align}
        p_n^{\mathrm{opt}}(\vx) \propto p_d(\vx) \|\mI_F^{-1} \vg(\vx)\|
        \enspace .
    \end{align}
    \\
\end{theorem}

\textit{Proof:} case where $\nu \rightarrow \infty$.

We start with a change of variables $\gamma = \frac{1}{\nu} \rightarrow 0$ to bring us to a zero-limit. 

The MSE in terms of our new variable $\gamma = \frac{1}{\nu}$ can be written as:
%
\begin{align}
    \mathrm{MSE}_{\mathrm{NCE}}(T, \gamma, p_n) & =
    \frac{\gamma + 1}{\gamma T} \mathrm{tr}
    (\mI^{-1}) - \frac{(\gamma + 1)^2}{T \gamma} \mathrm{tr}(\mI^{-1} \vm \vm^\top \mI^{-1}) \\
     & =
    \bigg(\gamma^{-1} T^{-1} + \gamma^0 T^{-1} \bigg) 
    \mathrm{tr}
    (\mI^{-1}) 
    - 
    \bigg( \gamma^{-1}T^{-1} + \gamma^0 2T^{-1} + \gamma^1 T^{-1} \bigg) 
    \mathrm{tr}(\mI^{-1} \vm \vm^\top \mI^{-1})
\end{align}

Given the term up until $\gamma^{-1}$ in the MSE, we will use Taylor expansions up to order 2 throughout the proof, in anticipation that the MSE will be expanded until order 1.

\begin{itemize}

    \item Taylor expansion of the discriminator
    
    \begin{align*}
        1-D(\vx) 
        = 
        \frac{\nu p_n(\vx)}{p_d(\vx) + \nu p_n(\vx)}
        = 
        \frac{1}{1 + \gamma \frac{p_d}{p_n}(\vx)}
        = 
        1 - \gamma \frac{p_d}{p_n}(\vx) + \gamma^2 \frac{p_d^2}{p_n^2}(\vx) + \circ(\gamma^2)
    \end{align*}
    
    \item Evaluating the integrals $\vm$, $\mI$
    
    \begin{align*}
        \vm 
        & = 
        \int \vg(\vx)p_d(\vx)
        \bigg(1 - D(\vx)\bigg) 
        d\vx 
        = 
        \int \vg(\vx)p_d(\vx)
        \bigg( 1 - \gamma \frac{p_d}{p_n}(\vx) + \gamma^2 \frac{p_d^2}{p_n^2}(\vx) + \circ(\gamma^2) \bigg) 
        d\vx \\     
        & = 
        \vm_F - \gamma \va + \gamma^2 \vb + \circ(\gamma^2) 
        \numberthis
        \label{eq:taylormallnoise}
    \end{align*}
    % 
    where $\vm_F$ is the Fisher-score mean of the (possibly unnormalized) model and we use shorthand notations $\va$ and $\vb$ for the remaining integrals:
    %
    \begin{align*}
        \vm_F
        & = 
        \int \vg(\vx)p_d(\vx)
        d\vx 
        = 0 \\
        \va
        & = 
        \int \vg(\vx)\frac{p_d^2}{p_n}(\vx)
        d\vx \\
        \vb
        & = 
        \int \vg(\vx)\frac{p_d^3}{p_n^2}(\vx)
        d\vx \enspace .
    \end{align*}
    
    Similarly,
    
    \begin{align*}
        \mI 
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        \bigg(1 - D(\vx)\bigg) 
        d\vx 
        = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        \bigg( 1 - \gamma \frac{p_d}{p_n}(\vx) + \gamma^2 \frac{p_d^2}{p_n^2}(\vx) + \circ(\gamma^2) \bigg) 
        d\vx \\     
        & = 
        \mI_F - \gamma \mA + \gamma^2 \mB + \circ(\gamma^2)
    \end{align*}
    % 
    where the Fisher-score covariance (Fisher information) is $\mI_F$ and we use shorthand notations $\mA$ and $\mB$ for the remaining integrals:
    %
    \begin{align*}
        \mI_F
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        d\vx \\
        \mA
        & = 
        \int \vg(\vx)\vg(\vx)^\top \frac{p_d^2}{p_n}(\vx)
        d\vx \\
        \mB
        & = 
        \int \vg(\vx)\vg(\vx)^\top \frac{p_d^3}{p_n^2}(\vx)
        d\vx \enspace .
    \end{align*}

    \item Taylor expansion of $\mI^{-1}$
    
    \begin{align*}
        \mI^{-1} 
        & = 
        \bigg(
        \mI_F - \gamma \mA + \gamma^2 \mB + \circ(\gamma^2) 
        \bigg)^{-1} \\
        & = 
        \bigg(
        \mI_F (\textbf{Id} - \gamma \mI_F^{-1} \mA + \gamma^2 \mI_F^{-1} \mB)
        + \circ(\gamma^2) 
        \bigg)^{-1} \\
        & = 
        \mI_F^{-1}
        \bigg(
        \textbf{Id} - \gamma \mI_F^{-1} \mA + \gamma^2 \mI_F^{-1} \mB \bigg)^{-1}
        + \circ(\gamma^2) \\
        & = 
        \mI_F^{-1}
        \bigg(
        \textbf{Id} + \gamma \mI_F^{-1} \mA + \gamma^2 ((\mI_F^{-1} \mA)^2 - \mI_F^{-1} \mB) + \circ(\gamma^2)
        \bigg)
        + \circ(\gamma^2) \\
        & = 
        \mI_F^{-1} + \gamma \mI_F^{-2} \mA + \gamma^2 (\mI_F^{-1}(\mI_F^{-1} \mA)^2 - \mI_F^{-2} \mB) + \circ(\gamma^2)
    \numberthis
    \label{eq:taylorinverseIallnoise}
    \end{align*}
    
    \item Evaluating the $\mathrm{MSE}_{\mathrm{NCE}}$

    \begin{align*}
        \mI^{-1} \vm \vm^\top \mI^{-1}
        & = 
        \mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1}
        + \gamma^2 (
        \mI_F^{-1} \va \va^\top  \mI_F^{-1} 
        +
        \mI_F^{-2} \mA \vm_F \vm_F^\top \mI_F^{-2} \mA
        )
        + \circ(\gamma^2)
    \end{align*}
    % 
    by plugging in the Taylor expansions of $\mI^{-1}$ and $\vm$ and retaining only terms up to the second order. Hence, the second term of the MSE without the trace is
    \begin{align*}
        & \bigg( \gamma^{-1}T^{-1} + \gamma^0 2T^{-1} + \gamma^1 T^{-1} \bigg)
        \mI^{-1} \vm \vm^\top \mI^{-1} \\
        & = 
        \gamma^{-1} \frac{1}{T} (
        \mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1}
        )
        +
        \gamma^{0} \frac{2}{T} ( 
        \mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1}
        )
        + \\
        & \quad
        \gamma^{1} \frac{1}{T} (
        \mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1} +
        \mI_F^{-1} \va \va^\top  \mI_F^{-1} +
        \mI_F^{-2} \mA \vm_F \vm_F^\top \mI_F^{-2} \mA
        )
        + \circ(\gamma)
    \end{align*}
    
    and the first term of the MSE without the trace is
    %
    \begin{align*}
        & \bigg(\gamma^{-1} T^{-1} + \gamma^0 T^{-1} \bigg) 
        (\mI^{-1}) \\
        & = 
        \bigg(\gamma^{-1} T^{-1} + \gamma^0 T^{-1} \bigg) 
        \bigg(
        \mI_F^{-1} + \gamma \mI_F^{-2} \mA + \gamma^2 (\mI_F^{-1}(\mI_F^{-1} \mA)^2 - \mI_F^{-2} \mB) + \circ(\gamma^2)
        \bigg) \\
        & = 
        \gamma^{-1} \frac{1}{T}\mI_F^{-1} + 
        \gamma^{0}\frac{1}{T}(\mI_F^{-2} \mA + \mI_F^{-1}) 
        +
        \gamma^1
        \frac{1}{T}[\mI_F^{-1}(\mI_F^{-1} \mA)^2 - \mI_F^{-2} \mB + \mI_F^{-2} \mA ]
        + \circ(\gamma) \enspace .
    \end{align*}
    
    Subtracting the second term from the first term and applying the trace, we finally write the MSE:
    %
    \begin{align}
        & \mathrm{MSE}_{\mathrm{NCE}} 
        = 
        \mathrm{tr}\bigg(
        \gamma^{-1} \frac{1}{T} \big(
        \mI_F^{-1} 
        -
        \mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1}
        \big)
        + 
        \gamma^{0} \frac{1}{T} \big(
        \mI_F^{-2} \mA + \mI_F^{-1}
        -
        2\mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1}
        \big) 
        % +
        % \gamma^1
        % \frac{1}{T}[\mI_F^{-1}(\mI_F^{-1} \mA)^2 - \mI_F^{-2} \mB + \mI_F^{-2} \mA - \mI_F^{-1} \va \va^\top  \mI_F^{-1}]
        \bigg)
        + \circ(\gamma)
    \label{eq:allnoisemse}
    \end{align}

    \item Optimize the $\mathrm{MSE}_{\mathrm{NCE}}$ w.r.t. $p_n$
    
    To optimize w.r.t. $p_n$, we need only keep the two first orders of the $\mathrm{MSE}_{\mathrm{NCE}}$, which depends on $p_n$ only via the term $\mathrm{tr}(\mI_F^{-2}\mA) = \int \| \mI_F^{-1} \vg(x) \|^2 \frac{p_d^2}{p_n}(x)d\vx$. Hence, we need to optimize
    %
    \begin{equation}
    J (p_n)= \frac{1}{T} \int \|I_F^{-1} \vg(\vx)\|^2\frac{p_d^2}{p_n}(\vx)d\vx
    \end{equation}
    %
    with respect to $p_n$. We compute the variational (Fr\'echet) derivative together with the Lagrangian of the constraint $\int p_n(\vx) d\vx = 1$ (with $\lambda$ denoting the Lagrange multiplier) to obtain
    \begin{equation}
    \delta_{p_n}J = - \|\mI_F^{-1} \vg\|^2\frac{p_d^2}{ p_n^2} + \lambda \enspace .
    \end{equation}
    Setting this to zero and taking into account the non-negativity  of $p_n$ gives
    \begin{equation}
    p_n(\vx) =   \|\mI_F^{-1} \vg(x) \| p_d(\vx) /Z
    \end{equation}
    where $Z=\int  \|\mI_F^{-1} \vg(x) \| p_d(\vx)d\vx$ is the normalization constant. This is thus the optimal noise distribution, as a first-order approximation. 
    
    \item Compute the MSE gaps at optimality
    
    Plugging this optimal $p_n$ into the formula of $\mathrm{MSE}_{\mathrm{NCE}}$ and subtracting the Cramer-Rao MSE (which is a lower bound for a normalized model), we get:

    \begin{align*}
        \Delta_{\mathrm{opt}}\mathrm{MSE}_{\mathrm{NCE}} 
        & = 
        \mathrm{MSE}_{\mathrm{NCE}}(p_n = p_n^{\mathrm{opt}}) - \mathrm{MSE}_{\mathrm{Cramer-Rao}} \\
        & = 
        \frac{1}{T} \bigg( \int \|\mI_F^{-1} \vg(\vx)\| p_d(\vx) d\vx \bigg)^2 \enspace .
    \end{align*}

    This is interesting to compare with the case where the noise distribution is the data distribution, which gives

    \begin{align*}
        \Delta_{\mathrm{data}}\mathrm{MSE}_{\mathrm{NCE}} 
        & = 
        \mathrm{MSE}_{\mathrm{NCE}}(p_n = p_d) - \mathrm{MSE}_{\mathrm{Cramer-Rao}} \\
        & = 
        \frac{1}{T} \int \|\mI_F^{-1} \vg(\vx)\|^2 p_d(\vx) d\vx
    \end{align*}

    where the squaring is in a different place.
    In fact, we can compare these two quantities by the Cauchy--Schwarz inequality, or simply the fact that
    
    \begin{align*}
        \Delta\mathrm{MSE}_{\mathrm{NCE}} 
        & = 
        \Delta_{\mathrm{data}}\mathrm{MSE}_{\mathrm{NCE}} - \Delta_{\mathrm{opt}}\mathrm{MSE}_{\mathrm{NCE}} \\
        & =
        \mathrm{MSE}_{\mathrm{NCE}}(p_n = p_d) -
        \mathrm{MSE}_{\mathrm{NCE}}(p_n = p_n^{\mathrm{opt}}) \\
        & = 
        \frac{1}{T} \text{Var}_{X \sim p_d} \{ \|I_F^{-1} \vg(\textbf{X}) \| \}
    \end{align*}

    This implies that the two MSEs, when the noise distribution is either $p_n^{\mathrm{opt}}$ or $p_d$, can be equal only if $ \|\mI_F^{-1} \vg(\cdot)\|$ is constant on the support of $p_d$. This does not seem to be possible for any reasonable distribution.

    % \item Optimize the $\mathrm{MSE}_{\mathrm{NCE}}$ w.r.t. $\nu$
    
    % To do this, we look to order $\gamma^1$, so that the $\mathrm{MSE}_{\mathrm{NCE}}(\nu) = k_{-1} \nu^{-1} + k_{0} \nu^0 + k_{1} \nu^1$, for constants $k_{-1}, k_{0}, k_{1}$ given by the formula above.
    
    % Setting the derivative to zero yields:  $k_{1} - \frac{k_{-1}}{\nu^2} = 0$, hence:
    
    % \begin{align*}
    %     \nu^{\mathrm{opt}} 
    %     = 
    %     \sqrt{\frac{k_{-1}}{k_1}}
    %     = 
    %     \sqrt{\frac{\mathrm{tr}(\mI_F^{-1})}{k_1}}
    % \end{align*}

\end{itemize}


\newpage

\textit{Proof:} case where $p_n \approx p_d$

We consider the limit case where $\frac{p_d}{p_n}(\vx) = 1 + \epsilon(\vx)$ with $|\epsilon(\vx) - 0| < \epsilon_{\mathrm{max}} \quad \forall \vx$.

Note that in order to use Taylor expansions for terms containing $\epsilon(\vx)$ in an integral, we assume for any integrand $h(\vx)$ that $\int h(\vx) \epsilon(\vx) d\vx \approx \eps \int h(\vx) d\vx$, where $\epsilon$ would be a constant.


\begin{itemize}

    \item Taylor expansion of the discriminator
    %
    \begin{align*}
        1-D(\vx) 
        & = 
        \frac{\nu p_n(\vx)}{p_d(\vx) + \nu p_n(\vx)}
        = 
        \frac{1}{1 + \frac{1}{\nu} \frac{p_d}{p_n}(\vx)}
        = 
        \frac{1}{1 + \frac{1}{\nu} + \frac{1}{\nu} \epsilon(\vx)}
        \\
        &
        =
        \frac{\nu}{1 + \nu}\epsilon^0(\vx)
        -
        \frac{\nu}{(1 + \nu)^2}\epsilon^1(\vx)
        +
        \frac{\nu}{(1 + \nu)^3}\epsilon^2(\vx)
        + \circ(\epsilon^2) 
    \end{align*}
    
    \item Evaluating the integrals $\vm$, $\mI$
    %
    \begin{align*}
        \vm 
        & = 
        \int \vg(\vx)p_d(\vx)
        \bigg(1 - D(\vx)\bigg) 
        d\vx \\
        & = 
        \int \vg(\vx)p_d(\vx)
        \bigg( 
        \frac{\nu}{1 + \nu}\epsilon^0(\vx)
        -
        \frac{\nu}{(1 + \nu)^2}\epsilon^1(\vx)
        +
        \frac{\nu}{(1 + \nu)^3}\epsilon^2(\vx)
        + \circ(\epsilon^2)  
        \bigg) 
        d\vx \\     
        & = 
        \frac{\nu}{1 + \nu} \vm_F 
        - \frac{\nu}{(1 + \nu)^2} \va(\epsilon)
        + \frac{\nu}{(1 + \nu)^3} \vb(\epsilon^2)
        + \circ(\epsilon^3) 
    \end{align*}
    % 
    where the Fisher-score mean $\vm_F$ is null and we use shorthand notations $\va$ and $\vb$ for the remaining integrals:
    %
    \begin{align*}
        \vm_F
        & = 
        \int \vg(\vx)p_d(\vx)
        d\vx \\
        \va(\epsilon)
        & = 
        \int \vg(\vx) p_d \epsilon(\vx)
        d\vx \\
        \vb(\epsilon^2)
        & = 
        \int \vg(\vx) p_d \epsilon^2(\vx)
        d\vx \enspace .
    \end{align*}

    Similarly,
    %
    \begin{align*}
        \mI 
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        \bigg(1 - D(\vx)\bigg) 
        d\vx 
        \\ & = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        \bigg( 
        \frac{\nu}{1 + \nu}\epsilon^0(\vx)
        -
        \frac{\nu}{(1 + \nu)^2}\epsilon^1(\vx)
        +
        \frac{\nu}{(1 + \nu)^3}\epsilon^2(\vx)
        + \circ(\epsilon^2)  
        \bigg) 
        d\vx \\     
        & = 
        \frac{\nu}{1 + \nu} \mI_F 
        - \frac{\nu}{(1 + \nu)^2} \mA(\epsilon)
        + \frac{\nu}{(1 + \nu)^3} \mB(\epsilon^2)
        + \circ(\epsilon^3) 
    \end{align*}
    % 
    where the Fisher-score covariance (Fisher information) is $\mI_F$ and we use shorthand notations $\mA$ and $\mB$ for the remaining integrals:
    %
    \begin{align*}
        \mI_F
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        d\vx \\
        \mA(\epsilon)
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_d \epsilon(\vx)
        d\vx \\
        \mB(\epsilon^2)
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_d \epsilon^2(\vx)
        d\vx \enspace .
    \end{align*}

    \item Taylor expansion of $\mI^{-1}$
    % Using Wolfram-Alpha
    
    \begin{align*}
        \mI^{-1}
        & = 
        \bigg(
        \frac{\nu}{1 + \nu} \mI_F 
        - \frac{\nu}{(1 + \nu)^2} \mA(\epsilon)
        + \frac{\nu}{(1 + \nu)^3} \mB(\epsilon^2)
        + \circ(\epsilon^3)
        \bigg)^{-1} \\
        & = 
        \frac{1 + \nu}{\nu} \mI_F^{-1} 
        + \frac{1}{\nu} \mI_F^{-2} \mA(\epsilon) 
        + \frac{1}{\nu (1 + \nu)} \mI_F^{-2} 
        \big(
        \mI_F^{-1} \mA^2(\epsilon) - \mB(\epsilon^2)
        \big)
        + \circ(\epsilon^3)
    \end{align*}
    
    \item Evaluating the $\mathrm{MSE}_{\mathrm{NCE}}$

    \begin{align*}
        \mI^{-1} \vm \vm^\top \mI^{-1}
        & = 
        \mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1}
        +
        \frac{1}{(1 + \nu)^2} \bigg(
        \mI_F^{-2} \mA(\epsilon) \vm_F \vm_F^\top  \mI_F^{-2} \mA(\epsilon)
        +
        \mI_F^{-1} \va(\epsilon) \va(\epsilon)^\top \mI_F^{-1} 
        \bigg)
        + \circ(\epsilon^3)
    \end{align*}
    % 
    by plugging in the Taylor expansions of $\mI^{-1}$ and $\vm$ and retaining only terms up to the second order. Finally, the MSE becomes:
    %
    \begin{align*}
        \mathrm{MSE}_{\mathrm{NCE}}(T, \nu, p_n) 
        & = 
        \frac{\nu + 1}{T} \mathrm{tr} (\mI^{-1} - \frac{\nu + 1}{\nu} (\mI^{-1} \vm \vm^\top \mI^{-1})) \\
        & = 
        \mathrm{tr} \bigg(
        \frac{(1 + \nu)^2}{T \nu} (
        \mI_F^{-1}
        -
        \mI_F^{-1} \vm_F \vm_F^\top \mI_F^{-1}
        )
        + 
        \frac{1 + \nu}{T \nu} \mI_F^{-2} \mA(\epsilon) 
        + \\
        & \enspace
        \frac{1}{T \nu} \big(
        \mI_F^{-3} \mA^2(\epsilon) 
        -
        \mI_F^{-2} \mB(\epsilon^2) 
        - 
        \mI_F^{-1} \va(\epsilon) \va(\epsilon)^\top \mI_F^{-1}
        -
        \mI_F^{-2} \mA(\epsilon) \vm_F \vm_F^\top  \mI_F^{-2} \mA(\epsilon)
        \big)
        \bigg)
        + 
        \circ(\epsilon^3)
    \end{align*}
    
    \item Optimize the $\mathrm{MSE}_{\mathrm{NCE}}$ w.r.t. $p_n$
    
    To optimize w.r.t. $p_n$, we need only keep the $\mathrm{MSE}_{\mathrm{NCE}}$ up to order 1, which depends on $p_n$ only via the term
    \begin{align*}
        \mathrm{tr}(\mI_F^{-2}\mA(\epsilon)) 
        & = 
        \mathrm{tr} \bigg(
        \mI_F^{-2}   
        \big(
        \int \vg(\vx)\vg(\vx)^\top \frac{p_d^2}{p_n}(x)d\vx
        - 
        \mI_F
        \big)
        \bigg) 
    \end{align*}
    where we unpacked $p_n$ from $\epsilon = \frac{p_d}{p_n} - 1$. Hence, we need to optimize
    %
    \begin{equation}
    J (p_n)= \frac{1}{T} \int \|I_F^{-1} \vg(\vx)\|^2\frac{p_d^2}{p_n}(\vx)d\vx
    \end{equation}
    %
    with respect to $p_n$. This was already done in the all-noise limit $\nu \rightarrow \infty$ and yielded
    \begin{equation}
    p_n(\vx) =   \|\mI_F^{-1} \vg(x) \| p_d(\vx) /Z
    \end{equation}
    where $Z=\int  \|\mI_F^{-1} \vg(x) \| p_d(\vx)d\vx$ is the normalization constant. This is thus the optimal noise distribution, as a first-order approximation. 

\end{itemize}



\newpage

In the third case, the limit of all data, we have the following conjecture: 
\begin{conjecture}
    In case (iii), the limit of all data samples $\nu \rightarrow 0$, the optimal noise distribution is such that it is all concentrated at the set of those $\boldsymbol{\xi}$ which are given by
%
    \begin{align*}
        \arg\max_{\boldsymbol{\xi}} \,
        &p_d(\boldsymbol{\xi}) 
        \mathrm{tr} \bigg(
        (\vg(\xi)\vg(\xi)^\top)^{-1}
        \bigg)^{-1} \\
        \mathrm{s.t.} & \quad \vg(\xi) = \mathrm{constant}
        \numberthis
        %\enspace .
    \end{align*}
%
\end{conjecture}
%
Informal and heuristic \textit{``proof''}:

We have the $\mathrm{MSE}_{\mathrm{NCE}}(T, \nu, p_n) = \frac{\nu + 1}{T} \mathrm{tr} (\mI^{-1} - \frac{\nu + 1}{\nu} (\mI^{-1} \vm \vm^\top \mI^{-1}))$.

Given the term up until $\nu^{-1}$ in the MSE, we will use Taylor expansions up to order 2 throughout the proof, in anticipation that the MSE will be expanded until order 1.

Note that in this no noise limit, the assumption made by Gutmann and Hyv\"arinen (2012) that $p_n$ is non-zero whenever $p_d$ is nonzero is not true for this optimal $p_n$, which reduces the rigour of this analysis. (This we denote by heuristic approximation~1.)

\begin{itemize}
    \item Taylor expansion of the discriminator
    
    \begin{align*}
        1-D(\vx) 
        = 
        \frac{\nu p_n(\vx)}{p_d(\vx) + \nu p_n(\vx)}
        = 
        \frac{1}{1 + \frac{1}{\nu} \frac{p_d}{p_n}(\vx)}
        = 
        \nu \frac{p_n}{p_d}(\vx) - \nu^2 \frac{p_n^2}{p_d^2}(\vx) + \circ(\nu^2)
    \end{align*}
    
    \item Evaluating the integrals $\vm$, $\mI$
    
    \begin{align*}
        \vm 
        & = 
        \int \vg(\vx)p_d(\vx)
        \bigg(1 - D(\vx)\bigg) 
        d\vx 
        = 
        \int \vg(\vx)p_d(\vx)
        \bigg( \nu \frac{p_n}{p_d}(\vx) - \nu^2 \frac{p_n^2}{p_d^2}(\vx) + \circ(\nu^2) \bigg) 
        d\vx \\     
        & = 
        \nu \vm_n - \nu^2 \vb + \circ(\nu^2) 
    \end{align*}
    % 
    where
    %
    \begin{align*}
        \vm_n
        & = 
        \int \vg(\vx)p_n(\vx)
        d\vx \\
        \vb
        & = 
        \int \vg(\vx)\frac{p_n^2}{p_d}(\vx)
        d\vx \enspace .
    \end{align*}
    
    Similarly,
    
    \begin{align*}
        \mI 
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        \bigg(1 - D(\vx)\bigg) 
        d\vx 
        = 
        \int \vg(\vx)\vg(\vx)^\top p_d(\vx)
        \bigg( \nu \frac{p_n}{p_d}(\vx) - \nu^2 \frac{p_n^2}{p_d^2}(\vx) + \circ(\nu^2) \bigg) 
        d\vx \\     
        & = 
        \nu \mI_n - \nu^2 \mB + \circ(\nu^2) 
    \end{align*}
    % 
    where we use the shorthand notations $\mI_n$ and $\mB$ for the remaining integrals:
    %
    \begin{align*}
        \mI_n
        & = 
        \int \vg(\vx)\vg(\vx)^\top p_n(\vx)
        d\vx \\
        \mB
        & = 
        \int \vg(\vx)\vg(\vx)^\top \frac{p_n^2}{p_d}(\vx)
        d\vx \enspace .
    \end{align*}

    \item Taylor expansion of $\mI^{-1}$
    
    \begin{align*}
        \mI^{-1} 
        & = 
        \bigg(
        \nu \mI_n - \nu^2 \mB + \circ(\nu^2) 
        \bigg)^{-1} \\
        & = 
        \bigg(
        \nu\mI_n (\textbf{Id} - \nu \mI_n^{-1} \mB)
        + \circ(\nu^2) 
        \bigg)^{-1} \\
        & = 
        \nu^{-1} \mI_n^{-1}
        \bigg(
        \textbf{Id} + \nu \mI_n^{-1} \mB + \nu^2 (\mI_n^{-1} \mB)^2 + \nu^3(\mI_n^{-1} \mB)^3 + \circ(\nu^3) \bigg)
        + \circ(\nu^2) \\
        & = 
        \nu^{-1} \mI_n^{-1} + \nu^0 \mI_n^{-2} \mB + \nu^1 \mI_n^{-1}(\mI_n^{-1} \mB)^2 + \nu^2 \mI_n^{-1}(\mI_n^{-1} \mB)^3 + \circ(\nu^2)
    \end{align*}
    
    \item Evaluating the $\mathrm{MSE}_{\mathrm{NCE}}$

    \begin{align*}
        & \mI^{-1} \vm \vm^\top \mI^{-1}
        = \\
        & \nu^0 (\mI_n^{-1} \vm_n \vm_n^{T} \mI_n^{-1}) 
        +
        \nu^2 (\mI_n^{-1} \vb \vb^{T} \mI_n^{-1} + 
        \mI_n^{-2} \mB \vm_n \vm_n^{T} \mI_n^{-2} \mB)
        + \circ(\nu^2)
    \end{align*}
    % 
    by plugging in the Taylor expansions of $\mI^{-1}$ and $\vm$ and retaining only terms up to the second order. Hence, the second term of the MSE without the trace is
    \begin{align*}
        & \bigg( \nu^{1}T^{-1} + \nu^0 2T^{-1} + \nu^{-1}T^{-1} \bigg)
        \mI^{-1} \vm \vm^\top \mI^{-1} \\
        & = 
        \bigg( \nu^{1}T^{-1} + \nu^0 2T^{-1} + \nu^{-1}T^{-1} \bigg)
        \bigg(
        \nu^0 (\mI_n^{-1} \vm_n \vm_n^\top \mI_n^{-1}) 
        +
        \nu^2 (\mI_n^{-1} \vb \vb^\top \mI_n^{-1} + 
        \mI_n^{-2} \mB \vm_n \vm_n^\top \mI_n^{-2} \mB)
        + \circ(\nu^2)
        \bigg) \\
        & = 
        \nu^{-1} \frac{1}{T} (\mI_n^{-1} \vm_n \vm_n^\top \mI_n^{-1}) 
        +
        \nu^{0} \frac{1}{T} (2\mI_n^{-1} \vm_n \vm_n^\top \mI_n^{-1})
        + \\
        & \enspace \nu^1 \frac{1}{T} (\mI_n^{-1} \vb \vb^\top \mI_n^{-1} + 
        \mI_n^{-2} \mB \vm_n \vm_n^\top \mI_n^{-2} \mB + \mI_n^{-1} \vm_n \vm_n^\top \mI_n^{-1})
        + \circ(\nu)
    \end{align*}

    and the first term of the MSE without the trace is
    %
    \begin{align*}
        & \bigg( \nu^{0} T^{-1} + \nu^1 T^{-1} \bigg) 
        \mI^{-1} \\
        & = 
        \bigg( \nu^{0} T^{-1} + \nu^1 T^{-1} \bigg) 
        \bigg(
        \nu^{-1} \mI_n^{-1} + \nu^0 \mI_n^{-2} \mB + \nu^1 \mI_n^{-1}(\mI_n^{-1} \mB)^2 + \nu^2 \mI_n^{-1}(\mI_n^{-1} \mB)^3 + \circ(\nu^2)
        \bigg) \\
        & = 
        \nu^{-1} \frac{1}{T}\mI_n^{-1} 
        + 
        \nu^{0}\frac{1}{T}(\mI_n^{-2} \mB + \mI_n^{-1}) 
        +
        \nu^1
        \frac{1}{T}[\mI_n^{-1}(\mI_n^{-1} \mB)^2 + \mI_n^{-2} \mB]
        + \circ(\nu) \enspace .
    \end{align*}
    
    Subtracting the second term from the first term and applying the trace, we finally write the MSE:
    
    \begin{align*}
        \mathrm{MSE}_{\mathrm{NCE}} 
        & = \mathrm{tr}(
        \nu^{-1} \frac{1}{T}(\mI_n^{-1} - \mI_n^{-1} \vm_n \vm_n^{T} \mI_n^{-1})
        + 
        \nu^{0}\frac{1}{T}(\mI_n^{-2} \mB + \mI_n^{-1} - 2\mI_n^{-1} \vm_n \vm_n^{T} \mI_n^{-1}) 
        + \\
        & \nu^1
        \frac{1}{T}[\mI_n^{-1}(\mI_n^{-1} \mB)^2 + \mI_n^{-2} \mB  - \mI_n^{-1} \vb \vb^{T} \mI_n^{-1} - 
        \mI_n^{-2} \mB \vm_n \vm_n^{T} \mI_n^{-2} \mB - \mI_n^{-1} \vm_n \vm_n^{T} \mI_n^{-1}]
        + \circ(\nu) 
        ) \enspace .
    \end{align*}
    
    Rewriting $\mI_n^{-1} = \mI_n^{-1} \mI_n \mI_n^{-1}$, using the circular invariance of the trace operator and stopping at order $\nu^0$, we get:
    
    \begin{align*}
        \mathrm{MSE}_{\mathrm{NCE}} 
        & = 
        \nu^{-1} \frac{1}{T}
        \langle \mI_n^{-2}, \mI_n - \vm_n \vm_n^\top \rangle
        + 
        \nu^{0}\frac{1}{T}
        \langle \mI_n^{-2}, \mB + \mI_n - 2 \vm_n \vm_n^\top \rangle
        + \circ(1) \\
        & = 
        \nu^{-1} \frac{1}{T}
        \langle \mI_n^{-2}, \mathrm{Var}_{N \sim p_n}\vg(\textbf{N}) \rangle
        + 
        \nu^{0}\frac{1}{T}
        \langle \mI_n^{-2}, \mB + \mI_n - 2 \vm_n \vm_n^\top \rangle
        + \circ(1) \enspace .
    \numberthis
    \label{eq:alldatamse}
    \end{align*} 


% \aapo{Please write here an expression using the terms used below: omit terms of order $o(1)$}
% \aapo{and use the term "var" (which you denote below by Cov but it is usually var in the multivariate case.).}
% \aapo{better use N not X here as well as above in a similar proof}

    \item Optimize the $\mathrm{MSE}_{\mathrm{NCE}}$ w.r.t. $p_n$
    
    Looking at the above MSE, the dominant term of order $\nu^{-1}$, namely $\langle \mI_n^{-2}, \mathrm{Var}_{N \sim p_n}\vg(\textbf{N}) \rangle \geq 0$,
    is minimized when it is $0$, that is, when $\vg$ is constant in the support of $p_n$. Typically this means that $p_n$ is concentrated on a set of zero measure. In the 1D case, such a case is typically the Dirac delta $p_n = \delta_z$, or a distribution with two deltas in case of symmetrical $\vg$.
    
    We can plug this in the terms of the next order $\nu^0$, which remain to be minimized: 
    %
    \begin{align*}
        \langle \mI_n^{-2}, \mB + \mI_n - 2 \vm_n \vm_n^{T} \rangle
        & = 
        \langle \mI_n^{-2}, \mB - \mI_n + 2\mI_n - 2 \vm_n \vm_n^{T} \rangle \\
        & =
        \langle \mI_n^{-2}, \mB - \mI_n + 2\mathrm{Var}_{N \sim p_n}\vg(\textbf{N}) \rangle \\ 
        & =
        \langle \mI_n^{-2}, \mB - \mI_n \rangle
    \end{align*} 
    %
    given we chose $p_n$ so that the variance is 0.
    
    The integrands of $\mB$ and $\mI_n$ respectively involve $p_n^2$ and $p_n$. Because $p_n$ is concentrated on a set of zero measure (Dirac-like), the term in $\mB$ dominates the term in $\mI_n$. This is because if we consider the $p_n$ as the limit of a sequence of some proper pdf's, the value of the pdf gets infinite in the support of that pdf in the limit, and thus $p_n^2$ is infinitely larger than $p_n$. Hence we are left with $\langle \mI_n^{-2}, \mB \rangle$.

    The integral with respect to $p_n$ reduces to evaluating $\vg(\vx)\vg(\vx)^\top/p_d(\vx)$ on the support of $p_n$. Since we know that $\vg(\vx)$ is constant in that set, the main question is whether $p_d$ is constant in that set as well. Here, we heuristically assume that it is; this is intuitively appealing in many cases, if not necessarily true. (This we denote by heuristic approximation~2.)
    
    Thus, we have
    %Using $p_n = \delta_z$ and assuming heuristically that the``squared-Dirac" has a similar effect (only infinitely larger) as a single-Dirac in the integrand of $\mB$:
    \begin{align*}
    \int \vg(\vx)\vg(\vx)^\top \frac{\delta_z^2}{p_d}(\vx)
    d\vx 
    %= 
    %\vg(\vx)\vg(\vx)^\top \frac{\delta_z}{p_d}(\vx)
    %d\vx
    \approx 
    c\; \vg(\vz)\vg(\vz)^\top \frac{1}{p_d(\vz)}
    \end{align*}
    for some constant $c$ taking into account the effect of squaring of $p_n$ (it is ultimately infinite, but the reasoning is still valid in any sequence going to the limit.)
    
    Next we make heuristic approximation~3: we neglect any problems of inversion of singular, rank 1 matrices (note this is not a problem in the 1D case), and further obtain
    \begin{align*}
        \langle \mI_n^{-2}, \mB \rangle 
        \approx
        \mathrm{tr} \bigg(
        (\vg(\vz)\vg(\vz)^\top)^{-1}
         \vg(\vz)\vg(\vz)^\top \frac{1}{p_d(\vz)}
        (\vg(\vz)\vg(\vz)^\top)^{-1}
        \bigg)
        \approx
        \frac{1}{p_d(\vz)}
        \mathrm{tr} \bigg(
        (\vg(\vz)\vg(\vz)^\top)^{-1}
        \bigg)
    \enspace .
    \numberthis
    \label{eq:alldatamsedominant}
    \end{align*}
    
Minimizing this term is equivalent to the following maximization setup (still applying heuristic approximation~3):
%
\begin{align*}
    \arg\max_{\boldsymbol{\xi}} p_d(\boldsymbol{\xi}) 
    \mathrm{tr} \bigg(
        (\vg(\xi)\vg(\xi)^\top)^{-1}
        \bigg)^{-1}
    \enspace .
\end{align*}

Those points $z$ obtained by the above condition are the best candidates for $p_n$ to concentrate its mass on.

We arrived at this result by making three heuristic approximations as explained above; we hope to be able to remove some of them in future work.

Numerically, evaluating the optimal noise in the all-data limit requires computing a weight $w(\xi) = \mathrm{tr} \bigg((\vg(\xi)\vg(\xi)^\top)^{-1}\bigg)^{-1}$ that is intractable in dimensions bigger than 1, due to the singularity of the rank 1 matrix. We can avoid this numerically by introducing an (infinitesimal) perturbation $\eps > 0$ which removes the singularity problem:
%
\begin{align*}
    w_{\eps}(\xi)
    & =
    \mathrm{tr} \bigg( 
    (\vg(\xi) \vg(\xi)^\top + \eps \mathrm{Id})^{-1}
    \bigg)^{-1} \\
    & = 
    \mathrm{tr} \bigg( 
    \eps^{-1} \mathrm{Id} - \frac{1}{\eps^2 + \eps \vg(\xi)^\top \mathrm{Id} \vg(\xi)} \vg(\xi) \vg(\xi)^\top
    \bigg)^{-1} \quad \text{by the Sherman-Morrison formula} \\
    & = 
    \bigg( 
    \eps^{-1} d - \frac{1}{\eps^2 + \eps \| \vg(\xi) \|^2} \| \vg(\xi) \|^2
    \bigg)^{-1} \\
    & = 
    \bigg( 
    \eps^{-1} (d - 1) + \eps^{0} \frac{1}{\| \vg(\xi) \|^2} + \eps^{1} \frac{-1}{\| \vg(\xi) \|^4} + O(\eps^2)
    \bigg)^{-1} \quad \text{by Taylor expansion} \\
    & = 
    \eps \frac{1}{d-1}
    + \eps^2 \frac{-1}{\| \vg(\xi) \|^2 (d-1)^2}
    + \eps^3 \frac{d}{\| \vg(\xi) \|^4 (d-1)^3}
    + O(\eps^4) \quad \text{by further Taylor expansion}
\end{align*}
%
where we go up to order 3 to ensure the weight $w_{\eps}(\xi)$ is positive. Finally, we can approximate the $\arg \max$ operator with its relaxation 
$\mathrm{soft}\arg\max^{\eps}(x) 
= 
\frac{e^{\frac{x}{\eps}}}{\int e^{\frac{x}{\eps}} dx}  
$, 
so that
%
\begin{align*}
    p_n(\vx) \approx 
    \mathrm{soft}\arg\max^{\eps_1} \big( 
    p_d(\vx) w_{\eps_2}(\vx)
    \big)
\end{align*}
%
where $(\eps_1, \eps_2) \in (\mathbb{R}_+^*)^2$ are two hyperparameters taken close to zero.
\end{itemize}

\newpage


\section{Optimal Noise for Estimating a Distribution: Proofs}

\label{sec:distributionproofs}

So far, we have optimized hyperparameters (such as the noise distribution) so as to reduce the uncertainty of the \textit{parameter} estimation, measured by the Mean Squared Error $\mathbb{E} \big[ \| \hat{\vtheta}_T - \vtheta^* \|^2 \big] = \frac{1}{T_d} \mathrm{tr}(\mSigma)$.

Sometimes, we might wish to reduce the uncertainty of the \textit{distribution} estimation, which we can measure using the Kullback-Leibler (KL) divergence $\mathbb{E}\big[ \mathcal{D}_{\mathrm{KL}}(p_d, p_{\hat{\vtheta}_T}) \big]$. 

We can specify this error, by using the Taylor expansion of the estimated $\hat{\theta}_T$ near optimality, given in~\cite{gutmann2012nce}: 
%
\begin{align}
    \hat{\vtheta}_T - \vtheta^*
    & =
    \vz
    + 
    O(\|\hat{\vtheta}_T - \vtheta^*\|^2)
    \label{eq:paramerror}
\end{align}
%
where 
$\vz \sim \mathcal{N}(0, \frac{1}{T_d}\mSigma)$ 
and $\mSigma$ is the asymptotic variance matrix. 

We can similarly take the Taylor expansion of the KL divergence with respect to its second argument, near optimality:
%
\begin{align*}
    J(\hat{\vtheta}_T)
    & :=
    \mathcal{D}_{\mathrm{KL}}(p_d, p_{\hat{\vtheta}_T}) \\
    & = 
    J(\vtheta^*) 
    + 
    \langle \nabla_{\vtheta}J(\vtheta^*), \hat{\vtheta}_T - \vtheta^* \rangle
    + 
    \frac{1}{2}
    \langle \hat{\vtheta}_T - \vtheta^*, \nabla^2_{\vtheta}J(\vtheta^*) \, (\hat{\vtheta}_T - \vtheta^*) \rangle
    +
    O(\| \hat{\vtheta}_T - \vtheta^* \|^3) \\
    & = 
    J(\vtheta^*) 
    + 
    \langle \nabla_{\vtheta}J(\vtheta^*), \hat{\vtheta}_T - \vtheta^* \rangle
    + 
    \frac{1}{2}
    \big\| \hat{\vtheta}_T - \vtheta^* \big\|^2_{\nabla^2_{\vtheta}J(\vtheta^*)}
    +
    O(\| \hat{\vtheta}_T - \vtheta^* \|^3) 
\end{align*}
%
Note that some simplifications occur:
\begin{itemize}
    \item $J(\vtheta^*) = \mathcal{D}_{\mathrm{KL}}(p_{\vtheta^*}, p_{\vtheta^*}) = 0$ 

    \item $\nabla_{\vtheta}J(\vtheta^*) = 0$
    as the gradient of the KL divergence at $\vtheta^*$ is the mean of the (negative) Fisher score, which is null.
    
    \item $\nabla_{\vtheta}^2J(\vtheta^*) = \mI_F$
\end{itemize}
%
Plugging in the estimation error~\ref{eq:paramerror} into the distribution error yields:
%
\begin{align*}
    J(\hat{\vtheta}_T) 
    & =
    \frac{1}{2}
    \bigg\| 
    \vz + O(\| \hat{\vtheta}_T - \vtheta^* \|^2) \bigg\|_{\mI_F}^2
    +
    O(\| \hat{\vtheta}_T - \vtheta^* \|^3) \\
    & =
    \frac{1}{2}
    \bigg(
    \| \vz \|_{\mI_F}^2 
    + 
    2
    \langle \vz, O(\| \hat{\vtheta}_T - \vtheta^*\|^2) \rangle_{\mI_F}
    + 
    \big\| O(\| \hat{\vtheta}_T - \vtheta^*\|^2) \big\|_{\mI_F}^2 
    \bigg) + O(\| \hat{\vtheta}_T - \vtheta^* \|^3) \\
    & = 
    \frac{1}{2} \| \vz \|_{\mI_F}^2 
    +
    O(\| \hat{\vtheta}_T - \vtheta^* \|^3)
\end{align*}
%
by truncating the Taylor expansion to the first order. Hence up to the first order, the expectation yields:
%
\begin{align*}
    \mathbb{E}\big[ \mathcal{D}_{\mathrm{KL}}(p_d, p_{\hat{\vtheta}_T}) \big] 
    & =
    \frac{1}{2}
    \mathbb{E}\big[ \| \vz \|_{\mI_F}^2 \big]
    =
    \frac{1}{2}
    \mathbb{E}\big[ \vz^T \mI_F \vz \big]
    =
    \frac{1}{2}
    \mathbb{E}\big[ \mathrm{tr}(\vz^T \mI_F \vz) \big] 
    =
    \frac{1}{2}
    \mathbb{E}\big[ \mathrm{tr}(\mI_F \vz \vz^T) \big] \\
    & =
    \frac{1}{2}
    \mathrm{tr}( \mI_F \mathbb{E}[\vz \vz^T])
    =
    \frac{1}{2}
    \mathrm{tr}( \mI_F \mathrm{Var}[\vz])
    =
    \frac{1}{2 T_d}
    \mathrm{tr}( \mI_F \mSigma)
\end{align*}
%
Note that this is a general and known result which is applicable beyond the KL divergence: for any divergence, the 0th order term is null as it measures the divergence between the data distribution and itself, the 1st order term is null in expectation if the estimator $\hat{\vtheta}_T$ is asymptotically unbiased, which leaves an expected error given by the 2nd-order term $\frac{1}{2T_d}\mathrm{tr}(\nabla^2 J \mSigma)$ where $J$ is the chosen divergence. Essentially, one would replace the Fisher Information above, which is the Hessian for a forward-KL divergence, by the Hessian for a given divergence.

Finding the optimal noise that minimizes the distribution error means minimizing $\frac{1}{T_d} \mathrm{tr}(\mSigma \mI_F)$. Contrast that with the optimal noise that minimizes the parameter estimation error (asymptotic variance) $\frac{1}{T_d} \mathrm{tr}(\mSigma)$. We can reprise each of the three limit cases from the previous proofs, and derive novel optimal noise distributions:

\newpage

\begin{theorem}
    In the two limit cases of Theorem 1,
    the noise distribution minimizing the expected Kullback-Leibler divergence is given by
%    
    \begin{align}
        p_n^{\mathrm{opt}}(\vx) \propto p_d(\vx) \|\mI_F^{-\frac{1}{2}} \vg(\vx)\|
        \enspace .
    \end{align}
\end{theorem}
%
\textit{Proof:} case of $\nu \rightarrow \infty$

We recall the asymptotic variance $\frac{1}{T_d}\mSigma$ in the all-noise limit is given by equation~\ref{eq:allnoisemse} at the first order and without the trace. Multiplying by $\mI_F$ introduces no additional dependency in $p_n$, hence we retain the only term that was dependent on $p_n$, $\mI_F^{-2} \int \vg(\vx) \vg(\vx)^\top \frac{p_d^2}{p_n}(\vx)d\vx$, multiply it with $\mI_F$ and take the trace. This yields the following cost to minimize:
%
\begin{equation}
J (p_n)= \frac{1}{T} \int \|\mI_F^{-\frac{1}{2}} \vg(\vx)\|^2\frac{p_d^2}{p_n}(\vx)d\vx
\end{equation}
%
with respect to $p_n$. As in previous proofs, we compute the variational (Fr\'echet) derivative together with the Lagrangian of the constraint $\int p_n(\vx)d\vx=1$ (with $\lambda$ denoting the Lagrangian multiplier) to obtain
\begin{equation}
\delta_{p_n}J = - \|\mI_F^{-\frac{1}{2}} \vg\|^2\frac{p_d^2}{ p_n^2} + \lambda \enspace .
\end{equation}
Setting this to zero and taking into account the non-negativity  of $p_n$ gives
\begin{equation}
p_n(\vx) =   \|\mI_F^{-\frac{1}{2}} \vg(\vx) \| p_d(\vx) /Z
\end{equation}
where $Z=\int  \|\mI_F^{-\frac{1}{2}} \vg(\vx) \| p_d(\vx)d\vx$ is the normalization constant. This is thus the optimal noise distribution, as a first-order approximation. 

In the third case, the limit of all data, we have the following conjecture: 
%
\begin{conjecture}
    In the limit of Conjecture 1
    the noise distribution minimizing the expected Kullback-Leibler divergence is such that it is all concentrated at the set of those $\boldsymbol{\xi}$ which are given by
%
    \begin{align*}
        \arg\max_{\boldsymbol{\xi}} \,
        &p_d(\boldsymbol{\xi}) 
        \mathrm{tr} \bigg(
        (\vg(\xi)\vg(\xi)^\top)^{-\frac{1}{2}}
        \bigg)^{-1} \\
        \mathrm{s.t.} & \quad \vg(\xi) = \mathrm{constant}
        \numberthis 
    \end{align*}
\end{conjecture}
%
\textit{Proof: } case of $\nu \rightarrow 0$

By the same considerations, we can obtain the optimal noise that minimizes the asymptotic error in distribution space in the all-data limit, using equation~\ref{eq:alldatamsedominant} with a multiplication by $\mI_F$ inside the trace. This leads to the result.

\newpage

\section{Numerical Validation of the Predicted Distribution Error}
\label{sec:distriberrornumerical}

\begin{figure}[!ht] %!ht
\centering
\includegraphics[width=0.7\columnwidth]{img/kl_vs_noiseval.pdf}
\caption{KL vs. the noise parameter (Gaussian Mean). The noise proportion is fixed at $50\%$.
}
\label{fig:klvsnoiseval}
\end{figure} 

We here numerically validate our formulae predicting the asymptotic estimation error in distribution space $\mathcal{D}_{\mathrm{KL}}(p_d, p_{\hat{\theta}_{\mathrm{NCE}}})$, when the noise is constrained within a parametric family containing the data; here, the model is a one-dimensional centered Gaussian with unit variance, parameterized by its mean.


\bibliography{chehab_427}

\end{document}
