\documentclass[accepted]{uai2022} 
\usepackage[utf8]{inputenc}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amsthm}
\usepackage{wrapfig}
\usepackage{comment}
\usepackage{graphicx}
\usepackage{bm}
\usepackage[english]{babel}
\usepackage{natbib}
\usepackage{csquotes}
\usepackage[section]{placeins}
\usepackage{floatrow}
\usepackage{soul}
\usepackage{siunitx}
\usepackage[skip=2pt]{caption}
\usepackage[ruled]{algorithm2e}

\DeclareCaptionFormat{myformat}{\fontsize{6}{6}\selectfont#1#2#3}
\captionsetup{format=myformat, labelfont=bf}

\pdfpagewidth=8.5in
\pdfpageheight=11in

\usepackage{xcolor}
\newcount\Comments  % 0 suppresses notes to selves in text
\Comments=0
\definecolor{darkgreen}{rgb}{0,0.6,0}
\newcommand{\kibitz}[2]{\ifnum\Comments=1{\color{#1}{#2}}\fi}
\newcommand{\rmr}[1]{\kibitz{red}{[RESHEF: #1]}}
\newcommand{\tbs}[1]{\kibitz{blue}{[TSVIEL: #1]}}
\newcommand{\David}[1]{\kibitz{orange}{[DAVID: #1]}}

\newcount\cready
\cready=1
\newcommand{\crarxiv}[2]{\ifnum\cready=1{#1}\else{#2}\fi}

\def\calL{\mathcal{L}}
\def\calR{\mathcal{R}}
\def\calF{\mathcal{F}}
\def\XX{\boldsymbol{X}}

% an algorithm from matrix to answer vector
\newcommand{\alg}{\textsc{A}}
% an algorithm from matrix + variances to answer vector
\newcommand{\algv}{\textsc{A}^\sigma}
\newcommand{\ebBlue}{\textsc{EbBlue}^\sigma}
\newcommand{\ebAlg}[2]{\textsc{Eb}_{\textsc{#1}}^{#2}}
% estimate answers from a single worker
\newcommand{\estm}{\phi}
\newcommand{\estmv}{\phi^\sigma}
\newcommand{\EB}{\estm_{\textsc{EB}}}
\newcommand{\EBv}{\estmv_{\textsc{EB}}}
% estimate variance from matrix
\newcommand{\estn}{\psi}

\def\cite{\citep}

\newcommand\norm[1]{\left\lVert#1\right\rVert}
\newcommand\tnum[1]{\tiny\num{#1}}
\newcommand{\best}[1]{{\underline{\textbf{#1}}}}

\def\bm{\boldsymbol}
% \def\paragraph{\vspace{-0mm}\paragraph}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}{Definition}[section]
% \usepackage[margin=2cm]{geometry}
\sisetup{detect-weight=true, detect-family=true}
%\addbibresource{eb.bib}

\author[1]{\href{mailto:<tsviel@gmail.com>?Subject=Your UAI 2022 paper}{Tsviel Ben Shabat}{}}
\author[1]{Reshef Meir}
\author[1]{David Azriel}
\affil[1]{%
    Dept. of Industrial Engineering and Management\\
    Technion---Israel Institute of Technology
}

\title{Empirical Bayes approach to Truth Discovery problems - Supplementary material}


\begin{document}\onecolumn
\maketitle
\section{Deriving The Empirical Bayes Estimator (EBE)}\label{proof:EBEderive}
% \rmr{maybe add a few words on why you derive this part again if it is supposed to be known. Also to make it clear which parts are new.}
In Appendix A we present known results as a background for the Empirical Bayes approach.\\
On our path to deriving an Empirical Bayes Estimator (EBE) for $\vec{\mu}$ we first need to derive the appropriate Bayes estimator. Recall that $\mathcal{L}(\alg(X), \vec{\mu})$ is the squared Euclidean loss function, i.e., $\mathcal{L}(\alg(X), \vec{\mu}) = \|\vec{\mu} - \alg(X)\|_{2}^2$.
\paragraph{Bayesian Estimator}
 The Bayesian Estimator minimizes the Bayes risk which we will soon present, but first, we introduce the notion of the Prior distribution.
\paragraph{Prior probability distribution}
The Bayesian framework assumes a known prior; later we will relax this known-prior assumption and estimate its parameters. First, we assume $\vec{\mu} = (\mu_1,\dots,\mu_m)$ is normally distributed, that is, we assume
\begin{equation*}\label{prior0}
  \mu_j \sim \mathcal{N}_0 = \mathcal{N}(\mu_0, \sigma_0^2)  
\end{equation*}
For example, we can think of a group of workers answering ``what are the weights of the people in these images?''; the underlying ground truth is then the weights of people, which are close to normally distributed.
\paragraph{Observations $\vec{X} = (X_1,\dots,X_m)$}
Assuming a prior distribution over $\vec{\mu}$, (notice the difference from the preliminaries where $X_j \sim \mathcal{N}(\mu_j, \sigma^2)$), we now denote the conditional distribution $X_j| \mu_j \sim \mathcal{N}(\mu_j, \sigma^2)$ 
\paragraph{Posterior probability distribution $\mu_j | X_j$}
can be viewed as an update of the assumed prior distribution after viewing the observations. It is well known that the Bayesian Estimator (which we will later show is better than the BLUE estimator shown in Theorem 2.1) for the square loss function is the posterior mean, that is, $E_{\mu\sim N_0, \sigma}(\mu_j|X_j)= \operatorname*{arg\,min}_{\alg(X_j)}{E_{\mu\sim N_0, \sigma}(\alg(X_j)-\mu_j)^2}$, where the latter expectation is under the Bayesian framework, that is, $E_{\mu\sim N_0, \sigma}(\alg(X_j)-\mu_j)^2 = \int_\mu E_{\vec{\mu}, \vec{\sigma}}(\alg(X_j)-\mu)^2\,\mathcal{P}(\mu)\,d\mu$. Hence, to estimate $\mu_j$ via a Bayesian Estimator we first need to calculate the posterior mean.
\begin{theorem}\label{theoremPost}
The posterior distribution is $\mu_j|X_j \sim \mathcal{N}(\tilde{\mu}_j, \tilde{\sigma}^2)$, 
\begin{equation*}\label{posterior}
\tilde{\mu}_j = X_j * \frac{\sigma^2_{0}}{\sigma^2_{0} + \sigma^2} + \mu_0 * \frac{\sigma^2}{\sigma^2_{0} + \sigma^2}, \ \tilde{\sigma}^2 = \frac{\sigma^2_{0}\sigma^2}{\sigma^2_{0} + \sigma^2}
\end{equation*}
\end{theorem}
\begin{proof}
\begin{align*}
    \mathcal{P}(\mu_j|X_j) &= \frac{\mathcal{P}(X_j|\mu_j)\mathcal{P}(\mu_j)}{\mathcal{P}(X_j)} \propto \mathcal{P}(X_j|\mu_j)\mathcal{P}(\mu_j) \\
    &\propto e^{\frac{\frac{(X_j-\mu_j)^2}{\sigma^2} + \frac{(\mu_j-\mu_0)^2}{\sigma_0^2}}{-2}} \propto e^{\frac{\mu_j^2(\frac{1}{\sigma^2} + \frac{1}{\sigma_0^2}) -2\mu_j(\frac{X_j}{\sigma^2} + \frac{\mu_0}{\sigma_0^2})}{-2}} \propto e^{\frac{(\frac{1}{\sigma^2} + \frac{1}{\sigma_0^2})(\mu_j-\tilde{\mu}_j)^2}{-2}}
\end{align*}
where the last proportionality follows by completing the square, with \[\tilde{\mu}_j = \frac{X_j\sigma_0^2+\mu_0\sigma^2}{\sigma_0^2 + \sigma^2}, \qquad \tilde{\sigma}^2 = \Big(\frac{1}{\sigma^2} + \frac{1}{\sigma_0^2}\Big)^{-1} = \frac{\sigma_0^2\sigma^2}{\sigma_0^2 + \sigma^2}.\]
\end{proof}
\subsection{The Empirical Bayes Estimator (EBE)}
We now relax the previous known-prior-assumption, this subsection is the empirical part in Empirical Bayes. We will show how to estimate the posterior mean using the observed data.\\
First we will derive the marginal distribution of $X_j$, that is, we evaluate the following expression: $$\mathcal{P}_{\mu_j\sim \mathcal{N}_0, \sigma}(X_j)=\int_{\mu_j}{\mathcal{P}_{\mu_j, \sigma}(X_j|\mu_j)\mathcal{P}(\mu_j)}d\mu_j$$
But we already saw in the proof of Theorem~\ref{theoremPost} that $\mathcal{P}_{\mu_j, \sigma}(X_j|\mu_j)\mathcal{P}(\mu_j)$ is a (jointly) Gaussian density in $(X_j, \mu_j)$, so the marginal of $X_j$ is normal; thus what is left is to calculate its mean and variance.\\
By law of total expectation
\begin{equation*}
    E_{\mu_j\sim N_0, \sigma}(X_j) = E_{\mu_j \sim N_0, \sigma}(E_{\mu_j, \sigma}(X_j|\mu_j)) = E_{\mu_j \sim N_0, \sigma}(\mu_j) = \mu_0
\end{equation*}
By law of total variance
\begin{equation*}
    Var_{\mu_j \sim N_0, \sigma}(X_j) = E_{\mu_j \sim N_0, \sigma}(Var_{\mu_j, \sigma}(X_j|\mu_j)) + Var_{\mu_j \sim N_0, \sigma}(E_{\mu_j, \sigma}(X_j|\mu_j)) = \sigma^2 + \sigma_0^2
\end{equation*}
\paragraph{Empirical Bayes Estimator}
Similarly to the work \emph{An introduction to Empirical Bayes Data Analysis} [George Casella, 1985], we can construct an estimate for $\tilde{\mu}_j$ at (\ref{posterior}), denote $\bar{X} = \frac{1}{m} \sum_{j=1}^m X_j$

\begin{proposition}\label{propChi}
$E_{\mu\sim N_0, \sigma}[\frac{\sigma^2 + \sigma_0^2}{\sum_{j=1}^m(X_j-\bar{X})^2}] = \frac{1}{m-3}$
\end{proposition}
\begin{proof}
Notice that the expectation is over the marginal of $X_j$ and thus, the $X_j$'s are i.i.d which means the expectation is of an inverse-chi-squared distribution with $m-1$ degrees of freedom. 
\end{proof}

Immediate corollaries of Proposition~\ref{propChi} are:
\begin{corollary}\label{B_Hat}
$E_{\mu\sim N_0, \sigma}[\frac{(m-3)\sigma^2}{\sum_{j=1}^m(X_j-\bar{X})^2}] = \frac{\sigma^2}{\sigma^2 + \sigma_0^2}$
\end{corollary}

\begin{corollary}An Empirical Bayes Estimator is
\begin{equation*}\label{casella}
\EB(\vec{X}, \sigma) = \frac{(m-3)\sigma^2}{\sum_{j=1}^m (X_j-\bar{X})^2}\bar{X}\boldsymbol{1}  + \Big[1 - \frac{(m-3)\sigma^2}{\sum_{j=1}^m (X_j -\bar{X})^2}\Big]\vec{X}  
\end{equation*}

\end{corollary}

\section{Proofs}
\subsection{Theorem 2.2 (Casella, 1985's Theorem)}\label{proof:casella}

In the AWG model with a single worker and $m>3$ questions, 
\begin{equation}
    \mathcal{R}_{\vec{\mu},{\sigma}}(\EBv) < \mathcal{R}_{\vec{\mu},\sigma}(\estm_I) \ \text{ for all }\vec{\mu} \in \mathbb{R}^m, {\sigma} \in \mathbb{R}^+.
\end{equation}    

\begin{proof}
Denote $\hat{B} = \frac{(m-3)\sigma^2}{S^2(\vec{X})}$
\begin{align*}
   \mathcal R_{\vec{\mu}, \sigma}(\EB) &= E_{\vec{\mu}, \sigma}[\lVert \bar{X} + (1-\hat{B})(X - \bar{X}) - \mu \rVert^2] \\
    &=\Sigma_{j=1}^m E_{\vec{\mu}, \sigma}[(\bar{X} + (1-\hat{B})(X_j - \bar{X}) - \mu_j)^2]\\
    &= \Sigma_{j=1}^m E_{\vec{\mu}, \sigma}[(X_j - \mu_j - \hat{B}(X_j - \bar{X}))^2]\\
    &= \Sigma_{j=1}^m \Big(E_{\vec{\mu}, \sigma}[(X_j - \mu_j)^2] - 2E_{\vec{\mu}, \sigma}[ \hat{B}(X_j - \mu_j)(X_j - \bar{X})] + E_{\vec{\mu}, \sigma}[(\hat{B}(X_j - \bar{X}))^2]\Big)\\
    &= \mathcal{R}_{\vec{\mu},\sigma}(\estm_I) - [\Sigma_{j=1}^m2E_{\vec{\mu}, \sigma}[ \hat{B}(X_j - \mu_j)(X_j - \bar{X})]] + \sigma^4(m-3)^2E_{\vec{\mu}, \sigma}[\frac{1}{S^2(\vec{X})}]
\end{align*}
\begin{lemma}[Stein's Lemma]
Let $X\sim N(\mu, \sigma^2)$ and let $g$ be a function for which $E_{\vec{\mu}, \sigma}[g(X)(X-\mu)]$ and $E_{\vec{\mu}, \sigma}[\frac{d}{dx}g(X)]$ both exist. Then $E_{\vec{\mu}, \sigma}[g(X)(X-\mu)] = \sigma^2E_{\vec{\mu}, \sigma}[\frac{d}{dx}g(X)]$.
\end{lemma}
We now focus on the mixed term:\\
Denote $g(X_j) = \frac{X_j - \bar{X}}{S^2(\vec{X})}$\\
\begin{align*}
    E_{\vec{\mu}, \sigma}[ \hat{B}(X_j - \mu_j)(X_j - \bar{X})] &= \sigma^2(m-3)E_{\vec{\mu}, \sigma}[\frac{X_j-\bar{X}}{S^2(\vec{X})}(X_j-\mu_j)]\\
    &= \sigma^2(m-3)E_{\vec{\mu}, \sigma}[E_{\vec{\mu}, \sigma}[\frac{X_j-\bar{X}}{S^2(\vec{X})}(X_j-\mu_j)|X_1,\dots,X_{j-1}, X_{j+1},\dots, X_m]]\\
    &=\sigma^2(m-3)E_{\vec{\mu}, \sigma}[E_{\vec{\mu}, \sigma}[g(X_j)(X_j-\mu_j)|X_1,\dots,X_{j-1}, X_{j+1},\dots, X_m]]\\
    &=_{stein}\sigma^4(m-3)E_{\vec{\mu}, \sigma}[E_{\vec{\mu}, \sigma}[\frac{d}{dX_j}g(X_j)|X_1,\dots,X_{j-1}, X_{j+1},\dots, X_m]]\\
    &= \sigma^4(m-3)E_{\mu,
    \sigma}[\frac{d}{dX_j}g(X_j)]\\
    \frac{d}{dX_j}g(X_j) &= \frac{(1 - \frac{1}{m})S^2(\vec{X}) -2(X_j-\bar{X})^2}{\big (S^2(\vec{X})\big )^2}
\end{align*}
The mixed term can now be rewritten as follows:
\begin{align*}
    -\Sigma_{j=1}^m2E_{\vec{\mu}, \sigma}[ \hat{B}(X_j - \mu_j)(X_j - \bar{X})] &= -2\sigma^4(m-3)E_{\vec{\mu}, \sigma}[\frac{(1 - \frac{1}{m})mS^2(\vec{X}) -2S^2(\vec{X})}{\big ( S^2(\vec{X}) \big )^2}]\\
    &= -2\sigma^4(m-3)^2E_{\vec{\mu}, \sigma}[\frac{1}{S^2(\vec{X})}]
\end{align*}
Summing up everything:
\begin{align*}
 \mathcal R_{\vec{\mu}, \sigma}(\EB) &=\\
 &= \mathcal{R}_{\vec{\mu},\sigma}(\estm_I) -2\sigma^4(m-3)^2E_{\vec{\mu}, \sigma}[\frac{1}{S^2(\vec{X})}] + \sigma^4(m-3)^2E_{\vec{\mu}, \sigma}[\frac{1}{S^2(\vec{X})}]\\
 &= \mathcal{R}_{\vec{\mu},\sigma}(\estm_I) - \sigma^4(m-3)^2E_{\vec{\mu}, \sigma}[\frac{1}{S^2(\vec{X})}] \ \forall m > 3
\end{align*}

\end{proof}
\subsection{Proposition 3.1}\label{suff_statistic}
$\algv_B(\boldsymbol{X})$ is  a sufficient statistic for $\vec{\mu} = (\mu_1,..,\mu_m)$ under $\mathcal{P}_{\vec{\mu}, \vec{\sigma}}$.
\begin{proof}Since the regularity conditions hold for a multiplication of independent normal distributions, to prove sufficiency by the Fisher-Neyman factorization theorem, we only need to show that the model can be represented as a multiplication of a function $\tilde{h}$ of the observations and a function $\tilde{g}$ of a sufficient statistic and the unknown parameter as follows:
\begin{align*}
    \mathcal{P}_{(\mu_1,..,\mu_m, \sigma_1, \dots, \sigma_n)}(X_{11},..,X_{nm}) &= \Pi_{j=1}^m\Pi_{i=1}^n\mathcal{P}_{\mu_j, \sigma_i}(X_{ij})\\
    &= \Pi_{j=1}^m\Pi_{i=1}^n\frac{1}{\sigma_i\sqrt{2\pi}}e^{-\frac{1}{2\sigma_i^2}(X_{ij}-\mu_j)^2}\\ 
    &= \Pi_{j=1}^m[C \cdot e^{-\frac{1}{2}\sum_{i=1}^n\frac{X_{ij}^2}{\sigma_i^2}}]*[e^{-\frac{1}{2}\mu_j^2\sum_{i=1}^n\frac{1}{\sigma_i^2} + \mu_j \sum_{i=1}^n\frac{X_{ij}}{\sigma_i^2}}]\\ 
    &= \Pi_{j=1}^m h(X_{1j},..,X_{nj})g(\delta_j(X), \mu_j)\\
    &=\tilde{h}(X_{11},..,X_{nm})\tilde{g}(\delta(X), \mu)
\end{align*}
where C is a constant.
\end{proof}
\subsection{Theorem 4.1}\label{proof:general_model}
For any unbiased algorithm $\alg$, and $m>3$,\footnote{Since in this subsection we do not assume that the distribution of $\XX$ follows the AWG model, we do not need a parameter for the individual competence. Other than that, all definitions remain the same.}

$$\calR_{\vec\mu}(\ebAlg{\alg}{\estn}) <\calR_{\vec\mu}(\alg) \text{ for all }\vec{\mu} \in \mathbb{R}^m $$
if and only if
\begin{equation}
  2(m-3)\Sigma_{j=1}^m Cov\Big(X_j^A, \frac{\estn(\XX)(X_j^A-\bar{X}^A)}{\|\vec{X}^A -\bar{X}^A{\vec{1}}\|^2}\Big) 
  -(m-3)^2E_{\vec\mu}\Big(\frac{(\estn(\XX))^2}{\|\vec{X}^A -\bar{X}^A{\vec{1}}\|^2}\Big) >  0.
\end{equation}
For convenience we denote $\vec{X} = \vec{X}^A$, $\hat{\sigma}^2 = \estn(\XX)$, $S^2(\vec{X}) = \|\vec{X} -\bar{X}\vec{1}\|^2$ and $\alpha = m-3$.
\begin{proof}
\begin{align*}
\calR_{\vec\mu}(\ebAlg{\alg}{\estn}) &= E_{\vec{\mu}}[\lVert \bar{X} + (1-\frac{\alpha\hat{\sigma}^2}{S^2(\vec{X})})(X - \bar{X}) - \mu \rVert^2] \\
    &= E_{\vec{\mu}}[\|(X - \mu)\|^2] - 2E_{\vec{\mu}}[ \frac{\alpha\hat{\sigma}^2}{S^2(\vec{X})}(X - \mu)^T(X - \bar{X})] + E_{\vec{\mu}}[\|\frac{\alpha\hat{\sigma}^2}{S^2(\vec{X})}(X - \bar{X})\|^2]\\
    &= \calR_{\vec\mu}(\alg) -2\alpha E_{\vec{\mu}}[ \frac{\hat{\sigma}^2}{S^2(\vec{X})}(X - \mu)^T(X - \bar{X})] + \alpha^2E_{\vec{\mu}}[\frac{\hat{\sigma}^4}{S^2(\vec{X})}]\\
    &\text{Focusing on the mixed term:}\\
    E_{\vec{\mu}}[ \frac{\hat{\sigma}^2}{S^2(\vec{X})}(X - \mu)^T(X - \bar{X})] &= \sum_{j=1}^mE_{\vec{\mu}}[ \frac{\hat{\sigma}^2}{S^2(\vec{X})}(X_j - \mu_j)(X_j - \bar{X})]\\
    &=_{E_{\vec{\mu}}(X_j)=\mu_j} \sum_{j=1}^m Cov\Big(X_j - \mu_j, \frac{\hat{\sigma}^2(X_j - \bar{X})}{S^2(\vec{X})}\Big)\\
    &= \sum_{j=1}^m Cov\Big(X_j, \frac{\hat{\sigma}^2(X_j - \bar{X})}{S^2(\vec{X})}\Big)\\
    \text{Therefore we get:}\\
    \calR_{\vec\mu}(\alg) - \calR_{\vec\mu}(\ebAlg{\alg}{\estn}) &=  2\alpha\sum_{j=1}^m Cov\Big(X_j, \frac{\hat{\sigma}^2(X_j - \bar{X})}{S^2(\vec{X})}\Big) - \alpha^2E_{\vec{\mu}}[\frac{\hat{\sigma}^4}{S^2(\vec{X})}]\\
\end{align*}
\end{proof}
\subsection{Proposition 4.2}\label{proof:alpha_star}
Denote $\psi(\XX) = \hat{\sigma}^2$\\
Choosing $\alpha^* = \frac{\Sigma_{j=1}^m Cov\Big(X_j^A, \frac{\hat{\sigma}^2(X_j-\bar{X})}{S^2(\vec{X}^A)}\Big)}{E_{\vec{\mu}}\Big(\frac{\hat{\sigma}^4}{S^2(\vec{X}^A)}\Big)}$ minimizes $\calR_{\vec{\mu}}(\ebAlg{\alg}{\estn})$.

\begin{proof}
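Denote $C := \Sigma_{j=1}^m Cov\Big(X_j^A, \frac{\hat{\sigma}^2(X_j^A-\bar{X}^A)}{S^2(\vec{X}^A)}\Big)$ and $E := E_{\vec{\mu}}\Big(\frac{\hat{\sigma}^4}{S^2(\vec{X}^A)}\Big)$. Then, from the proof of Theorem 4.1, $\calR_{\vec\mu}(\alg) - \calR_{\vec\mu}(\ebAlg{\alg}{\estn}) = 2\alpha C -\alpha^2E$, so we wish to maximize the parabola $2\alpha C -\alpha^2E$ (concave in $\alpha$ whenever $E > 0$); setting its derivative $2C - 2\alpha E$ to zero shows that it is maximized at $\alpha^* = \frac{C}{E}$.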
\end{proof}
\subsection{Theorem 4.3}\label{proof:normalModel}
Denote $\psi(\vec{X})=\hat{\sigma}^2$ as an estimator of $\sigma^2$.


\begin{multline}
  \calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) = \calR_{\vec{\mu}, \vec{\sigma}}(\alg) +  \frac{(m-3)^2}{m-1}\Big(E_{\vec{\mu}, \vec{\sigma}}[\frac{(\estn(\vec X^A))^2}{S^2(\vec{X}^A)}]
  - 2{\sigma}^2\Big[E_{\vec{\mu}, \vec{\sigma}}[\frac{\estn(\vec X^A)}{S^2(\vec{X}^A)}] + E_{\vec{\mu}, \vec{\sigma}}\big[\frac{\Sigma_{j=1}^m\frac{d\psi(\vec X^A)}{dX^A_j}(X_j^A - \bar{X}^A)}{(m-3)S^2(\vec{X}^A)}\big]\Big]\Big)
\end{multline}

For convenience we denote $\vec{X} = \vec{X}^A$
\begin{proof}
Denote $g(X) = \frac{\hat{\sigma}^2(X_j - \bar{X})}{S^2(\vec{X})}$ and $\hat{B} = \frac{(m-3)\hat{\sigma}^2}{S^2(\vec{X})}$.
\begin{align*}
    \calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) &= E_{\vec{\mu}, \vec{\sigma}}[\lVert \bar{X} + (1-\hat{B})(X - \bar{X}) - \mu \rVert^2] \\
    &= \Sigma_{j=1}^m \Big(E_{\vec{\mu}, \vec{\sigma}}[(X_j - \mu_j)^2] - 2E_{\vec{\mu}, \vec{\sigma}}[ \hat{B}(X_j - \mu_j)(X_j - \bar{X})] + E_{\vec{\mu}, \vec{\sigma}}[(\hat{B}(X_j - \bar{X}))^2]\Big)\\
    &= \calR_{\vec{\mu}, \vec{\sigma}}(\alg) - 2(m-3)[\Sigma_{j=1}^m E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2(X_j - \mu_j)(X_j - \bar{X})}{S^2(\vec{X})}]] + (m-3)^2E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^4}{S^2(\vec{X})}]\\
    E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2(X_j - \mu_j)(X_j - \bar{X})}{S^2(\vec{X})}] &= E_{\vec{\mu}, \vec{\sigma}}[g(X)(X_j - \mu_j)]\\
    &=_{stein} \sigma^2E_{\vec{\mu}, \vec{\sigma}}[\frac{d}{dX_j}g(X)]
    \end{align*}
    \begin{align*}
    \frac{d}{dX_j}g(X) &= \frac{d}{dX_j}\frac{\hat{\sigma}^2(X_j - \bar{X})}{S^2(\vec{X})}\\
    &= \frac{[(1-\frac{1}{m})\hat{\sigma}^2 + \frac{d\hat{\sigma}^2}{dX_j}(X_j - \bar{X})]S^2(\vec{X}) - 2\hat{\sigma}^2(X_j - \bar{X})^2}{S^4}\\
    &= \frac{\hat{\sigma}^2[(1-\frac{1}{m})S^2(\vec{X}) - 2(X_j - \bar{X})^2]+\frac{d\hat{\sigma}^2}{dX_j}(X_j - \bar{X})S^2(\vec{X})}{\big (S^2(\vec{X}) \big )^2}\\
    \Sigma_{j=1}^m E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2(X_j - \mu_j)(X_j - \bar{X})}{S^2(\vec{X})}] &= \sigma^2(E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2 \Sigma_{j=1}^m\frac{(1-\frac{1}{m})S^2(\vec{X}) - 2(X_j - \bar{X})^2}{\big (S^2(\vec{X}) \big )^2}] \\
    &~~~~~~~~+ E_{\vec{\mu}, \vec{\sigma}}[\frac{S^2(\vec{X}) \Sigma_{j=1}^m\frac{d\hat{\sigma}^2}{dX_j}(X_j - \bar{X})}{\big (S^2(\vec{X}) \big )^2}])\\
    &= \sigma^2[(m-3)E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2}{S^2(\vec{X})}] + E_{\vec{\mu}, \vec{\sigma}}[\frac{\Sigma_{j=1}^m\frac{d\hat{\sigma}^2}{dX_j}(X_j - \bar{X})}{S^2(\vec{X})}]]
\end{align*}
Also mind that:
\begin{align*}
    E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2(X-\mu)^T(X-\bar{X})}{S^2(\vec{X})}] &= cov(\hat{\sigma}^2,\frac{(X-\mu)^T(X-\bar{X})}{S^2(\vec{X})}) + E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]E_{\vec{\mu}, \vec{\sigma}}[\frac{(X-\mu)^T(X-\bar{X})}{S^2(\vec{X})}]\\
    &=_{stein} cov(\hat{\sigma}^2,\frac{(X-\mu)^T(X-\bar{X})}{S^2(\vec{X})}) + (m-3)\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]E_{\vec{\mu}, \vec{\sigma}}[\frac{1}{S^2(\vec{X})}]\\
    E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2}{S^2(\vec{X})}] &= cov(\hat{\sigma}^2, \frac{1}{S^2(\vec{X})}) + E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]E_{\vec{\mu}, \vec{\sigma}}[\frac{1}{S^2(\vec{X})}]\\
\end{align*}
And therefore we get that:
\begin{align*}
    \sigma^2&E_{\vec{\mu}, \vec{\sigma}}[\frac{\Sigma_{j=1}^m\frac{d\hat{\sigma}^2}{dX_j}(X_j - \bar{X})}{S^2(\vec{X})}] = cov(\hat{\sigma}^2,\frac{(X-\mu)^T(X-\bar{X})}{S^2(\vec{X})}) \\
    &~~~~+ (m-3)\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]E_{\vec{\mu}, \vec{\sigma}}[\frac{1}{S^2(\vec{X})}] - \sigma^2(m-3)E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2}{S^2(\vec{X})}]\\
    &= cov(\hat{\sigma}^2,\frac{(X-\mu)^T(X-\bar{X})}{S^2(\vec{X})})\\ &~~~~+(m-3)\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]E_{\vec{\mu}, \vec{\sigma}}[\frac{1}{S^2(\vec{X})}] -\sigma^2(m-3)(cov(\hat{\sigma}^2, \frac{1}{S^2(\vec{X})}) + E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]E_{\vec{\mu}, \vec{\sigma}}[\frac{1}{S^2(\vec{X})}])\\
    &= cov(\hat{\sigma}^2,\frac{(X-\mu)^T(X-\bar{X})}{S^2(\vec{X})}) -\sigma^2(m-3)(cov(\hat{\sigma}^2, \frac{1}{S^2(\vec{X})}))\\
    &= cov(\hat{\sigma}^2,\frac{(X-\mu)^T(X-\bar{X}) - \sigma^2(m-3)}{S^2(\vec{X})})
\end{align*}

Plugging everything:
\begin{align*}
    \mathcal{R}_{\vec{\mu}, \vec{\sigma}}(\hat{\mu}, \mu) &= \calR_{\vec{\mu}, \vec{\sigma}}(\alg) - 2\sigma^2(m-3) \Big((m-3)E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2}{S^2(\vec{X})}] + E_{\vec{\mu}, \vec{\sigma}}\Big[\frac{\Sigma_{j=1}^m\frac{d\hat{\sigma}^2}{dX_j}(X_j - \bar{X})}{S^2(\vec{X})}\Big]\Big)\\
    &~~~~+ (m-3)^2E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^4}{S^2(\vec{X})}]\\
    &= \calR_{\vec{\mu}, \vec{\sigma}}(\alg) + (m-3)^2\Big(E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^4}{S^2(\vec{X})}] - 2\sigma^2\Big[E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2}{S^2(\vec{X})}] + E_{\vec{\mu}, \vec{\sigma}}[\frac{\Sigma_{j=1}^m\frac{d\hat{\sigma}^2}{dX_j}(X_j - \bar{X})}{(m-3)S^2(\vec{X})}]\Big]\Big)\\
    &= \calR_{\vec{\mu}, \vec{\sigma}}(\alg) + (m-3)^2(E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^4}{S^2(\vec{X})}] - 2\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\frac{\hat{\sigma}^2}{S^2(\vec{X})}])\\
    &~~~~-2(m-3)cov\Big(\hat{\sigma}^2,\frac{(X-\mu)^T(X-\bar{X}) - \sigma^2(m-3)}{S^2(\vec{X})}\Big)
\end{align*}
\end{proof}
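In the derivation above, $S^2(\vec{X})$ stands for the sum of squares $\sum_j(X_j-\bar X)^2$. Denoting by $\bar Y:=\frac{1}{m}\sum_jY_j$ and $S^2(Y):= \frac1{m-1}\sum_j(Y_j-\bar Y)^2$ the \emph{mean} and the \emph{sample variance} of a vector $Y$, respectively, we have $\sum_j(Y_j-\bar Y)^2 = (m-1)S^2(Y)$; substituting this into the last display produces the factor $\frac{(m-3)^2}{m-1}$, which yields the result.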

\subsection{Corollary 4.3.2}\label{proof:meanAdjusted}
Under the Normal model,  if $\estn$ is mean-adjusted then 
\begin{align*}
    &\mathcal{R}_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) < \mathcal{R}_{\vec{\mu}, \vec{\sigma}}(\alg) 
    \\ &+ 
    \frac{(m-3)^2}{m-1}(E_{\vec{\mu}, \vec{\sigma}}[\frac{(\estn(\vec X^A))^2}{S^2(\vec{X}^A)}] - 2{\sigma}^2E_{\vec{\mu}, \vec{\sigma}}[\frac{\estn(\vec X^A)}{S^2(\vec{X}^A)}])
\end{align*}

\begin{proof}
From theorem 4.3:
$$\calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) = \calR_{\vec{\mu}, \vec{\sigma}}(\alg) +  \frac{(m-3)^2}{m-1}\Big(E_{\vec{\mu}, \vec{\sigma}}[\frac{(\estn(\vec X^A))^2}{S^2(\vec{X}^A)}]
  - 2{\sigma}^2\Big[E_{\vec{\mu}, \vec{\sigma}}[\frac{\estn(\vec X^A)}{S^2(\vec{X}^A)}] + E_{\vec{\mu}, \vec{\sigma}}\big[\frac{\Sigma_{j=1}^m\frac{d\psi(\vec X^A)}{dX^A_j}(X_j^A - \bar{X}^A)}{(m-3)S^2(\vec{X}^A)}\big]\Big]\Big)$$

Under the assumptions made it is easy to see that: 
\begin{align*}
    E_{\vec{\mu}, \vec{\sigma}}\Big[\frac{\Sigma_{j=1}^m\frac{d\psi(\vec X^A)}{dX^A_j}(X_j^A - \bar{X}^A)}{(m-3)S^2(\vec{X}^A)}\Big] &=  E_{\vec{\mu}, \vec{\sigma}}\Big[\frac{\Sigma_{j=1}^m\frac{d\psi(\vec X^A)}{dX^A_j}(X_j^A - \bar{X}^A)1_{X_j^A \leq \bar{X}^A}}{(m-3)S^2(\vec{X}^A)}\Big] \\
    &~~~+ E_{\vec{\mu}, \vec{\sigma}}\Big[\frac{\Sigma_{j=1}^m\frac{d\psi(\vec X^A)}{dX^A_j}(X_j^A - \bar{X}^A)1_{X_j^A > \bar{X}^A}}{(m-3)S^2(\vec{X}^A)}\Big] \\
    &> 0
\end{align*}
And the result immediately follows.
\end{proof}
\subsection{Additional Corollaries}\label{apx:additional_corollaries}
\begin{corollary}\label{indepcase}
If $\hat{\sigma}^2$ is independent of $X_j^A (\frac{d\hat{\sigma}^2}{dX_j^A} = 0) \ \forall j$ then directly from Theorem 4.3 we get that:
$$\calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) = \calR_{\vec{\mu}, \vec{\sigma}}(\alg) + E_{\vec{\mu},  \vec{\sigma}}[\frac{1}{S^2(\vec{X}^A)}](m-3)^2 (E_{\vec{\mu},  \vec{\sigma}}[\hat{\sigma}^4]-2\sigma^2E_{\vec{\mu},  \vec{\sigma}}[\hat{\sigma}^2])$$
and $\calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) < \calR_{\vec{\mu}, \vec{\sigma}}(\alg)$ if
\begin{align}
  \frac{E_{\vec{\mu},  \vec{\sigma}}[\hat{\sigma}^4]}{E_{\vec{\mu},  \vec{\sigma}}[\hat{\sigma}^2]} < 2\sigma^2  
\end{align}
\end{corollary}
\begin{corollary}
Assume $\hat{\sigma}^2$ is independent of $X_j \ \forall j$ and that $\exists \epsilon > 0$ $|\hat{\sigma}^2 - \sigma^2| < \epsilon$ w.p 1 then $\calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) < \calR_{\vec{\mu}, \vec{\sigma}}(\alg)$ if
\begin{equation}
\epsilon \in (0, \sigma^2) \  
\end{equation}
\end{corollary}
\begin{proof}
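By Corollary~\ref{indepcase}, it suffices that $E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^4] < 2\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]$. Since $\hat{\sigma}^2 < \sigma^2 + \epsilon$ w.p.\ 1, the first inequality below always holds, and the second holds when $\epsilon < \sigma^2$: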
\begin{align*}
    E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^4] &< (\sigma^2 + \epsilon)E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2] <  2\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]\\
    (\sigma^2 + \epsilon) &< 2\sigma^2\\
    0 &< \epsilon < \sigma^2
\end{align*}
\end{proof}
\begin{corollary}\label{single_worker_corn}
Assume $\hat{\sigma}^2$ is independent of $X_j \ \forall j$ and that $\exists \epsilon > 0$ such that $|\hat{\sigma}^2 - \sigma^2| < \epsilon$ w.p.\ $\delta$, and $\exists B$ such that $\hat{\sigma}^4 < B$ w.p.\ $1$. Then $\calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) < \calR_{\vec{\mu}, \vec{\sigma}}(\alg) \ \forall \mu \in \mathbb{R}^m, m>3$ if
\begin{equation} \left\{
\begin{array}{@{}rl@{}}
B &< \frac{\delta}{1-\delta}\sigma^4\\
\epsilon &\in \big(0, -2\sigma^2 + \sqrt{5\sigma^4 + B(1 - \frac{1}{\delta})}\big)
\end{array}
\right .
\end{equation}
\end{corollary}
\begin{figure}
        \includegraphics[width=5cm]{biased_oracle.jpg}
        \centering
        \caption{EBE for 5 aggregated workers with biased $\hat{\sigma}^2$ vs BLUE; each data point is a 100,000 iteration average, each iteration includes new GT and new workers' responses}
\end{figure}
We know from Corollary~\ref{indepcase} that, under the normal model, when the competence estimator is independent of the observations $X$, EBE dominates BLUE if $E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^4] < 2\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2]$.

Denote the event $G = \{|\sigma^2 - \hat{\sigma}^2| < \epsilon \}$
\begin{align*}
    E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^4] &= E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^4|G]\delta + E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^4|G^c](1-\delta)\\
    &\leq (\sigma^2 + \epsilon)^2\delta + B(1-\delta)\\
    2\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2] &= 2\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2|G]\delta + 2\sigma^2E_{\vec{\mu}, \vec{\sigma}}[\hat{\sigma}^2|G^c](1-\delta)\\
    &> 2\sigma^2(\sigma^2 - \epsilon)\delta
\end{align*}
Therefore it is sufficient to require that:
\begin{align*}
    (\sigma^2 + \epsilon)^2\delta + B(1-\delta) &< 2\sigma^2(\sigma^2 - \epsilon)\delta\\
    0 &< -\epsilon^2 -4\epsilon\sigma^2 + \sigma^4 + B(1 - \frac{1}{\delta})
\end{align*}
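The right-hand side is a concave parabola in $\epsilon$ with roots $\epsilon=-2\sigma^2 \pm \sqrt{5\sigma^4 + B(1 - \frac{1}{\delta})}$, so it is positive exactly between the two roots. Notice that $B(1 - \frac{1}{\delta}) \leq 0$; thus, for the larger root to be positive we require that $2\sigma^2 < \sqrt{5\sigma^4 + B(1 - \frac{1}{\delta})}$, i.e., $B(\frac{1}{\delta} - 1) < \sigma^4$. With simple algebra we derive the following conditions: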
\begin{equation*}\left\{
\begin{array}{@{}rl@{}}
B &< \frac{\delta}{1-\delta}\sigma^4\\
\epsilon &\in \big(0, -2\sigma^2 + \sqrt{5\sigma^4 + B(1 - \frac{1}{\delta})}\big)\\
\end{array}
\right .
\end{equation*}
Notice that when $\delta \to 1$ we get that $\epsilon \in (0, \sigma^2(\sqrt{5} - 2))$. This upper bound is less than one fourth of the bound $\sigma^2$ obtained in the case $\delta=1$; this stricter result is due to the bounds we had to use to derive it.

\section{Deterministic Estimators for Multiple Workers}\label{apx:deterministic_est}
In this model we assume that some oracle guessed and told us all of the different $\hat{\sigma}^2_i$, and thus we treat them as constants (i.e., independent of the data $\XX$). We would like to know how close the oracle has to be to the actual competences such that EBE would still have lower risk than the BLUE estimated by some algorithm A, i.e., $\calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) < \calR_{\vec{\mu}, \vec{\sigma}}(\alg)$.\par
Let $X_{ij} \sim \mathcal{N}(\mu_j, \sigma_i^2)$ for $i = 1,\dots,n$ and $j = 1,\dots,m$, and denote by $\hat{\sigma}^2_i$ an estimate of $\sigma^2_i$, obtained in some way, for every $i$.
Notice that the BLUE estimator (BLUE-aggregated worker) for $\mu_j$ is:
$$\hat{X}_j^A = (\Sigma_{i=1}^n\frac{1}{\hat{\sigma}^2_i})^{-1}\Sigma_{i=1}^n \frac{X_{ij}}{\hat{\sigma}^2_i}$$

 Then, notice that $\hat{X}_j^A$ is a linear combination of independent normal random variables therefore normal, i.e, $\hat{X}_j \sim N(\mu_j, \sigma^2)$ where:
\begin{align*}
    E_{\vec{\mu}, \vec{\sigma}}(\hat{X}_j^A) &= E_{\vec{\mu}, \vec{\sigma}}((\Sigma_{i=1}^n\frac{1}{\hat{\sigma}^2_i})^{-1}\Sigma_{i=1}^n \frac{X_{ij}}{\hat{\sigma}^2_i})= \mu_j\\
    Var_{\vec{\mu}, \vec{\sigma}}(\hat{X}_j^A) &= (\Sigma_{i=1}^n\frac{1}{\hat{\sigma}^2_i})^{-2}\Sigma_{i=1}^n \frac{\sigma_i^2}{\hat{\sigma}^4_i} =: \sigma^2
\end{align*}
Therefore, we can reduce this case to the case of single worker where: %\rmr{something messed up here:}
\begin{align*}
  \hat{\sigma}^2 = (\Sigma_{i=1}^n\frac{1}{\hat{\sigma}^2_i})^{-2}\Sigma_{i=1}^n \frac{\hat{\sigma}_i^2}{\hat{\sigma}^4_i} = (\Sigma_{i=1}^n\frac{1}{\hat{\sigma}^2_i})^{-1}\\  
  \ebAlg{\alg}{\estn}(\XX)_j =\hat{\bar{X}}^A + [1 - \frac{(m-3)\hat{\sigma}^2}{S^2(\vec{X}^A)}](\hat{X}_j^A - \hat{\bar{X}}^A)\\
  S^2(\vec{X}^A) = \sum_{j=1}^m(\hat{X}_j^A - \hat{\bar{X}}^A)^2
\end{align*}
\begin{proposition}
Under the Oracle model $\calR_{\vec{\mu}, \vec{\sigma}}(\ebAlg{\alg}{\estn}) < \calR_{\vec{\mu}, \vec{\sigma}}(\alg) \ \forall \mu \in \mathbb{R}^m, m>3$ if: 
\begin{equation}
\hat{\sigma}^2 < 2\sigma^2    
\end{equation}
\end{proposition}
\begin{proof}
Since we assumed an Oracle model (constant guesses of $\sigma^2$) and showed that $\hat{X}^A$ is following a normal distribution applying corollary \ref{indepcase} yields the result.
\end{proof}

\end{document}