% \vspace{-4mm}
\section{Confident Sampling Algorithm}
\label{sec:sampling}
% \textcolor{red}{TODO: Is this type of algorithm related to any other existing sampling algorithms? [not sure]
% Related to bandit, but is there any other field that might use this type of algorithm?[see discussion in the related words]
% }
% \textcolor{red}{TODO: Emphasize our key insight where we need to sample less if the true value is far from the threshold, that we don't need to approximate $f$ to a close precision.}

In this section, we propose and analyze the \samplong (\samp) algorithm. \samp is used in order to determine if the expected value of a random variable $X$ is approximately above or below a threshold value with high probability. \samp works for any random variable that is $R$-sub-Gaussian (see Theorem \ref{thm:sampling}) or bounded in the range of $[0,R]$ (see Theorem \ref{thm:sampling2}). 
% \textcolor{red}{TODO: Describe in what generality \samp works}. 
% \samp is useful as a subroutine for a variety of submodular maximization algorithms where noisy queries are made to $f$, since many require determining if the value of a submodular function $f(X)$ on some set $X$ is above or below a threshold. 
In Sections \ref{sec:monotone}, \ref{sec:nonmono}, and \ref{sec:matroid}, we show that \samp is useful as a subroutine for a variety of submodular maximization algorithms where we only have noisy access to the marginal gains. 

We now describe \samp. \samp takes as input a failure probability $\delta\in\mathbb{R}_{>0}$, a threshold error parameter $\epsilon\in\mathbb{R}_{>0}$, a threshold value $w\in\mathbb{R}_{>0}$, the unknown distribution $\mathcal{D}_X$ of the random variable $X$, and the sub-Gaussian parameter $R$. \samp iteratively takes at most $N_1$ samples from $\mathcal{D}_X$, while maintaining a sample average and a confidence interval. In particular, $\widehat{X}_t$ is the sample average after taking $t$ samples of $X$. The confidence region, after taking the $t$-th sample of $X$, is a shrinking region $[\widehat{X}_t-C_t, \widehat{X}_t+C_t]$ around $\widehat{X}_t$ that reflects where \samp is almost certain that the true value of $\mE X$ lies. We leave the exact definition of both $C_t$ and $N_1$ until Theorems \ref{thm:sampling} and \ref{thm:sampling2} for reasons that will become clear. Once the lower bound of the confidence region crosses $w-\epsilon$, or the upper bound crosses $w+\epsilon$, \samp completes and returns true or false, respectively. If neither happens within $N_1$ samples, \samp returns true or false according to whether the final sample average is at least $w$ (see Algorithm \ref{alg:samp}). {Note that the \samp algorithm differs significantly from the fixed-$\epsilon$ approximation approach commonly used in the submodular optimization literature, such as Algorithm 2 in \cite{fahrbach2019submodular}. A detailed discussion of this distinction is in Section \ref{appdx:compare_to_fixed_eps_approx} of the appendix.} 
% \samp stops sampling in at most $N_1$ samples regardless, and in this case returns true or false according to whether $\widehat{X}_t\geq w$. An illustration of the various states of \samp is depicted in Figure \ref{fig:samp}, and pseudocode for \samp is provided in Algorithm \ref{alg:samp}.



\input{sections/figures/stream}


\begin{algorithm}[t]
\caption{\samplong (\samp)}
\label{alg:samp}
\begin{algorithmic}[1]
    \STATE \textbf{Input:} $w$, $\epsilon$, $\delta$, $\mathcal{D}_X$, $R$
    \FOR{$t=1,2,\dots,N_1$}\label{line:sample_N_1}
        \STATE $\widehat{X}_t\gets$ updated sample mean after taking $t$-th sample from $\mathcal{D}_X$
        \STATE $\conf\gets$ updated confidence interval \label{alg: update confidence interval}
 %\STATE Sample $\Delta f_t(S,s)$
 %\STATE Calculate $\widehat{\Delta f_t}(S,s)=\frac{\sum_{i=1}^t\Delta f_i(S,s)}{t}$
        \IF{$\widehat{X}_t-\conf \geq w-\epsilon$} 
        \label{line: comparison to thres 1}
            \STATE \textbf{return true}
        \ELSIF{$\widehat{X}_t+\conf \leq w+\epsilon$} 
        \label{line: comparison to thres 2}
            \STATE \textbf{return false}
        \ENDIF
 \ENDFOR
 \IF{$\widehat{X}_{N_1}\geq w$}
 \STATE \textbf{return true}
 \ELSE
 \STATE \textbf{return false}
 \ENDIF
\end{algorithmic}
\end{algorithm}


We now state our first main result for \samp in Theorem \ref{thm:sampling} below. The second item of Theorem \ref{thm:sampling} states that with high probability, \samp will correctly return the answer to whether $\mE X$ is approximately above or below the input threshold $w$. The first item states that, in the worst case, \samp takes $O(R^2\log(1/\delta)/\epsilon^2)$ samples from $\mathcal{D}_X$ to return true or false no matter what the value of $\mE X$ is. However, the further the value of $\mE X$ is from $w$, as reflected by the gap function $\phi_X$, the fewer samples \samp needs to make a decision. Figure \ref{fig:sample_complexity} illustrates how the sample complexity in Theorem \ref{thm:sampling} changes as $\phi_X$ increases. 


% \textcolor{blue}{}



The details of the proof of Theorem \ref{thm:sampling} can be found in Section \ref{appdx:proof_of_samp} of the supplementary material.
%and applying the Hoeffding's inequality, the sample mean $\widehat{X}$ is an $\epsilon$-approximation of the true mean value $\mE X$ and the property is satisfied.
% \textcolor{red}{TODO: Define R below.}
\begin{theorem}
    \label{thm:sampling}
   For any random variable $X$ that is $R$-sub-Gaussian, if we define $N_1=\frac{2R^2}{\epsilon^2}\log \frac{4}{\delta}$ and $\conf =R\sqrt{\frac{2}{t}\log \frac{8 t^2}{\delta}}$, then with probability at least $1-\delta$ the algorithm \samplong satisfies the following:
    \begin{enumerate}[noitemsep]
        \item \samp on input $(w,\epsilon,\delta,\mathcal{D}_X,R)$ takes at most
    \begin{align*}
    % \label{eq:sam_complxt}
        \min\left\{\frac{2R^2}{\epsilon^2}\log \left(\frac{4}{\delta}\right),\frac{8R^2}{\phi_X^2}\log\left(\frac{16R^2}{\phi_X^2}\sqrt{\frac{2}{\delta}}\right)\right\}
    \end{align*}
    noisy samples, where $\phi_X = \frac{\epsilon + |w-\mathbb{E} X|}{2}$.
    \item If \samp returns true, then $\mE X\geq w-\epsilon$. If \samp returns false, then $\mE X\leq w+\epsilon$.
    \end{enumerate}
   % Besides, it holds with probability $1$ that 
    % where $\epsilon=3\epsilon$.    
\end{theorem}

{Here we explain the sample complexity result in the first item of Theorem \ref{thm:sampling}. The term on the left-hand side, $\frac{2R^2}{\epsilon^2}\log \left(\frac{4}{\delta}\right)$, is the number of samples $N_1$ required for the sample average to approximate $\mE X$ within distance $\epsilon$ with high probability, i.e., $|\widehat{X}_{N_1}-\mE X|\leq\epsilon$. This corresponds to case (d) in Figure \ref{fig:samp}, and is the number of samples that the fixed $\epsilon$-approximation would take. Such a large number of samples is only necessary when $\mE X$ is close to the threshold. }

{The value on the right-hand side comes from the adaptive sampling, and it is the number of samples required to shrink the confidence interval just enough so that we can conclude whether $\mathbb{E}X$ is approximately above or below the threshold, and it depends on how far $\mathbb{E}X$ is from the threshold. This value cannot be computed before we start sampling, and is a result of adaptive sampling where we do not know how many samples we will take initially. This corresponds to cases (a) and (b) in Figure \ref{fig:samp}.}


\begin{figure}[t!]
%\hspace{-0.5em}
\centering
\includegraphics[width=0.5\columnwidth]{figures/sample_complexity}
\caption{A plot to illustrate how the number of samples taken by \samp (num) changes with the gap function $\phi_X$ (see Theorem \ref{thm:sampling}). There exists some $x_0$ such that when $0<\phi_X\leq x_0$, the required number of samples is $\frac{2R^2}{\epsilon^2}\log \frac{4}{\delta}$ (the left side in the sample complexity result in Theorem \ref{thm:sampling}).
%which is consistent with the result of fixing confidence interval to $\epsilon$.
When $\phi_X>x_0$, the right-hand side in Theorem \ref{thm:sampling} is the minimum and the sample complexity of the algorithm decreases fast as $\phi_X$ increases. 
% {\color{blue}TODO: Make more long horizontally and thin vertically because it takes up a lot of space.}
}
        \label{fig:sample_complexity}
\end{figure}

%The proof of Theorem \ref{thm:sampling} is provided . \textcolor{red}{TODO: Better description about how the confidence intervals comes from the and of all the events?}. At the $t$-th of \samp, given a probability, Hoeffding's Inequality implies a distance that $\mE X$ is from the sample average $X_t$. $C_t$ is derived using this, but importantly the probability is set to be sufficiently small such that even with the union bound over all iterations of \samp, the confidence interval holds with high probability. Once the confidence region crosses the threshold of $w+\epsilon$ or $w-\epsilon$, \samp can then exit. Clearly, if $\mE X$ is further from $\tau$ then fewer samples are needed. On the other hand, we know that the confidence region must shrink to a sufficiently small diameter (less than $2\epsilon$) by another application of Hoeffding's Inequality in  $O(R^2\log(1/\delta)/\epsilon^2)$ number of samples.
%since $X$ is assumed to be $R$- sub-Gaussian, by taking $O(R^2\log(1/\delta)/\epsilon^2)$ number of samples from $\mathcal{D}_X$ and applying the Hoeffding's inequality, the sample mean $\widehat{X}$ is an $\epsilon$-approximation of the true mean value $\mE X$ and the property is satisfied. More specifically, the sample mean satisfies that $|\widehat{X}-\mathbb{E} X|\leq\epsilon$ with probability at least $1-\delta$. If $\widehat{X}\geq w$, the algorithm returns true. Otherwise, the algorithm returns false. Then the specified property is satisfied. However, we observe that when the difference between the mean of $X$ and the threshold $w$ is substantial, then the number of samples can be reduced significantly by adaptively adjusting the confidence interval and determining when to terminate the sampling process.
%{\color{blue}The output of the algorithm satisfies the following property: with a small probability of failure, \samp repeatedly samples from the distribution $\mathcal{D}_X$, and if \samp returns true then $\mE X\geq w-\epsilon$, and if \samp returns false then $\mE X\leq w+\epsilon$. Notice that  Pseudocode for \samp is given in Algorithm \ref{alg:samp}.}
%The \samp algorithm for the second theorem also requires an input error parameter, denoted as $\alpha$. 
%In Theorem \ref{thm:sampling}, the outcome regarding the disparity between the threshold $w$ and the expectation of $X$ is characterized by an $\epsilon$-additive approximation error. Nonetheless, in several threshold greedy-based algorithms, as demonstrated in the subsequent sections, we don't need a tight additive error bound. In the following theorem, we introduce another result. By employing distinct definitions of $C_t$ and $N_1$, we establish the following result:

Our second result, Theorem \ref{thm:sampling2}, is related to Theorem \ref{thm:sampling}, but instead of a purely additive approximation error (i.e., $\mE X\geq w-\epsilon$ or $\mE X\leq w+\epsilon$), the error is a combination of multiplicative and additive terms. 
% This alternative result is useful in some algorithms such as that considered in Section \ref{appdx:mono2} in the appendix and Section \ref{sec:matroid}. 
The intuition behind using this result is that in many submodular maximization algorithms that rely on a thresholding procedure, the threshold decreases exponentially, so a multiplicative error can be tolerated. Moreover, in the case where $R$ can be as large as $n$, the result in Theorem \ref{thm:sampling2} can be more sample efficient.
In order to get Theorem \ref{thm:sampling2}, a different definition of the confidence radius $C_t$ as well as the maximum number of samples $N_1$ is needed. Theorem \ref{thm:sampling2} is proven in the supplementary material in Section \ref{appdx:proof_of_samp2}.
% \vspace{-3mm}
\begin{theorem}
\label{thm:sampling2}
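    % NOTE(review): $8R^2$ was changed to $8t^2$ inside the logarithm of $C_t$ so that it matches the form of $C_t$ in Theorem \ref{thm:sampling} and the $\sqrt{8/\delta}$ factor in the sample bound below; confirm against the proof in the appendix.
    For any random variable $X$ that is bounded in the range $[0,R]$, if we define $C_t=\frac{3R}{t\alpha}\log\left(\frac{8t^2}{\delta}\right)$ and $N_1=\frac{3R}{\alpha\epsilon}\log\left(\frac{4}{\delta}\right)$, where $\alpha$ is an additional parameter that controls the multiplicative error rate, then with probability at least $1-\delta$ the algorithm \samplong satisfies the following: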
    \vspace{-2mm}
    \begin{enumerate}[noitemsep]
        \item \samp on input $(w,\epsilon,\delta,\mathcal{D}_X,R)$ takes at most
    \begin{align*}
    % \label{eq:sam_complxt}
        \min\left\{\frac{3R}{\epsilon\alpha}\log \left(\frac{4}{\delta}\right),\frac{12R}{\alpha\phi_X'}\log\left(\frac{12R}{\alpha\phi_X'}\sqrt{\frac{8}{\delta}}\right)\right\}
    \end{align*}
    noisy samples, where $\phi_X' = \frac{\epsilon -\alpha\mE X+| w-\mathbb{E} X|}{2}$.
    \item If the output is true, then $(1+\alpha)\mE X\geq w-\epsilon$. If the output is false, then $(1-\alpha)\mE X\leq w+\epsilon$.
    \end{enumerate}
\end{theorem}
%If we look at the sample complexity of \samp in the first result in both Theorem \ref{thm:sampling} and Theorem \ref{thm:sampling2}, the term on the left is the worst-case sample complexity that occurs if we need to take $N_1$ number of samples to get an estimation of $X$ that is close enough to $\mathbb{E}X$. However, if the mean value of random variable $X$ is relatively far from the threshold $w$ (which is reflected in the value of $\phi$), then many fewer samples are needed. 
%Since the proof of Theorem \ref{thm:sampling2} is similar to that of Theorem \ref{thm:sampling}, we defer the analysis of Theorem \ref{thm:sampling2} to the appendix. In the next section, we prove the result in Theorem \ref{thm:sampling}.



% ***********now we move proof to appendix
% \subsection{Proof Sketch of Theorem \ref{thm:sampling}}
% \label{sec:proof_of_samp}

% The details of the proof of Theorem \ref{thm:sampling} can be found in Section \ref{appdx:proof_of_samp} of the supplementary material, but here we provide an overview of the proof. In order for \samp to correctly determine whether $\mE X$ is approximately above or below the threshold $w$, i.e. Items (ii) of Theorem \ref{thm:sampling}, two random events must occur during \samp. The first event is that at all iterations during the for loop, the confidence regions around the sample means ($\hat{X}_t$) contain the true expected value ($\mE X$). The second event is that after $N_1$ samples taken by the for loop on Line \ref{line:sample_N_1}, we have achieved an $\epsilon$-additive approximation of the expected value. Basically these two events together mean that \samp is correct about the region where $\mE X$ is throughout the algorithm, and therefore it returns the correct answer to whether $\mE X$ is approximately above or below the threshold $w$. The following Lemma states that on a run of \samp, the two events hold with probability at least $1-\delta$.

% \begin{lemma}
% \label{lem:clean_event}
%     With probability at least $1-\delta$, the following two events hold.
%     \begin{enumerate}       
%     \item At any time $t\in\mathbb{N}_+$, the sample mean $\widehat{X}_t$ satisfies that
%     $|\widehat{X}_t-\mE X|\leq \conf$,
%     where $\conf:=R\sqrt{\frac{2}{t}\log \frac{8 t^2}{\delta}}$.
%     \item The sample mean $\widehat{X}_{N_1}$ at time $N_1:=\frac{2R^2}{\epsilon^2}\log \frac{4}{\delta}$ satisfies that $|\widehat{X}_{N_1}-\mE X|\leq \epsilon  $.
%     \end{enumerate}
% \end{lemma}




% The second lemma required for establishing Theorem \ref{thm:sampling} concerns the number of samples that \samp takes before its approximation of $\mE X$ is sufficiently accurate so that it can terminate. The number of samples depends on how far away the true value of $f$ is from the threshold. In particular, Lemma \ref{lem:conf_int} below states that once the confidence interval goes beneath the corresponding $\phi$ value (as defined in Theorem \ref{thm:sampling}), then \samp will complete.
% %But even in the worst case where the distance $\phi$ is small, \samp will get an $\epsilon$-approximation to $\mE X$ after $N_1$ samples.
% Lemma \ref{lem:conf_int} is stated below, and its proof can be found in Section \ref{appdx:proof_of_samp} of the supplementary material.
% \textcolor{red}{TODO: Probably need to change the way this is stated, the probabilities with the two lemmas feel a little redundant, maybe there is a better way to state this second one. Maybe "Once $C_t$ satisfies blah blah, then \samp will terminate, and this occurs in at most blah blah many samples".}
% % first prove Lemma \ref{lem:clean_event}, which essentially states that \samp correctly identifies elements to be added or not added to the solution throughout \alg. Next, we
% \begin{lemma}
% \label{lem:conf_int}
%     With probability at least $1-\delta$, when the confidence interval $\conf$ satisfies that
%     \begin{align*}
%         \conf \leq \phi_X,
%     \end{align*}
%     the sampling of $ X$ finishes, where $\phi_X = \frac{\epsilon + |w-\mE X|}{2}$.
    
% \end{lemma}
% The proof of this lemma is deferred to the supplementary material.

% Now we present the proof of Theorem \ref{thm:sampling}. 
% \begin{proof}
%     We first prove the result on sample complexity, which is the first result in Theorem \ref{thm:sampling}. From Lemma \ref{lem:conf_int}, we have if
%     \begin{align}
%     \label{ineq:confidence_intv}
%      \conf \leq \phi_X,
%     \end{align}
%     then the Algorithm \ref{alg:samp} finishes.
%     Since $\conf =R\sqrt{\frac{2}{t}\log \frac{8 t^2}{\delta}}$, we have the above inequality (\ref{ineq:confidence_intv}) is equivalent to that
%     \begin{align*}
%      \frac{4\log (\sqrt{\frac{8}{\delta}}t)}{t}\leq \frac{\phi^2_X}{R^2}.
%     \end{align*}
%     Since $\sqrt{\frac{8}{\delta}}t\geq 2$, from Lemma \ref{lem:logx_over_x}, we have when
%     \begin{align*}
%         t\geq\frac{8R^2}{\phi^2_X}\log(\frac{16R^2}{\phi^2_X}\sqrt{\frac{2}{\delta}}),
%     \end{align*}
%     the above inequality holds and the Algorithm \ref{alg:samp} ends. Therefore, the number of samples required is bounded by $\min\{\frac{8R^2}{\phi^2_X}(\log\frac{16R^2}{\phi^2_X}\sqrt{\frac{2}{\delta}}),N_1\}$. 
%     % We conclude the proof by applying the above results on the number of samples for each $s$ and $S_{i,s}$.

%     Next, we prove the second result in Theorem \ref{thm:sampling}. If $t=N_1$ when \samp ends, then conditioned on the events in Lemma \ref{lem:clean_event}, $|\widehat{X}_{N_1}-\mE X|\leq\epsilon$. Thus 
%   if the algorithm returns true, $\mE X\geq \widehat{X}_t - \epsilon\geq w-\epsilon$. If the output of the algorithm is false, then $\widehat{X}_t \leq w$. Similarly we have that $\mE X\leq \widehat{X}_t + \epsilon\leq w+\epsilon$. Secondly, let us consider the case where $t<N_1$ when the algorithm \samp ends. Conditioned on the second event in Lemma \ref{lem:clean_event}, we have if the algorithm \samp returns true, $\mE X\geq\widehat{X}_t-\conf\geq w-\epsilon$. If the output is false, $\mE X\leq\widehat{X}_t+\conf\leq w+\epsilon$.
% \end{proof}




