\input{sections/nonmonotone}



Now we start to prove the results in Theorem \ref{thm:nonmono}. Notice that conditioned on the solution set $A_{i-1}$ and $B_{i-1}$, the random variables $\widetilde{\Delta f}(A_{i-1}, u_i)$ and $\widetilde{\Delta f}(B_{i-1}/\{u_i\},u_i)$ are $R$-sub-Gaussian. Therefore, $X_i:=\widetilde{\Delta f}(A_{i-1}, u_i)+\widetilde{\Delta f}(B_{i-1}/\{u_i\},u_i)$ is $\sqrt{2}R$-sub-Gaussian, the second result is implied by applying Theorem \ref{thm:sampling} immediately. To prove the first result in Theorem \ref{thm:nonmono}, we need the following lemma.
\begin{lemma}
\label{lem:nonmono}
With probability at least $1-\frac{\delta}{n}$, the $i$-th call of \samp satisfies the following inequality 
\begin{align}
\label{eqn:nonmono_iter}
     f(A_{i-1}&\cup OPT_{i-1})-f(A_{i}\cup OPT_{i})\leq \nonumber\\
    &[f(A_i)-f(A_{i-1})] + [f(B_i)-f(B_{i-1})]+\frac{3\epsilon}{n}.
\end{align}
where $OPT_i$ is the set of all elements from $OPT$ that arrive after the $i$-th iteration.
\end{lemma}
\begin{proof}
    From the statement of the algorithm, we know that the element $u_i$ is added to the solution if and only if the output of \samp is true. By applying the results in Theorem \ref{thm:sampling}, we have that for each fixed $i$, with probability at least $1-\delta/n$ if $u_i$ is added, then $\Delta f(A_{i-1},u_i)\geq -\Delta f(B_{i-1}/\{u_i\},u_i)-\frac{3\epsilon}{n}$. Otherwise, $\Delta f(A_{i-1},u_i)\leq -\Delta f(B_{i-1}/\{u_i\},u_i)+\frac{3\epsilon}{n}$. Let us denote the above event as $\mathcal{E}_i$, we discuss the following four cases in our analysis
    \begin{enumerate}
        \item If $u_i\in A_i$, and $u_i\in OPT$, then 
        \begin{align*}
             f(A_{i-1}\cup OPT_{i-1})-&f(A_{i}\cup OPT_{i})=0
        \end{align*}
        Notice that $u_i\in A_i$, then conditioned on $\mathcal{E}_i$, we have $\Delta f(A_{i-1},u_i)\geq -\Delta f(B_{i-1}/\{u_i\},u_i)-\frac{3\epsilon}{n}$. By submodularity, $\Delta f(B_{i-1}/\{u_i\},u_i)\leq\Delta f(A_{i-1},u_i)$. Then it follows that $\Delta f(A_{i-1}, u_i)+\frac{3\epsilon}{2n}\geq 0$. Therefore, the term on the right-hand side of $(\ref{eqn:nonmono_iter})$ satisfies 
        \begin{align*}
            [f(A_i)-f(A_{i-1})]& + [f(B_i)-f(B_{i-1})]+\frac{3\epsilon}{n}\\
            &=\Delta f(A_{i-1}, u_i)+\frac{3\epsilon}{n}\geq 0.
        \end{align*}

        
        \item If $u_i\in A_i$, and $u_i\notin OPT$, then 
        \begin{align*}
            f(A_{i-1}\cup OPT_{i-1})-&f(A_{i}\cup OPT_{i})\\
            &=-\Delta f(A_{i-1}\cup OPT_i,u_i)\\
            &\leq -\Delta f(B_{i-1}/\{u_i\},u_i),
        \end{align*}
  where the inequality is obtained by submodularity. The right-hand side in (\ref{eqn:nonmono_iter}) is 
        \begin{align*}
            [f(A_i)-f(A_{i-1})]& + [f(B_i)-f(B_{i-1})]+\frac{3\epsilon}{n} \\
            &= \Delta f(A_{i-1}, u_i)+\frac{3\epsilon}{n}.
        \end{align*}
        Since $u_i\in A_i$, conditioned on $\mathcal{E}_i$, we have $\Delta f(A_{i-1},u_i)\geq -\Delta f(B_{i-1}/\{u_i\},u_i)-\frac{3\epsilon}{n}$. Therefore, 
        \begin{align*}
            [f(A_i)-f(A_{i-1})]& + [f(B_i)-f(B_{i-1})]+\frac{3\epsilon}{n}\\
            &=\Delta f(A_{i-1}, u_i)+\frac{3\epsilon}{n}\\
            &\geq -\Delta f(B_{i-1}/\{u_i\},u_i).
        \end{align*}

        
        \item If $u_i\notin A_i$, and $u_i\notin OPT$, then 
        \begin{align*}
            f(A_{i-1}\cup OPT_{i-1})-&f(A_{i}\cup OPT_{i})=0.
        \end{align*}
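        As in the first case, conditioned on $\mathcal{E}_i$ and by submodularity, we have that $-\Delta f(B_{i-1}/\{u_i\},u_i)\geq-\frac{3\epsilon}{2n}$. Since the right-hand side of (\ref{eqn:nonmono_iter}) is $-\Delta f(B_{i-1}/\{u_i\},u_i)+\frac{3\epsilon}{n}\geq\frac{3\epsilon}{2n}\geq 0$, the inequality holds.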
        \item If $u_i\notin A_i$, and $u_i\in OPT$, then
        \begin{align*}
            &f(A_{i-1}\cup OPT_{i-1})-f(A_{i}\cup OPT_{i})\\
            &=\Delta f(A_{i-1}\cup OPT_i,u_i)\leq \Delta f(A_{i-1},u_i),
        \end{align*}
        where the inequality holds by submodularity. Conditioned on the event $\mathcal{E}_i$, it follows that $\Delta f(A_{i-1},u_i)\leq -\Delta f(B_{i-1}/\{u_i\},u_i)+\frac{3\epsilon}{n}$. Since the right-hand side is
        \begin{align*}
            [f(A_i)-f(A_{i-1})]& + [f(B_i)-f(B_{i-1})]+\frac{3\epsilon}{n}\\
            &=-\Delta f(B_{i-1}/\{u_i\},u_i)+\frac{3\epsilon}{n},
        \end{align*}
the result is proved.
    \end{enumerate}
\end{proof}

% The analysis of Lemma \ref{lem:nonmono} is deferred to the appendix. 
Now we prove Theorem \ref{thm:nonmono}. 
\begin{proof}
    Define the event 
    \begin{align*}
        \mathcal{F}_i&=\{f(A_{i-1}\cup OPT_{i-1})-f(A_{i}\cup OPT_{i})\leq \\
    &[f(A_i)-f(A_{i-1})] + [f(B_i)-f(B_{i-1})]+\frac{3\epsilon}{n}\}.
    \end{align*}
    From Lemma \ref{lem:nonmono} and by taking the union bound, it follows that 
    \begin{align*}
        P(\mathcal{F}_i,\forall i\in[n])\geq 1-\delta
    \end{align*}
    Therefore, with probability at least $1-\delta$, $\mathcal{F}_i$ holds for all $i$. Then by summing over all $i$, we would get
    \begin{align*}
     \sum_{i=1}^n[f(A_{i-1}&\cup OPT_{i-1})-f(A_{i}\cup OPT_{i})]\leq \\
    &\sum_{i=1}^n\{[f(A_i)-f(A_{i-1})] \\
    &+ [f(B_i)-f(B_{i-1})]\}+3\epsilon.
\end{align*}
    It follows that 
    \begin{align*}
     f( OPT_{0})&-f(A_{n})\leq \\
   & [f(A_n)-f(A_{0})] + [f(B_n)-f(B_{0})]+3\epsilon.
\end{align*}
Since the submodular function is nonnegative, $f(A_n)=f(B_n)$, and $OPT_0=OPT$, it follows that $f(A_n)\geq f(OPT)/3-\epsilon$.
\end{proof}



\section{Appendix for Section \ref{sec:matroid}}
\label{appdx:continuous}
In this section, we present supplementary material to Section \ref{sec:matroid}. In particular, in Section \ref{appdx:comparison_to_ACG}, we compare the result of \contialglong in Theorem \ref{thm:continuous} to the Accelerated Continuous Greedy algorithm (ACG) in \cite{badanidiyuru2014fast}. Then in Section \ref{appdx:proof_of_conti}, we provide a detailed proof of Theorem \ref{thm:continuous}. In addition, we provide the pseudocode of \contialglong in Algorithm \ref{alg:CCTG}. 

\subsection{Comparison of \contialg with Accelerated Continuous Greedy algorithm}
\label{appdx:comparison_to_ACG}
In this section, we compare the results of Theorem \ref{thm:continuous} and the Accelerated Continuous Greedy algorithm (ACG) as presented in \cite{badanidiyuru2014fast}.
\begin{enumerate}
% [noitemsep]
    \item First of all, we consider the case where we have exact access to the value oracle. In this case, we can get that $\widetilde{\Delta f}(S,s)=\Delta f(S,s)\leq \max_{s'\in U}f(s')$ for any subset $S\subseteq U$ and element $s\in U$. This implies that $R$ can be set to be $\max_{s'\in U}f(s')$. Consequently, from Theorem \ref{thm:continuous}, the output solution set of \contialg satisfies that $f(S)\geq (1-1/e-O(\epsilon))f(OPT)$, which aligns with the approximation ratio presented in \cite{badanidiyuru2014fast}. For the result on sample complexity, notice that each call of \samp takes at most $\min\{O(\frac{\kappa}{\epsilon^2}\log\frac{n}{\delta\epsilon}), O(\frac{\kappa}{\epsilon\phi_X''}\log\frac{n}{\delta\epsilon\phi_X''})\}$ number of samples, where the first result is obtained by considering the worst case sample complexity of a fixed $\epsilon$-approximation. Since there are at most $\frac{3n}{\epsilon^2}\log\frac{3\kappa}{\epsilon}$ calls of \samp during \contialg, if we only consider the worst-case sample complexity, the total required sample complexity is at most $O(\frac{\kappa n}{\epsilon^4}\log^2\frac{n}{\epsilon})$ for \contialg. This matches the result in \cite{badanidiyuru2014fast}. In this sense, we improve the sample complexity when reduced to the case of assuming an exact oracle to the marginal gains.
    
    \item On the other hand, from Theorem \ref{thm:continuous}, we can see that even if the access to $\Delta f$ is noisy, as long as the upper bound on the noisy marginal gain $R$ is less than $ f(OPT)$, the above analysis on sample complexity and approximation ratio holds. Hence, we can conclude that, compared to the scenario with an exact value oracle, access to only noisy marginal gains does not lead to additional sample complexity or a deterioration in the approximation ratio.
\end{enumerate}
\subsection{Proof of Theorem \ref{thm:continuous}}
\label{appdx:proof_of_conti}
In this section, we present the detailed proof of Theorem \ref{thm:continuous} about our algorithm \contialg.

\noindent\textbf{Theorem \ref{thm:continuous}. }\textit{\contialg makes at most $\frac{3n}{\epsilon^2}\log\frac{3\kappa}{\epsilon}$ calls of \sampnew. In addition, with probability at least $1-\delta$, the following statements hold:
\begin{itemize}
        \item The output fractional solution $\vect{x}$ achieves the approximation guarantee of $\vect{F}(\vect{x})\geq(1-e^{-1}-2\epsilon)f(OPT)-R\epsilon$.
        \item Each call of \sampnew on input  ($w$, $\frac{\epsilon R}{2\kappa}$, $\frac{\delta\epsilon}{2nh'(\epsilon)}$, $\mathcal{D}_X$, $R$) requires at most the minimum between
    \begin{align*}
    % \label{eq:sam_complxt}
      \frac{18\kappa}{\epsilon^2}\log \left(\frac{8nh'(\epsilon)}{\delta\epsilon}\right)
    \end{align*}
    and
    \begin{align*}
    % \label{eq:sam_complxt}
        \frac{36R}{\epsilon\phi''_X}\log\left(\frac{144R}{\epsilon\phi''_X}\sqrt{\frac{nh'(\epsilon)}{\delta\epsilon}}\right)
    \end{align*}
    noisy queries to the marginal gain. Here $OPT$ is an optimal solution to the MSMM problem, $\phi''_X = \frac{\frac{\epsilon R}{2\kappa} -\epsilon\mathbb{E}X  /{3}+ |w-\mathbb{E}X|}{2}$, and $h'(\epsilon)=\frac{3}{\epsilon}\log{(\frac{3\kappa}{\epsilon})}$.
    \end{itemize}
}
    \begin{proof}
    The second result on the sample complexity of calling the subroutine algorithm \sampnew can be obtained immediately by applying the second result in (\ref{lem:item_samp_continuous}) in Lemma \ref{lem:clean_event_continuous}. Here we prove the first result in the theorem. Let us denote the fractional solution at time step $t$ as $\vect{x}_t$. From Lemma \ref{lem:Continuous_decreading_threshold}, it follows that conditioned on the events in Lemma \ref{lem:clean_event_continuous}, we have
    \begin{align*}
        \vect{F}(\vect{x}_{t+1})-\vect{F}(\vect{x}_{t})&\geq\epsilon(1-\epsilon)f(OPT)\\
        &\qquad-\epsilon(1-\epsilon)\vect{F}(\vect{x}_{t+1})-\epsilon^2 R.
    \end{align*} 
    It then follows that
    \begin{align*}
        \vect{F}(\vect{x}_{t+1})&\geq\frac{\vect{F}(\vect{x}_{t})+\epsilon(1-\epsilon)f(OPT)-\epsilon^2 R}{1+\epsilon(1-\epsilon)}\\
        &\geq(1-\epsilon)\vect{F}(\vect{x}_{t})+\epsilon(1-\epsilon)^2f(OPT)-\epsilon^2 R
    \end{align*}
   Since there are $1/\epsilon$ iterations in \contialg, the output $\vect{x}$ satisfies that $\vect{x}=\vect{x}_{1/\epsilon}$. By applying induction to the above inequality, we would get
    \begin{align*}
        \vect{F}(\vect{x}_{1/\epsilon})
        &\geq(1-(1-\epsilon)^{1/\epsilon})\{(1-\epsilon)^2f(OPT)-\epsilon R\}\\
        &\geq(1-1/e)\{(1-\epsilon)^2f(OPT)-\epsilon R\}\\
        &\geq(1-1/e-2\epsilon)f(OPT)-\epsilon R.
    \end{align*}
    \end{proof}
\begin{algorithm}[t]
\caption{\contialglong(\contialg)}\label{alg:CCTG}
 \begin{algorithmic}[1]
 \STATE \textbf{Input:} $\epsilon$, $\delta$, $\mathcal{M}\in 2^U$
 \STATE  $\vect{x}\gets \vect{0}$
 \FORALL{$s\in U$ and $s\in\mathcal{M}$}
 \STATE $\hat{f}(s) \gets $ sample mean over $\frac{18\kappa}{\epsilon^2}\log\frac{4n}{\delta}$ samples from $\mathcal{D}(\emptyset,s)$
 % \STATE \label{line:BAI}  repeatedly sample $f(s)$ for $N_2:=\frac{R^2\log(6n/\delta)}{2\epsilon^2}$ number of times, define the empirical mean $\hat{f}(s)=\frac{\sum_{i=1}^{N_2}f_i(s)}{N_2}$.
 \ENDFOR
  
  \STATE $d:=\max_{s\in \mathcal{M}}\hat{f}(s)$, 
 % using BAI to approximate $\max_{s\in S}f(s)$, i.e., $(1+\epsilon)\max_{s\in U}f(s)\geq d\geq (1-\epsilon)\max_{s\in U}f(s)$.
 
 \FOR{$t=1$ to $1/\epsilon$}
 \STATE $B\gets$\contisublong($\vect{x}$, $\epsilon$, $\delta$, $d$, $\mathcal{M}$)
 \STATE $\vect{x}\gets \vect{x}+\epsilon\cdot\vect{1}_{B}$
 \ENDFOR
 \STATE \textbf{return} $\vect{x}$
 \end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
\caption{\contisublong(\contisub)}\label{alg:CCTG_subroutine}
 \begin{algorithmic}[1]
 \STATE \textbf{Input:} $\vect{x}$, $\epsilon$, $\delta$, $d$, $\mathcal{M}\in 2^U$
 \STATE $w\gets d$, $B\gets \emptyset$
 \WHILE{$w>\frac{\epsilon d}{3\kappa}$}
 \FORALL{$u\in U$} 
\IF{$B\cup\{u\}\in\mathcal{M}$}
\STATE $X=\widetilde{\Delta f}(S(\vect{x}+\epsilon\vect{1}_B),u)$
 \STATE thre = \sampnewlong($w$, $\frac{R\epsilon}{2\kappa}$, $\frac{\delta\epsilon}{2nh'(\epsilon)}$, $\mathcal{D}_X$, $R$)
 \IF{thre}
 \STATE $B\gets B\cup \{u\}$
 \ENDIF
  \ENDIF
 \ENDFOR

 \STATE $w=w(1-\epsilon/3)$
 \ENDWHILE
 \STATE \textbf{return} $B$ \end{algorithmic}
\end{algorithm}
\begin{lemma}
\label{lem:clean_event_continuous}
    With probability at least $1-\delta$, the following two events hold.
    \begin{enumerate}
     \item $(1-\epsilon/3)\max_{s\in U}f(s)-\frac{R\epsilon}{2\kappa}\leq d\leq(1+\epsilon/3)\max_{s\in U}f(s)+\frac{R\epsilon}{2\kappa}$.
         \item During each call of \sampnew on the input ($w$, $\frac{\epsilon R}{2\kappa}$, $\frac{\delta\epsilon}{2nh'(\epsilon)}$, $\mathcal{D}_X$, $R$, $\epsilon/3$
        ) with the evaluated random variable being $X=\widetilde{\Delta f}(S(\vect{x}+\epsilon\vect{1}_B),u)$ where $\vect{x}$ is the fractional solution, $B$ is the set of coordinates and $u$ is an element in $U$, the results in Theorem \ref{thm:sampling2} hold. I.e.,
        \begin{enumerate}
            \item\label{lem:item_samp_continuous} \sampnew takes at most the minimum between 
            \begin{align*}
    % \label{eq:sam_complxt}
      \frac{18\kappa}{\epsilon^2}\log \left(\frac{8nh'(\epsilon)}{\delta\epsilon}\right)
    \end{align*}
    and
    \begin{align*}
    % \label{eq:sam_complxt}
        \frac{36R}{\epsilon\phi''_X}\log\left(\frac{144R}{\epsilon\phi''_X}\sqrt{\frac{nh'(\epsilon)}{\delta\epsilon}}\right).
    \end{align*}
\item\label{lem:item_approx_continuous} If the output is true, then $$(1+\epsilon/3)\mE \widetilde{\Delta f}(S(\vect{x}+\epsilon\vect{1}_B),u)\geq w-\frac{\epsilon R}{2\kappa}.$$ 
         If the output is false, then $$(1-\epsilon/3)\mE \widetilde{\Delta f}(S(\vect{x}+\epsilon\vect{1}_B),u)\leq w+\frac{\epsilon R}{2\kappa}.$$
        \end{enumerate}
        
        
        
        
     \end{enumerate}
\end{lemma}
 \begin{proof}
     First of all, by applying the inequality in Lemma \ref{lem:chernoff}, we have that for each fixed $s\in U$, after taking $N_4=\frac{18\kappa}{\epsilon^2}\log\frac{4n}{\delta}$ number of samples, it follows that
     \begin{align*}
         P\big(|\hat{f}_{N_4}(s)-f(s)|\geq\frac{\epsilon}{3}f(s)+\frac{R\epsilon}{2\kappa}\big)\leq\frac{\delta}{2n}.
     \end{align*}
     Taking a union bound over all elements in $U$, it follows that 
      \begin{align*}
          P\big(\exists s\in U:|\hat{f}_{N_4}(s)-f(s)|\geq\frac{\epsilon}{3}f(s)+\frac{R\epsilon}{2\kappa}\big)\leq\frac{\delta}{2}.
      \end{align*}
      Following the similar idea as in the proof of the Lemma \ref{lem:clean_event_samp2}, we can prove the first result. 
      
      Now we start to prove the second result. For each fixed call of \sampnew  with input ($w$, $\frac{\epsilon R}{2\kappa}$, $\frac{\delta\epsilon}{2nh'(\epsilon)}$, $\mathcal{D}_X$, $R$, $\epsilon/3$
        ), by applying the results in Theorem \ref{thm:sampling2}, we have that with probability at least $1-\frac{\delta\epsilon}{2nh'(\epsilon)}$, both the statements about the sample complexity in (\ref{lem:item_samp_continuous}) and approximation guarantee in (\ref{lem:item_approx_continuous}) in the lemma holds. Since there are $1/\epsilon$ calls of the \contisublong and each \contisublong makes at most $nh'(\epsilon)$ calls of the \sampnew algorithm, there are at most $nh'(\epsilon)/\epsilon$ calls of the \sampnew algorithm. By taking the union bound, we can prove that with probability at least $1-\delta/2$, the second results hold. By taking the union bound again, we can see that with probability at least $1-\delta$, all of the results in the lemma hold.
 \end{proof}  
 \begin{lemma}
     \label{lem:Continuous_decreading_threshold}
     Conditioned on the two events defined in Lemma \ref{lem:clean_event_continuous}, we have that during each implementation of \contisublong, the output coordinate set $B$ satisfies that
     \begin{align*}
         \vect{F}(\vect{x}+\epsilon\vect{1}_B)-\vect{F}(\vect{x})&\geq\epsilon(1-\epsilon)\{f(OPT)-\vect{F}(\vect{x}+\epsilon\vect{1}_B)\}\\
         &\qquad-\epsilon^2R.
     \end{align*}
 \end{lemma}
 \begin{proof}
     Here we denote the output solution set as $B=\{b_1,b_2,...,b_{\kappa}\}$ where $b_i$ is the $i$-th element that is added to set $B$. Here if $|B|<\kappa$, then for any $i>|B|$, $b_i$ is defined as a dummy variable. Since $\mathcal{M}$ is a matroid, there exists a permutation of the optimal solution $OPT=\{o_1,o_2,...,o_\kappa\}$ such that $B_{i-1}\cup \{o_i\}\in\mathcal{M}$ for each $i\in[\kappa]$. For notation simplicity, we also define $G(\vect{x},u)=\mE\widetilde{\Delta f}(S(\vect{x}),u) $. First of all, we prove the following claim: for each $i\in[\kappa]$, we have that
     \begin{align*}
         G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)\geq(1-\epsilon)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)-\frac{\epsilon R}{\kappa}
     \end{align*}
     The proof is as follows: if the element $b_i$ is added at the first iteration, then from Lemma \ref{lem:clean_event_continuous}, we have that 
$(1+\epsilon/3)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)\geq w-\frac{\epsilon R}{2\kappa}$. Since the threshold at the first iteration is $w=d$, and $d\geq (1-\epsilon/3)\max_{s\in U}f(s)-\frac{R\epsilon}{2\kappa}$ according to the first result in Lemma \ref{lem:clean_event_continuous}, then
\begin{align*}
    (1+\epsilon/3)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)\geq (1-\epsilon/3)\max_{s\in U}f(s)-\frac{\epsilon R}{\kappa}.
\end{align*}
Since $\max_{s\in U}f(s)\geq\max_{o\in OPT}f(o)\geq G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i) $, $\forall i\in[\kappa]$, it then follows that 
\begin{align*}
    G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)\geq (1-\epsilon)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)-\frac{\epsilon R}{\kappa}.
\end{align*}
If $b_i$ is not a dummy variable and is not added in the first iteration, we can see that $(1+\epsilon/3)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)\geq w-\frac{R\epsilon}{2\kappa}$. Since the element $o_i$ is not added to $B$, it was in particular not added in the previous iteration of the while loop, where the threshold was $w/(1-\epsilon/3)$. By the construction of $OPT$, we have that $B_{i-1}\cup\{o_i\}\in\mathcal{M}$. Therefore, 
\begin{align*}
(1-\epsilon/3)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)\leq \frac{w}{1-\epsilon/3}+\frac{R\epsilon}{2\kappa}.
\end{align*}
Then
\begin{align*}
    G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)&
    \geq \frac{(1-\epsilon/3)^2G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)}{1+\epsilon/3}\\
    &\qquad-\frac{(1-\epsilon/3)\epsilon R}{2(1+\epsilon/3)\kappa}-\frac{R\epsilon}{2(1+\epsilon/3)\kappa}\\
    &\geq(1-\epsilon)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)-\frac{\epsilon R}{\kappa}.
\end{align*}
Next, we consider the case where $b_i$ is a dummy variable. In this case $ G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)=0$. Since $o_i$ is not added,
\begin{align*}
    (1-\epsilon/3)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)\leq \frac{\epsilon d}{3\kappa}+\frac{R\epsilon}{2\kappa}.
\end{align*}
By the first result in Lemma \ref{lem:clean_event_continuous}, we have $d\leq(1+\epsilon/3)\max_{s\in U}f(s)+\frac{R\epsilon}{2\kappa}\leq(1+\epsilon/3)R+\frac{R\epsilon}{2\kappa}$. Notice that when $\epsilon>0.5$, the approximation guarantee in Theorem \ref{thm:continuous} is trivial. Therefore, here we can assume $\epsilon\leq0.5$, which implies that $d\leq 3R/2$. Then we have that
\begin{align*}
    (1-\epsilon/3)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)\leq \epsilon R/\kappa.
\end{align*}
Therefore,
\begin{align*}
    G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i)&=0\\
    &\geq(1-\epsilon/3)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)-\epsilon R/\kappa.
\end{align*}
With this claim, we can prove the results of the lemma.
\begin{align*}
    \vect{F}(\vect{x}+\epsilon\vect{1}_B)-\vect{F}(\vect{x})&=\sum_{i=1}^\kappa\vect{F}(\vect{x}+\epsilon\vect{1}_{B_i})-\vect{F}(\vect{x}+\epsilon\vect{1}_{B_{i-1}})\\
    &=\sum_{i=1}^\kappa \epsilon\cdot\frac{\partial \vect{F}}{\partial b_i}\big|_{x=\vect{x}+\epsilon\vect{1}_{B_{i-1}}}\\
    &\geq \epsilon\sum_{i=1}^\kappa\mE \Delta f(S(\vect{x}+\epsilon\vect{1}_{B_{i-1}}),b_i)\\
    &= \epsilon\sum_{i=1}^\kappa G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},b_i).
\end{align*}
Here the last equality comes from the fact that $\mathbb{E}\Delta f(S(\vect{x}),u)=\mathbb{E}\widetilde{\Delta f}(S(\vect{x}),u)$.
By the claim, it follows that
\begin{align*}
    \vect{F}(\vect{x}+\epsilon\vect{1}_B)-\vect{F}(\vect{x})
    &\geq\epsilon\sum_{i=1}^\kappa (1-\epsilon)G(\vect{x}+\epsilon\vect{1}_{B_{i-1}},o_i)-\epsilon^2R\\
    &=\epsilon(1-\epsilon)\sum_{i=1}^\kappa \mE \Delta f(S(\vect{x}+\epsilon\vect{1}_{B_{i-1}}),o_i)\\
    &\qquad-\epsilon^2R\\
    &\geq\epsilon(1-\epsilon)\sum_{i=1}^\kappa \mE \Delta f(S(\vect{x}+\epsilon\vect{1}_{B}),o_i)\\
    &\qquad-\epsilon^2R\\
    &\geq\epsilon(1-\epsilon)\{f(OPT)-\vect{F}(\vect{x}+\epsilon\vect{1}_B)\}\\
    &\qquad-\epsilon^2R.
\end{align*}
Here the second and third inequality are due to submodularity and monotonicity.
 \end{proof}
 
% \begin{lemma}
% \label{lem:iterative_continuous}
%  Conditioned on the two events defined in Lemma \ref{lem:clean_event_continuous}, we have that during each implementation of \contisublong, we have that
%  \begin{align*}
     
%  \end{align*}
% \end{lemma}

\section{Technical Lemmas}

\begin{lemma}[Hoeffding's Inequality]
    \label{hoeffding}
    Let $X_1,...,X_N$ be independent random variables such that $X_i$ is $R$-sub-Gaussian and $\mathbb{E}[X_i]=\mu$ for all $i$. Let $\overline{X}=\frac{1}{N}\sum_{i=1}^NX_i$. Then for any $t>0$,
    \begin{align*}
        P(|\overline{X}-\mu|\geq t)\leq 2\exp\{-\frac{Nt^2}{2R^2}\}.
    \end{align*}
\end{lemma}
\begin{lemma}[Relative $+$ Additive Chernoff Bound (Lemma 2.3 in \cite{badanidiyuru2014fast})]
\label{lem:chernoff}
    Let $X_1,...,X_N$ be independent random variables such that for each $i$, $X_i\in[0,R]$ and $\mathbb{E}[X_i]=\mu$ for all $i$. Let $\widehat{X}_N=\frac{1}{N}\sum_{i=1}^NX_i$. Then 
    \begin{align*}
        P(|\widehat{X}_N-\mu|> \alpha\mu+\epsilon)\leq 2\exp\{-\frac{N\alpha\epsilon}{3R}\}.
    \end{align*}
\end{lemma}










% \noindent\textbf{Lemma \ref{lem:clean_event_fixed_size}}\textit{
%     With probability at least $1-\delta/3$, we have that for the solution set $S$ at any iteration of Algorithm \ref{alg:ATG} and any element $s\in U$ 
%     \begin{align*}
%         |\widehat{\Delta f_{N_1'}(S,s)}-\Delta f(S,s)|\leq \epsilon \qquad \forall t\in\mathbb{N}_+,
%     \end{align*}
%     where $N_1'=\frac{R^2\log \frac{6nh(\alpha)}{\delta}}{2\epsilon^2}$ and $h(\alpha)=\frac{\log{\kappa/\alpha}}{\alpha}$.
% }
% \begin{proof}
%     First, by applying the Hoeffding's inequality, we have that for fixed $S$ and $s$
%     \begin{align}
%         P(|\widehat{\Delta f_{N_1'}(S,s)}-\Delta f(S,s)|)\leq \frac{\delta}{3nh(\alpha)}.
%     \end{align}
%     From Lemma \ref{lem:clean_event}, we know there are at most $nh(\alpha)$ number of marginal gains to evaluate. Taking the union bound, we can conclude the proof.
% \end{proof}

\begin{lemma}
    Let $X_1,...,X_N$ be independent random variables such that $X_i\in[0,R]$ and $\mathbb{E}[X_i]=\mu$ for all $i$. Let $\overline{X}=\frac{1}{N}\sum_{i=1}^NX_i$. Then for any $t>0$ and $\delta>0$, if
    \begin{align*}
        N\geq \frac{R^2\ln(2/\delta)}{2t^2},
    \end{align*}
    then $P(|\overline{X}-\mu|\geq t)\leq\delta.$
\end{lemma}
\begin{proof}
    This result follows easily from Hoeffding's Inequality.
\end{proof}

\begin{lemma}
    Let $X_1,...,X_N$ be independent random variables such that $X_i\in[0,R]$ and $\mathbb{E}[X_i]=\mu$ for all $i$. Let $\overline{X}=\frac{1}{N}\sum_{i=1}^NX_i$. Then for any $\delta>0$,
    if
    \begin{align}
        c\geq R\sqrt{\frac{\ln(2/\delta)}{2N}},
    \end{align}
    it is the case that
    \begin{align*}
        P(\mu\in[\overline{X}-c,\overline{X}+c])\geq 1-\delta.
    \end{align*}
\end{lemma}
\begin{proof}
    This result follows easily from Hoeffding's Inequality.
\end{proof}

\begin{lemma}
    \label{lem:logx_over_x}
    Suppose $x\in\mathbb{R}$ and $x\geq 2$, if we have $x\geq\frac{2}{a}\log\frac{2}{a}$, then it holds that 
    \begin{align*}
        \frac{\log x}{x}\leq a
    \end{align*}
\end{lemma}
\begin{proof}
    % First, consider the case where $a\geq e^{-1}$. In this case $\frac{1}{a}\log\frac{1}{a}\leq e$. 
    % Since $x\geq 3$, the bound is trivial. 
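    Let $x_0=\frac{2}{a}\log\frac{2}{a}$, where $a>0$. If $x_0<e$, then $\frac{2}{a}<e$, i.e., $a>\frac{2}{e}$, and the claim holds since $\frac{\log x}{x}\leq\frac{1}{e}$ for all $x>0$. Otherwise $x_0\geq e$, and since $y=\frac{\log x}{x}$ is decreasing when $x\geq e$, the assumption $x\geq x_0$ implies 
    \begin{align*}
        \frac{\log x}{x}\leq \frac{\log x_0}{x_0}= \frac{a}{2} \cdot \frac{\log (\frac{2}{a}\log\frac{2}{a})}{\log \frac{2}{a}}\leq a,
    \end{align*}
    where the last inequality uses $\log\frac{2}{a}\leq\frac{2}{a}$.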
    % Since the inequality contradicts with the fact that $\frac{\log x}{x}\geq a$, the lemma is proven.
\end{proof}




