%=============================================================================
\section{Deferred proofs: Smoothing preliminaries}\label{sec:prelims}

Recall the definitions of our smoothing operators.

Let $B_{\eta}(x)$ be the ball of radius $\eta$ around $x$. For any function $f:\R^n \rightarrow \R$ and real-valued $\eta > 0$, define the function $S_{\eta}[f]:\R^n \rightarrow \R$ as $S_{\eta}[f](x) = \E_{y \in B_{\eta}(x)}[f(y)]$.

\smoothing*

We claimed the following about $\S$.

\smoothingproperties*

While most of the above statements are proven below from first principles, a couple of them follow from~\cite[Corollary~2.4]{agarwal2018lower}. For these statements, we only detail the main ideas involved in their proofs.
\begin{proof}
    We prove the above statements in order.
    \begin{enumerate}
        \item This is a simple consequence of the linearity of expectation.
        \item By expanding the expectations in the definition of $\S$, we get that $\S[f](x) = \E_{y \sim \mu_x}[f(y)]$ where $\mu_x$ is a distribution supported in $B_{(1-2^{-p})\beta}(x)$.
        \item The gradient and higher order derivatives of $\S[f]$ at $x$ depend only on the values of $\S[f]$ in an open ball around $x$, say $B_{2^{-p}\beta}(x)$. For any $y \in B_{2^{-p}\beta}(x)$, $\S[f](y)$ depends only on values of $f$ in $B_{(1-2^{-p})\beta}(y) \subseteq B_{\beta}(x)$.
        \item This follows from the proof of~\cite[Corollary~2.4]{agarwal2018lower}. It is easy to see that if $f$ is $L$-Lipschitz then $S_{\eta}[f]$ is also $L$-Lipschitz. $\nabla f$ being $L$-Lipschitz is equivalent to saying that for any unit vector $v \in \R^n$, $g_v(x) \defeq \nabla f(x) [v]$ is an $L$-Lipschitz function. However by linearity of $S_{\eta}$, $S_{\eta}[g_v](x) = \nabla S_{\eta}[f](x) [v]$. Hence $\nabla S_{\eta}[f]$ is $L$-Lipschitz. A repeated usage of this argument as done in~\cite{agarwal2018lower} proves the statement.
        \item The proof of this is via a repeated usage of~\cite[Lemma~2.3]{agarwal2018lower} as done in~\cite[Corollary~2.4]{agarwal2018lower}. They argue via Stokes' theorem that $S_{\eta}[f]$ is differentiable even when $f$ may not be differentiable in a set of measure $0$, and that $\nabla S_{\eta}[f]$ is $\frac{n}{\eta}G$-Lipschitz. They then use directional derivatives to inductively show via the same argument the Lipschitzness of the higher-order derivatives.
        \item This is a simple consequence of the fact that $\S[f](x)$ is a convex combination of the values $f(y)$ for $y \in B_{\beta}(x)$.
        \item For $y \in B_{\beta}(\vec{0})$, let $f_y$ be defined as $f_y(x) = f(x+y)$. Since $f$ is convex, it follows that each $f_y$ is convex. Also $\S[f] = \E_{y \in B_{\beta}(\vec{0})}[f_y]$. Since $\S[f]$ is a convex combination of convex functions, it follows that $\S[f]$ is convex.\qedhere
    \end{enumerate}
\end{proof}

Recall the definition of the softmax function.

\softmax*

We claimed the following about the softmax function.

\softmaxproperties*

\begin{proof}
    We prove the statements in order.
    \begin{enumerate}
        \item This is straightforward.
        \item We can conclude the $1$-Lipschitzness by looking at the norm of the gradients.
        \begin{equation}
            \| \nabla \smax_{\rho}(x) \| = \frac{\| (\exp(x_1/\rho),\dots,\exp(x_n/\rho)) \|}{\sum_i \exp(x_i/\rho)} \leq \frac{\sum_i \abs{\exp(x_i/\rho)}}{\sum_i \exp(x_i/\rho)} = 1.
        \end{equation}
        The convexity follows from analyzing the Hessian. We get the following as the Hessian.
        \begin{equation}
            \nabla^2 \smax_{\rho}(x)_{i,j} =
            \begin{dcases}
                \frac{1}{\rho} \left( \frac{\exp(x_i/\rho)}{\sum_{t \in [n]}\exp(x_t/\rho)} - \frac{\exp(2x_i/\rho)}{(\sum_{t \in [n]}\exp(x_t/\rho))^2} \right) &\text{ if $i=j$}\\
                \frac{1}{\rho} \left( - \frac{\exp((x_i + x_j)/\rho)}{(\sum_{t \in [n]}\exp(x_t/\rho))^2} \right) &\text{ otherwise}
            \end{dcases}
        \end{equation}
        Let $v \in \R^n$ denote the column vector $\nabla \smax_{\rho}(x)$. Then $\nabla^2 \smax_{\rho}(x) = \frac{1}{\rho} (\mathrm{diag}(v) - vv^{\mathsf{T}})$. The convexity of $\smax_{\rho}$ is equivalent to $\nabla^2 \smax_{\rho}(x)$ being positive semidefinite for all $x$. Since $\rho>0$, it suffices to prove that $M = \rho \nabla^2 \smax_{\rho}(x)$ is positive semidefinite. To this end, let $y \in \R^n$ be any column vector.
        \begin{align}
            y^{\mathsf{T}} M y &= \sum_{i \in [n]} y_i^2 v_i - \dotp{y}{v}^2\\
            &= \left(\sum_{i \in [n]} y_i^2 v_i\right) \left(\sum_{i \in [n]} v_i\right) - \left(\sum_{i \in [n]} y_iv_i\right)^2 \tag{since $\sum_{i \in [n]} v_i = 1$}\\
            &\geq 0.
        \end{align}
        The last inequality follows by using the Cauchy-Schwarz inequality on the vectors $(y_i \sqrt{v_i})_{i \in [n]}$ and $(\sqrt{v_i})_{i \in [n]}$. Here we use the fact that each $v_i$ is nonnegative.
        \item This is proven in~\cite[Theorem~7]{bullins20highlysmooth}.\qedhere
    \end{enumerate}
\end{proof}

\softmaxgradients*

\begin{proof}
    $\frac{\smax_{\rho}(x) - \smax^{\leq m}_{\rho}(x)}{\rho} = \delta$ implies that
    \begin{equation}
    \delta = \ln\left(\frac{
        \sum_{i=1}^{n} \exp\left(
            \frac{x_i}{\rho}
        \right)
    }{
        \sum_{i=1}^{m} \exp\left(
            \frac{x_i}{\rho}
        \right)
    }\right)
        =
    \ln\left(1 + \frac{
        \sum_{i= m+1}^{n} \exp\left(
            \frac{x_i}{\rho}
        \right)
    }{
        \sum_{i=1}^{m} \exp\left(
            \frac{x_i}{\rho}
        \right)
    }\right)
    \end{equation}
    
    Let $c = \frac{\sum_{i=m+1}^{n} \exp(x_i/\rho)}{\sum_{i=1}^{m} \exp(x_i/\rho)}$. Since $\delta = \ln(1+c) \geq c/2$ for $\delta<1$, an upper bound of $2c$ would suffice to prove the lemma.

    Using the equation for the gradient of $\smax$ from~\Cref{lem:smaxsmoothness}, we get $\nabla \smax_{\rho}(x) - \nabla \smax^{\leq m}_{\rho}(x)$ to be equal to
    \begin{align}
        &\frac{(\exp\left(\frac{x_1}{\rho}\right), \dots, \exp\left(\frac{x_{n}}{\rho}\right))}{ \sum_{i=1}^{n} \exp\left(\frac{x_{i}}{\rho}\right)} - \frac{(\exp\left(\frac{x_{1}}{\rho}\right), \dots, \exp\left(\frac{x_{m}}{\rho}\right), 0, \dots, 0 )}{ \sum_{i=1}^{m} \exp\left(\frac{x_{i}}{\rho}\right)}\\
        = &\frac{(\exp\left(\frac{x_{1}}{\rho}\right), \dots, \exp\left(\frac{x_{n}}{\rho}\right))}{ \sum_{i=1}^{n} \exp\left(\frac{x_{i}}{\rho}\right)} - \frac{(1+c)(\exp\left(\frac{x_{1}}{\rho}\right), \dots, \exp\left(\frac{x_{m}}{\rho}\right), 0, \dots, 0 )}{ (1+c)\sum_{i =1}^{m} \exp\left(\frac{x_{i}}{\rho}\right)}\\
        = &\frac{-c(\exp\left(\frac{x_{1}}{\rho}\right), \dots, \exp\left(\frac{x_{m}}{\rho}\right), 0, \dots, 0) + (0,\dots,0,\exp\left(\frac{x_{m+1}}{\rho}\right), \dots, \exp\left(\frac{x_{n}}{\rho}\right))}{ (1+c)\sum_{i =1}^{m} \exp\left(\frac{x_{i}}{\rho}\right)}.
    \end{align}
    The norm of this is at most
    \begin{align}
        & \frac{\|-c(\exp\left(\frac{x_{1}}{\rho}\right), \dots, \exp\left(\frac{x_{m}}{\rho}\right))\| + \|(\exp\left(\frac{x_{m+1}}{\rho}\right), \dots, \exp\left(\frac{x_{n}}{\rho}\right))\|}{ (1+c)\sum_{i =1}^{m} \exp\left(\frac{x_{i}}{\rho}\right)}\\
        \leq & \frac{c}{1+c} \frac{\sum_{i =1}^{m} \abs{\exp\left(\frac{x_{i}}{\rho}\right)}}{\sum_{i =1}^{m} \exp\left(\frac{x_{i}}{\rho}\right)} + \frac{1}{1+c} \frac{\sum_{i =m+1}^{n} \abs{\exp\left(\frac{x_{i}}{\rho}\right)}}{\sum_{i =1}^{m} \exp\left(\frac{x_{i}}{\rho}\right)}\\
        \leq & \frac{c}{1+c} + \frac{c}{1+c} < 2c.\qedhere
    \end{align}
\end{proof}


\section{Deferred proofs: Function construction and properties}\label{sec:hardclass}

Here we prove~\Cref{lem:singlestep,lem:finaloutput}.

\singlesteplemma*

\finaloutputlemma*

The proofs of both of these use the following lemma.

\begin{lemma}\label{lem:ballconcentration}
    Fix any $t \in \{0,\dots,k-1\}$. Conditioned on any fixing of $\{v_i\}_{i \leq t}$, any query $x$ in the unit ball will, with probability at least $1-1/n^{10}$, satisfy $\forall i>t\,\,\abs{\dotp{v_i}{x}} \leq 10\sqrt{\frac{\ln n}{n}}$.
\end{lemma}

\begin{proof}
    Note that for $i>t$, $v_i$ is distributed uniformly at random from a unit sphere in $\R^{n-t}$. The following useful concentration statement about random unit vectors follows from~\cite[Lemma 2.2]{Ball97}.

    \begin{proposition}\label{prop:concentration}
    Let $x\in B(\vec 0, 1)$. Then for a random unit vector $v$, and all $c>0$,
        \begin{equation}
            \Pr_{v}(|\<x,v\>| \geq c) \leq 2e^{-nc^2/2}.
        \end{equation}
    \end{proposition}
    
    Using~\Cref{prop:concentration} and the fact that $n-t > n/2$ we have that for any $x$ in the unit ball,
    \begin{align*}
        \Pr\left[\abs{\dotp{v_i}{x}} \geq 10\sqrt{\frac{\ln n}{n}}\right] &\leq 2e^{-n/2 \left( 10\sqrt{\frac{\ln n}{n}} \right)^2/2}\\
        &\leq 2e^{-25\ln n} \leq n^{-24}.
    \end{align*}
    Applying a union bound for each of the vectors $v_{t+1}, \dots, v_k$, we have that with probability at least $1-1/n^{23}$, $\forall i>t\,\,\abs{\dotp{v_i}{x}} \leq 10\sqrt{\frac{\ln n}{n}}$. (We use the constant $10$ in the lemma statement only because it is a nicer constant than $23$.)
\end{proof}

\begin{proof}[Proof of \Cref{lem:singlestep}]
    To show that $g(x) = g_{t+1}(x)$, we will show that for all $y \in B_{\beta}(x)$, $h(y) = h_{t+1}(y)$. Let $E_x$ be the event that $x$ satisfies $\forall i>t\,\,\abs{\dotp{v_i}{x}} \leq 10\sqrt{\frac{\ln n}{n}}$. We will show that $E_x \implies g(x) = g_{t+1}(x)$. Hence let us assume $E_x$ holds.

    We know that $x$ satisfies $\dotp{v_{j}}{x} - \dotp{v_{t+1}}{x} \leq 20\sqrt{\frac{\ln n}{n}}$ for all $j>t+1$. Hence for any $y \in B_{\beta}(x)$, $\dotp{v_{j}}{y} - \dotp{v_{t+1}}{y} \leq 20\sqrt{\frac{\ln n}{n}} + 2\beta$. To show that $h(y) = h_{t+1}(y)$, it is sufficient to show that $f_{t+1}(y) \geq f_j(y)$ for all $j>t+1$.

    Note that $f_{t+1}(y) \geq f_j(y)$ if and only if 
    \begin{align*}
    (j-t-1)n^{-\alpha} & \geq 
    \ln\left(\frac{
            \sum_{\ell =1}^{j} \exp\left(
                \frac{\dotp{y}{v_{\ell}} + (k-\ell)\gamma}{\rho}
            \right)
        }{
            \sum_{\ell =1}^{t+1} \exp\left(
                \frac{\dotp{y}{v_{\ell}} + (k-\ell)\gamma}{\rho}
            \right)
        }\right)\\
        & = 
        \ln\left(1 + \frac{
            \sum_{\ell = t+2}^{j} \exp\left(
                \frac{\dotp{y}{v_{\ell}} + (k-\ell)\gamma}{\rho}
            \right)
        }{
            \sum_{\ell =1}^{t+1} \exp\left(
                \frac{\dotp{y}{v_{\ell}} + (k-\ell)\gamma}{\rho}
            \right)
        }\right)
    \end{align*}
    Since $c \geq \ln(1+c)$, the following statement which we will show is in fact stronger.
    \[ n^{-\alpha} \geq \frac{
        k \max_{t+2 \leq \ell \leq j} \exp\left(
            \frac{\dotp{y}{v_{\ell}} + (k-t-2)\gamma}{\rho}
        \right)
    }{
        \exp\left(
            \frac{\dotp{y}{v_{t+1}} + (k-t-1)\gamma}{\rho}
        \right)
    }.\]
    
    This can be rewritten as $- \rho \alpha \ln n \geq \rho \ln k + \max_{t+2 \leq \ell \leq j} \dotp{y}{v_{\ell}} - \dotp{y}{v_{t+1}} - \gamma$, or $\gamma \geq \rho (\ln k + \alpha \ln n) + \max_{t+2 \leq \ell \leq j} \dotp{y}{v_{\ell}} - \dotp{y}{v_{t+1}}$.
    
    We know this last statement is true because the RHS is at most $\rho(1 + \alpha)\ln n + 20\sqrt{\frac{\ln n}{n}} + 2\beta$ which is smaller than $\gamma$, which is $40\sqrt{\frac{\ln n}{n}}$ (recall that $\beta = \gamma/\ln n$ and $\rho = \gamma/100 \alpha \ln n$).

    Since $E_x$ is true with probability at least $1-1/n^{10}$ (by~\Cref{lem:ballconcentration}), the lemma follows.
\end{proof}


\begin{proof}[Proof of \Cref{lem:finaloutput}]
    Again, let $E_x$ be the event that $x$ satisfies $\forall i>t\,\,\abs{\dotp{v_i}{x}} \leq 10\sqrt{\frac{\ln n}{n}}$. Let us assume $E_x$ holds.

    The value of $g(x)$ can be lower bounded as follows. Since $\dotp{x}{v_k} \geq -10\sqrt{\frac{\ln n}{n}}$, $h(x) \geq f_k(x) \geq \rho \ln \exp\left(\frac{-10\sqrt{\frac{\ln n}{n}}}{\rho}\right) = -10\sqrt{\frac{\ln n}{n}}$. Since $h$ is $1$-Lipschitz, $g(x) \geq -10\sqrt{\frac{\ln n}{n}} - 2\beta \geq -11\sqrt{\frac{\ln n}{n}}$ (because $\beta < 40/\sqrt{n \ln n}$).

    For $x^* = -\frac{1}{\sqrt{k}} \sum_{i \in [k]} v_i$, we know each $f_i(x^*)$ is at most $\rho \ln\left( k \exp\left(\frac{-1/\sqrt{k} + k\gamma}{\rho}\right)\right) + kn^{-\alpha}$. This is at most \[ \rho \ln k - \frac{1}{\sqrt{k}} + k\gamma + kn^{-\alpha}. \]
    This in turn is at most $-0.8/\sqrt{k}$ since $k\gamma \leq 0.1/\sqrt{k}$, $n \geq \Omega(k^3)$, $\alpha>1$ and $\rho < 1/k$. So $g(x^*) \leq -0.8/\sqrt{k} + 2\beta < -0.7/\sqrt{k}$.
    
    Since $\sqrt{\frac{\ln n}{n}} \ll 1/\sqrt{k}$, $g(x) > g(x^*) + 0.1/\sqrt{k}$ and so $x$ does not optimize $g$.

    Since $E_x$ holds with probability $1-1/n^{10}$, the lemma follows.
\end{proof}

%=============================================================================
\section{Parallel Randomized and Quantum Lower Bounds}\label{sec:infhiding}

Our randomized lower bound follows from two important properties satisfied by our hard class of functions. We abstract out these properties and define a generic class of hard functions. A class of functions $\mathscr{F}$ is an information-hiding class of functions if there is a sequence of `partially-informed' functions that reveal very little new information, with $\mathscr{F}$ containing the `fully-informed' functions.

An example the reader may want to keep in mind is the following `Guess the numbers' problem. For a sequence of numbers $A = (a_1, \dots, a_m) \in [N]^m$, consider the function $f_{A}$ that takes as input a sequence $B \in [N]^m$ and returns the sequence $A_{\leq i} 0^{m-i}$ where $i \in [m]$ is the maximum number such that $A_{<i} = B_{<i}$. The task is to learn $A$. An example `partially-informed' function would be $f_{A_{\leq i}0^{m-i}}$. This hides information in the sense that if one doesn't know $A_{\le i}$ (say $A$ is chosen uniformly at random), then for most inputs the output of $f_{A}$ would be the same as the output of $f_{A_{\leq i}0^{m-i}}$, and of course, for no input does the output of $f_{A_{\leq i}0^{m-i}}$ reveal any more information than $A_{\leq i}$.

\begin{definition}[Information-Hiding Class of Functions]\label{def:infhidingfunctions}
  Let $\mR = (\mR_1, \dots, \mR_m)$ be a random variable defining a sequence of functions $f_1, \dots, f_m$ in the sense that setting a value of $\mR_{\leq i}$ fixes the function $f_i$. The class of functions $\{f_m\}$ obtained by ranging over the various values of $\mR$ is an $m$-step $(\delta_1,\delta_2)$-information-hiding class of functions (under the distribution $\mR$) if the sequence satisfies the following properties.
  \begin{enumerate}
    \item For all $1 \leq i < m$ and any setting of $\mR_{< i}$,
    \[ \forall x \in \R^n: \Pr_{\mR_{\geq i} | \mR_{< i}} 
    \left( O_{f_m}(x) = O_{f_i}(x) \right) \geq 1-\delta_1 \]
    where $O_{f}(x)$ is the information about $f$ that the model allows us to query at $x$ (for example the function value, gradient and perhaps higher order derivatives if our queries provide them).
    \item For any setting of $\mR_{< m}$,
    \[ \forall x \in \R^n: \Pr_{\mR_{m} | \mR_{< m}} 
    \left( x \text{ is a correct output for }f_m \right) \leq \delta_2. \]
  \end{enumerate}
\end{definition}

As a corollary of \Cref{lem:singlestep,lem:finaloutput}, we see that our class of hard functions is indeed an information-hiding class of functions.

\begin{corollary}\label{thm:gisinfhiding}
    Let $V = (v_1, \dots, v_k)$ be the random variable that is distributed Haar randomly from the possible choices of $k$ orthonormal vectors from $\R^n$. The sequence of functions $g_1,\dots,g_k$ is a $k$-step $(n^{-10},n^{-10})$-information-hiding class of functions when the allowed queries are function values and derivatives up to the $p$th order derivative.
\end{corollary}

We now prove the hardness of information-hiding classes of functions. We start with the setting of parallel randomized algorithms.

\begin{theorem}\label{thm:parallellb}
  Let $\mathscr{F}$ be an $m$-step $(\delta_1,\delta_2)$-information-hiding class of functions under the distribution $\mR$. Then for any parallel query algorithm $\mathcal{A}$ making $K$ queries per round and using less than $m$ rounds, the probability that the algorithm outputs a correct output for $f$ distributed according to $\mR$ is at most $\delta_2 + mK\delta_1$.
\end{theorem}

\begin{proof}
    Let the success probability of $\mathcal{A}$ be $p_\mathrm{succ}$ when $f$ is distributed according to $\mR$. We can fix the randomness of $\mathcal{A}$ to get a deterministic algorithm $\mathcal{B}$ with success probability at least $p_\mathrm{succ}$ on the same distribution.

    Let us denote the transcript of $\mathcal{B}$ as $T = (S_1, S_2, \dots, S_{m-1}, x_\mathrm{out})$ where $S_i$ is the set of queries made in the $i$th round and $x_\mathrm{out}$ is the output of the algorithm. Note that these are random variables that depend only on $\mR$. We now create hybrid transcripts $T^{(i)}$ for $0 \leq i \leq m-1$. The hybrid transcript $T^{(i)} = (S_1^{(i)},\cdots,S_{m-1}^{(i)},x_{\mathrm{out}}^{(i)})$ is defined as the transcript of $\mathcal{B}$ when, for all $j \leq i$, the oracle calls in round $j$ (which are supposed to be to $\Oracle_{f_m}$) are replaced with oracle calls to $\Oracle_{f_j}$. Note that
    \begin{itemize}
        \item For any value of $\mR$, $T = T^{(0)}$.
        \item $T^{(m-1)}$ is a function of $\mR_{\leq m-1}$.
        \item For any value of $\mR$, if the answers of $\Oracle_{f_m}$ on $S_{i}^{(i-1)}$ are the same as the answers of $\Oracle_{f_i}$ on $S_{i}^{(i)}$ then $T^{(i-1)} = T^{(i)}$. This is because they have queried the same oracles in their first $i-1$ calls, given the same inputs in the $i$th call and gotten the same output, and have been querying the same oracles thereafter.
    \end{itemize}

    We start with the observation that
    \begin{align}
        \Pr_{\mR}[x_{\mathrm{out}}^{(m-1)} \text{ is }\epsilon\text{-optimal}] &= \E_{\mR_{< m}} \left[ \Pr_{\mR_m | \mR_{<m}}[x_{\mathrm{out}}^{(m-1)} \text{ is }\epsilon\text{-optimal}] \right] \\
        &\leq \delta_2. \tag{by property $2$ in \Cref{def:infhidingfunctions}}
    \end{align}

    Next we show that $\Pr_{\mR}[x_{\mathrm{out}}^{(m-1)} = x_{\mathrm{out}}] \geq 1-mK\delta_1$ which will complete the proof.

    \begin{align}
        \Pr_{\mR}[x_{\mathrm{out}} \neq x_{\mathrm{out}}^{(m-1)}] &\leq \sum_{i \in [m-1]} \Pr_{\mR}[x_{\mathrm{out}}^{(i-1)} \neq x_{\mathrm{out}}^{(i)}]\\
        &\leq \sum_{i \in [m-1]} \Pr_{\mR}[T^{(i-1)} \neq T^{(i)}]\\
        &\leq \sum_{i \in [m-1]} \Pr_{\mR}[\Oracle_{f_m}(S_{i}^{(i-1)}) \neq \Oracle_{f_i}(S_{i}^{(i)})]\\
        &\leq \sum_{i \in [m-1]} \E_{\mR_{<i}}\left[\Pr_{\mR_{\geq i} | \mR_{<i}}[\Oracle_{f_m}(S_{i}^{(i-1)}) \neq \Oracle_{f_i}(S_{i}^{(i)})] \right]\\
        &\leq m K \delta_1,
    \end{align}
    since $S_{i}^{(i-1)} = S_{i}^{(i)}$, and using property 1 in~\Cref{def:infhidingfunctions} with a union bound over the inputs in each $S_i$.
\end{proof}

We now turn to quantum query algorithms. In our quantum query model a $t$-query quantum query algorithm is a quantum circuit that uses a query oracle $t$ times. The query oracle is implemented by a unitary so that it supports queries in superposition. We allow arbitrarily high precision for the real numbers involved, and our lower bound is independent of the algorithm maker's choice of number of bits of precision. This is the same model used by and described in more detail in~\cite[Section 4.3]{GKNS21}.

We show that an information-hiding class of functions would be hard even for quantum query algorithms to compute. We can't use the above proof since we can't use a union bound on all queried points; a single quantum query may query exponentially many points in superposition. However, we know that a large fraction of this superposition is on points that don't reveal much information. The small fraction of points that do reveal information will not be noticeable to the quantum query algorithm since they are only a small fraction of the superpositioned points. We can then use the hybrid argument again to give a quantum query lower bound analogous to the classical one proved above.

\begin{theorem}\label{thm:quantumlb}
  Let $\mathscr{F}$ be an $m$-step $(\delta_1,\delta_2)$-information-hiding class of functions under the distribution $\mR$. Then for any quantum query algorithm making less than $m$ queries, the probability that the algorithm outputs a correct output for $f$ distributed according to $\mR$ is at most $\delta_2 + 4m\sqrt{\delta_1}$.
\end{theorem}

The proof of this goes via what is commonly called the hybrid argument. Fix any quantum algorithm $A$ making at most $m-1$ queries, specified by the unitaries $U_{m-1}O_{f_m}U_{m-2}O_{f_m} \cdots U_{1}O_{f_m}U_{0}$. Now we define a sequence of unitaries starting with $A_0 = A$ as follows:
\begin{align}\label{eq:unitaries}
    A_0 &\defeq U_{m-1}O_{f_m}U_{m-2}O_{f_m} \cdots O_{f_m}U_{1}O_{f_m}U_{0} \nonumber\\
    A_1 &\defeq U_{m-1}O_{f_m}U_{m-2}O_{f_m} \cdots O_{f_m}U_{1}O_{f_1}U_{0} \nonumber \\
    A_2 &\defeq U_{m-1}O_{f_m}U_{m-2}O_{f_m} \cdots O_{f_2}U_{1}O_{f_1}U_{0}\\
    &\phantom{n}\vdots \nonumber\\
    A_{m-1} &\defeq U_{m-1}O_{f_{m-1}}U_{m-2}O_{f_{m-2}} \cdots O_{f_2}U_{1}O_{f_1}U_{0} \nonumber
\end{align}

Property 1 provides us with the following lemma.

\begin{lemma}[$A_{t}$ and $A_{t-1}$ have similar outputs]\label{lem:similar}
  Let $A$ be an $(m-1)$-query algorithm and let $A_{t}$ for $t\in [m-1]$ be the unitaries defined in \cref{eq:unitaries}. Then
  \begin{equation}
      \E_{\mR}\bigl(\norm{A_t|0\>-A_{t-1}|0\>}^2\bigr) \leq 4\delta_1.
  \end{equation}
\end{lemma}
\begin{proof}
  From the definition of the unitaries in \cref{eq:unitaries} and the unitary invariance of the Euclidean norm, we see that $\norm{A_t|0\>-A_{t-1}|0\>} = 
  \norm{ (O_{f_t}-O_{f_m}) U_{t-1}O_{f_{t-1}} \cdots O_{f_1} U_0|0\>}$. Let us prove the claim for any fixed choice of vectors $\mR_{\leq t-1}$, which will imply the claim for any distribution over those vectors. Once we have fixed these vectors, the state $U_{t-1}O_{f_{t-1}} \cdots O_{f_1} U_0|0\>$ is a fixed state, which we can call $|\psi\>$. 
  Thus our problem reduces to showing for all quantum states $|\psi\>$,
  \begin{equation}\label{eq:OVOVt}
      \E_{\mR_{\ge t} | \mR_{< t}}\bigl(\norm{(O_{f_t}-O_{f_m})|\psi\>}^2\bigr) \leq 4\delta_1. 
  \end{equation}
  Now we can write an arbitrary quantum state as $|\psi\>=\sum_x \alpha_x |x\>|\phi_x\>$, where $x$ is the query made to the oracle, and $\sum_x |\alpha_x|^2 =1$.  Thus the LHS of \cref{eq:OVOVt} is equal to
  \begin{equation}
      \E_{\mR_{\geq t} | \mR_{< t}}\left(\Norm{\sum_{x} \alpha_x (O_{f_t}-O_{f_m})|x\>|\phi_x\>}^2\right)
      \leq 
      \sum_{x}  |\alpha_x|^2 \E_{\mR_{\geq t} | \mR_{< t}}\left(\norm{  (O_{f_t}-O_{f_m})|x\>|\phi_x\>}^2\right)        .
  \end{equation}

  Since $|\alpha_x|^2$ defines a probability distribution over $x$, we can again upper bound the right hand side for any $x$ instead. 
  Since $O_{f_t}$ and $O_{f_m}$ behave identically for some inputs $x$, the only nonzero terms are those where the oracles respond differently, which can only happen if $O_{f_t}(x) \neq O_{f_m}(x)$. When the response is different, we can upper bound $\norm{  (O_{f_t}-O_{f_m})|x\>|\phi_x\>}^2$ by $4$ using the triangle inequality. Thus for any $x \in \R^n$, we have 
  \begin{equation}
      \E_{\mR_{\geq t} | \mR_{< t}}\left(\norm{(O_{f_t}-O_{f_m})|x\>|\phi_x\>}^2\right)
      \leq 4 \Pr_{\mR_{\geq t} | \mR_{< t}}(O_{f_t}(x) \neq O_{f_m}(x)) \leq 4\delta_1, 
  \end{equation}
  where the last inequality follows from Property $1$.
\end{proof}

And Property 2 provides us with the following.

\begin{lemma}[$A_{m-1}$ does not solve the problem]\label{lem:Akminusone}
  Let $A$ be an $(m-1)$-query algorithm and let $A_{m-1}$ be defined as above. 
  Let $p_R$ be the probability distribution over $x \in B(\vec 0,1)$ obtained by measuring the output state $A_{m-1}|0\>$ when the randomness $\mR$ is fixed to $R$. Then $\Pr_{R \sim \mR, x \sim p_R} (x \text{ is a correct output}) \leq \delta_2$.
\end{lemma}
\begin{proof}
Let us establish the claim for any fixed choice of $\mR_{< m}$, since if the claim holds for any fixed choice of these vectors, then it also holds for any probability distribution over them. For a fixed choice of vectors, this claim is just $\Pr_{\mR_m, x \sim p_R} (x \text{ is a correct output}) \leq \delta_2$. Now since the algorithm $A_{m-1}$ only has oracles $O_{f_i}$ for $i<m$, the probability distribution $p_R$ only depends on $R_{< m}$. Since these are fixed, this is just a fixed distribution $p$. So we can instead establish our claim for all $x \in B(\vec 0, 1)$, which will also establish it for any distribution.

So what we need to establish is that for any $x \in \R^n$, $\Pr_{\mR_{m}} \left( x \text{ is a correct output} \right) \leq \delta_2$ which is what Property 2 gives us.
\end{proof}

Finally we can put these two lemmas together to prove our lower bound.

\begin{lemma}[$A$ does not solve the problem]\label{lem:qlowerbound}
    Let $A$ be an $m-1$ query algorithm. Let $p_R$ be the probability distribution over $x \in B(\vec 0,1)$ obtained by measuring the output state $A|0\>$ when the randomness $\mR$ is fixed to $R$. Then $\Pr_{R \sim \mR, x\sim p_R} (x \text{ is a correct output}) \leq \delta_2 + 4m\sqrt{\delta_1}$.
\end{lemma}

\begin{proof}
    Let $P_R$ be the projection operator that projects a quantum state $\ket{\psi}$ onto the space spanned by vectors $\ket{x}$ for $x$ such that $x$ is a correct output when $\mR = R$. Then $\| P_R A \ket{0} \|^2 = \Pr_{x\sim p_R} (x \text{ is a correct output})$. We know from \Cref{lem:Akminusone} that $\E_{R \sim \mR}\bigl(\norm{P_R A_{m-1} \ket{0}}^2\bigr) \leq \delta_2$. We prove our upper bound on the probability by showing that it is approximately the same as $\E_{R \sim \mR}\bigl(\norm{P_R A_{m-1} \ket{0}}^2\bigr)$.
    
    \Cref{lem:similar} states that for all $1 \leq t < m$, $\E_{\mR}\bigl(\norm{A_{t}|0\>-A_{t-1}|0\>}^2\bigr) \leq 4\delta_1$. Using telescoping sums and the Cauchy-Schwarz inequality, we see that
    \begin{align}
        \E_{\mR}\bigl( \norm{A_{m-1}|0\>-A|0\>}^2 \bigr) &\leq \E_{\mR}\left( \left(\sum_{t \in [m-1]} \norm{A_{t}|0\>-A_{t-1}|0\>}\right)^2 \right)\\
        &\leq \E_{\mR}\left(\sum_{t \in [m-1]} \norm{A_{t}|0\>-A_{t-1}|0\>}^2\right) \left( \sum_{t \in [m-1]} 1^2 \right) \leq 4\delta_1 \cdot m \cdot m.
    \end{align}

    For all $R$, $\abs{\norm{P_R A_{m-1} \ket{0}} - \norm{P_R A \ket{0}}} \leq \norm{P_R A_{m-1} \ket{0} - P_R A \ket{0}} = \norm{P_R (A_{m-1} \ket{0} - A \ket{0})}  \leq \norm{A_{m-1} \ket{0} - A \ket{0}}$. Hence
    \begin{equation}\label{eq:qlowerboundintermediatebound}
      \E_{R \sim \mR}\bigl(\bigl(\norm{P_R A_{m-1} \ket{0}} - \norm{P_R A \ket{0}}\bigr)^2\bigr) \leq 4m^2\delta_1.
    \end{equation}

    We want an upper bound on $\E_{R \sim \mR}\bigl(\norm{P_R A \ket{0}}^2 - \norm{P_R A_{m-1} \ket{0}}^2\bigr)$, which is no larger than $2 \E_{R \sim \mR}\bigl(\abs{\norm{P_R A \ket{0}} - \norm{P_R A_{m-1} \ket{0}}}\bigr)$ since $\norm{P_R A \ket{0}} + \norm{P_R A_{m-1} \ket{0}} \leq 2$. We get such a bound by applying Jensen's inequality to \cref{eq:qlowerboundintermediatebound}: $\E_{R \sim \mR}\bigl(\abs{\norm{P_R A \ket{0}} - \norm{P_R A_{m-1} \ket{0}}}\bigr) \leq 2m\sqrt{\delta_1}$, and so $\E_{R \sim \mR}\bigl(\norm{P_R A \ket{0}}^2 - \norm{P_R A_{m-1} \ket{0}}^2\bigr) \leq 4m\sqrt{\delta_1}$.
    
    We can now use linearity of expectation and upper bound our required probability as 
    \begin{equation} 
        \Pr_{R \sim \mR,x\sim p_R} (x \text{ is a correct output}) = \E_{R \sim \mR}\bigl(\norm{P_R A \ket{0}}^2\bigr) \leq \delta_2 + 4m\sqrt{\delta_1}. \qedhere
    \end{equation}
\end{proof}

The proofs of the quantum lower bound in \Cref{thm:main} and the highly parallel lower bound alluded to after that now follow from \Cref{thm:parallellb,thm:quantumlb}~and~\Cref{thm:gisinfhiding}.

\begin{corollary}
    The complexity of $\epsilon$-optimizing the class of functions $g$ is:
    \begin{itemize}
        \item $k$ rounds in the parallel randomized setting where in each round $K$ parallel queries are allowed, and $Kn^{-9}\ll 1$. (Note that by modifying the constants in the definition of the function, we can support $K$ being any polynomial in $n$.)
        \item $k$ queries in the quantum setting to get success probability larger than $n^{-4}$.
    \end{itemize}
\end{corollary}
