
We show by induction that the $n^{\text{th}}$-order derivative of the cross-entropy loss w.r.t.\ the weight vector \(\mathbf{w}\) of a binary logistic regression classifier is:

\begin{equation}\label{eq:n_derivative}
\mathtt{\nabla^n_{\mathbf{w}}\ell (\boldsymbol{\theta})} = Q_n(p)\mathtt{\mathbf{x}^{\otimes n}}
\end{equation}
where $Q_n(p)$ is some scalar-valued polynomial function of $p$.
%
%
%
%
\begin{proof} By induction.


\textbf{Base Case (\( n = 1 \))}:

For \( n = 1 \), the gradient of the cross-entropy loss \(\ell\) w.r.t. \(\mathbf{w}\) is:

\[
\nabla^1_{\mathbf{w}} \ell (\boldsymbol{\theta})= (p - y) \mathbf{x}
\]

This matches the form $Q_1(p) \mathbf{x}^{\otimes 1}$ with $Q_1(p) = p - y$, which is a polynomial in $p$ for a fixed label $y$. Hence the base case holds.

\textbf{Inductive Step}: Assume that \Cref{eq:n_derivative} holds for some $n \geq 1$, i.e.,

\[\mathtt{\nabla^n_{\mathbf{w}}\ell (\boldsymbol{\theta})} = Q_n(p)\mathtt{\mathbf{x}^{\otimes n}}.
\]
We need to show that it also holds for \((n+1)\):

\[
\mathtt{\nabla^{n+1}_{\mathbf{w}} \ell (\boldsymbol{\theta}) }= Q_{n+1}\left(p\right) \mathtt{\mathbf{x}^{\otimes (n+1)}}
\]

Since $\mathbf{x}^{\otimes n}$ does not depend on $\mathbf{w}$, the product rule gives:
\begin{equation*}
    \begin{aligned}
    \mathtt{\nabla^{n+1}_{\mathbf{w}} \ell (\boldsymbol{\theta}) }= & \nabla_{\mathbf{w}} \left[ Q_{n}\left(p\right) \mathtt{\mathbf{x}^{\otimes n}} \right]\\
    = &\left[\nabla_{\mathbf{w}} Q_{n}\left(p\right) \right]\mathtt{\mathbf{x}^{\otimes n}} 
    \end{aligned}
\end{equation*}
And by chain rule:
\begin{equation*}
    \begin{aligned}
        \nabla_{\mathbf{w}} Q_{n}\left(p\right) 
        = & \left[\nabla_{p} Q_{n}\left(p\right) \right] \nabla_{\mathbf{w}} p
    \end{aligned}
\end{equation*}

The first factor, $\nabla_{p} Q_{n}(p)$, is the derivative of a polynomial function of $p$, which is again a polynomial function of $p$. The second factor, as we have seen in \Cref{app:hess_softmax}, is $\nabla_{\mathbf{w}} p = p(1-p) \mathbf{x}$. Now putting everything together, we have
\begin{equation*}
    \begin{aligned}
    \mathtt{\nabla^{n+1}_{\mathbf{w}} \ell (\boldsymbol{\theta})}
    = &\left[\nabla_{\mathbf{w}} Q_{n}\left(p\right) \right]\mathtt{\mathbf{x}^{\otimes n}} \\
    = & \left[\nabla_{p} Q_{n}\left(p\right) \right] p (1-p) \mathbf{x} \mathtt{\mathbf{x}^{\otimes n}}\\
    = & Q_{n+1}\left(p\right) \mathtt{\mathbf{x}^{\otimes (n + 1)}},
    \end{aligned}
\end{equation*}
where $Q_{n+1}(p) = \left[\nabla_{p} Q_{n}(p)\right] p(1-p)$ is again a polynomial function of $p$, which completes the induction.
\qedhere
\end{proof}