\label{sec:loss-repr}

% \def\GD#1{(\mathtt{GD}\;#1)}
% \def\GD#1{\mathtt{GF}[#1]}
\def\GD#1{\mathtt{GradFlow}[#1]}
\def\NGD#1{\mathtt{NGF}[#1]}

% Perhaps the most common way to define a continuous trajectory, or a
% vector field for that matter, is with the gradient of a function.
% One particularly natural class of trajectories is given by gradient descent
% We have now seen that commitment functions $F = \Lrn_\phi$ can be represented with vector fields (\cref{sec:vecrep}), One particularly important way of specifying a vector field is via the gradient of a loss function.
% We have seen that c
% Commitment functions
We have now seen how learners satisfying certain axioms
% (for confidence on a continuum)
% $\Lrn_\phi : \confdom \times \Theta \to \Theta$
can be represented as vector fields (\cref{sec:vecrep}).
A particularly important way of specifying a vector field is via the gradient of a potential.
	% \unskip\footnote{
	% Technically, 
	% }
	% Technically, the gradient of a function is 
This is especially true in modern machine learning, where training is idealized as loss-minimizing gradient flow \citep{arora2018convergence}, and where the substantial
%
% This relationship can be described by a 
% Indeed, t
advances of the last two decades have repeatedly demonstrated the value of casting learning as optimization \citep{sra2011optimization}.
Our framework allows us to express this idea as a simple relationship between $\Lrn$ and $\Bel$:
% In our framework, the idea can be expressed as a simple relationship between $\Lrn$ and $\Bel$:

% \begin{equation}
\begin{LrnBelAxioms}[nosep]
	\item $\displaystyle
	\frac{\partial}{\partial \chi} 
		% {^+\!\Lrn}
		{\Lrn}
		(\phi, \chi, \theta)
	 	% \Big|_{\chi = \bot}  
	= \nabla_{\mskip-2mu\theta }
		% {^+\!\Bel}
		\,
		{\Bel}
			(\phi, \theta)
	$ \label{ax:lb-ascent}
\end{LrnBelAxioms}

\Cref{ax:lb-ascent} says that learning occurs by gradient ascent (%
i.e., using some measure of disbelief in observations
%  $-\Bel(\phi,\theta)$ 
as a loss):
that learning is fundamentally (just) about locally increasing degree of belief---no more, and no less.
% The negation of $\Bel(\phi,\theta)$ acts like a loss function.
% In other words, $-\Bel_\phi(\theta)$ is a loss function. 
%
It also gives us a way of turning $\Bel$ (whose output is an epistemic confidence) into a commitment flow $\Lrn$ (which takes a learner's confidence as input), which may have contributed to any ambient confusion about the distinction between the two readings of the word ``confidence''. 
% 
%
Unlike \cref{ax:effectiveness,ax:monotone,ax:truth-is-enough}, 
	\cref{ax:lb-ascent} imposes serious constraints on $\Lrn$ even if we are free to select $\Bel$.
% This allows us to define a learner $\Lrn$ by specifying $\Bel$, and 
% and so we call $\Lrn$ \emph{optimizing} if it can be 
We say $\Lrn$ is \emph{optimizing} if there exists some $\Bel$ such that the pair satisfy \cref{ax:lb-ascent}. 
%
% This way of constructing a vector field has another benefit: the flows formed from such vector fields are guaranteed to have limits as $t\to \infty$. 
This way of constructing a learner has another benefit: the flows formed from such vector fields are guaranteed to have limits and satisfy \cref{ax:acyclic}, meaning that orderless combination $\oplus$ is always well-defined. 

\vnew{
	
}

% \end{equation}
% We have asked that this relationship hold for their additive forms because, in a sense, gradients themselves are additive. 

\commentout{
To do this at full generality, we need to make sense of a gradient, which requires more structure, in the form of a Riemannian metric.  It turns out that, up to a multiplicative constant, there is a unique natural Riemannian metric on any parameterization of a probability distributions \cite{chentsov}; taking gradients with respect to this geometry, show how familiar loss functions on probability measures correspond to different standard notions of confidence in the other representations.

Suppose we have:
\begin{enumerate}[nosep]
	\item A differentiable loss function $\mathcal L : \Theta \times \Phi  \to \mathbb R$, which intuitively measures the ``incompatibility'' between a belief state $\theta$ and an assertion $\varphi$, and
	\item
		% A way of taking gradients of $U$ with respect to $\theta$,
		% % $\nabla : ()$
		% such as an inner product $g_p : T_p\Theta \times T_p\Theta \to \mathbb R$, making $(\Theta, g)$ a Riemannian manifold.
		A way of taking the gradient of ${\cal L}$ with respect to $\theta$,%
			\footnote{
			such as a tangent-cotangent isomorphism $(-)^\sharp : T^*_p\Theta \to T_p \Theta$, perhaps coming from an affine connection, in turn perhaps coming from a Riemmannian metric.}
        so as to obtain a vector field on $\Theta$ which optimizes $\mathcal L.$
\end{enumerate}

Then we can define an update rule $\GD {\cal L}$ that reduces inconsistency by gradient flow (the continuous limit of gradient descent). Concretely, such an update rule has a vector field:
% \def\GD#1{(\mathtt{GD}\; #1)}
\[
	\GD {\cal L}'_\phi(\theta) = - \nabla_\theta {\cal L}(\theta,\phi).
\]


\begin{prop}
	An update rule $F$ on a Riemannian manifold $\Theta$ is optimizing update rule if and only if $(F')^\flat$ is a conservative co-vector field.
	\cite[Prop 11.40]{lee2013smooth}
\end{prop}
}

% Note that this is true even for costs generated by asymmetric distances $c_{\{y\}}(x) = d(y, x) \ne d(x,y) = c_{\{x\}}(a)$.


Technically, to view the derivative of a function $\ell : \Theta \to \mathbb R$ as a vector field $\nabla \ell \in \mathfrak X\Theta$ (rather than a co-vector field), one needs more than a manifold structure on $\Theta$; we will assume that $\Theta$ comes with what is called a \emph{Riemannian Metric}. 
The details are unimportant; 
what matters is that we can always fall back on the Euclidean metric for subsets of $\mathbb R^n$, and that some other spaces (such as parametric families of distributions) have a different natural metric. 


% \textbf{Natural Gradients for Probability Distributions.}
% \subsection{Optimizing Commitment Functions for Probabilistic Beliefs}
\textbf{Optimizing Commitment for Probabilistic Beliefs.}
%
% Often, 
In many learning settings of interest,
beliefs $\theta \in \Theta$ are associated
with probability distributions $P_\theta \in \Delta \Omega$ over some measurable space $\Omega$. 
% When $\Theta$ parametrizes a family of probability distributions (i.e., we have in mind some function $\Pr : \Theta \to \Delta X$) there is a particularly natural metric on $\Theta$, called the Fisher information metric.
% Not only are beliefs often represented this way in practice, but also 
Fortunately, this gives us a natural Riemannian metric on $\Theta$---which, as explained above, is precisely what we need in order to make sense of gradients on a manifold. 
%
Specifically, the \emph{Fisher Information Metric} (FIM) induced by the parameterization $\theta \mapsto P_\theta$ 
% is the unique metric $\Theta$ that is independent of the representation of $\X$ \parencite{chentsov}, in the following sense.
turns out to be the unique metric (up to scalar multiple) that 
	is invariant under sufficient statistics
	\citep{chentsov}%
\footnote{
For instance, 
if $X$ and $Y$ take values in $\Omega$, and 
$p(Y|X)$ and $q(X|Y)$ are such that
% $\Pr_\theta = qp\Pr(\theta)$,
% for all $\theta$, sampling $x\sim \Pr_\theta$
% is the same as the distribution over $x'$  $y \sim p(Y|x)$, and $x'\sim q(X|y)$, is equialent to samkp
% the distribution $P_{\theta}(X)$ is unchanged is unchanged after converting to $Y$ and back again $X$ (via $p$ and $q$ respectively), 
$P_{\theta}(X) = q \circ p \circ P_\theta(X)$ 
% is unchanged is unchanged after applying the two stochastic maps, 
for all $\theta$,
\commentout{
as depicted by the in commutative diagram
% $q \circ p \circ \Pr_\theta(X) = \Pr_\theta(X)$
% \[
	% g [ \Theta \xrightarrow{\Pr} X ]  = g [ \Theta \xrightarrow{\Pr} X \xrightarrow{p}
	%  \Theta \xrightarrow{\Pr} X
	% \quad = \quad {} \xrightarrow{\theta} \Theta \xrightarrow{\Pr} X \overset p\to Y \overset q\to X,
	\[
	\begin{tikzcd}[ampersand replacement =\&]
		\Theta \ar[r,"\Pr"]\ar[d,"\Pr"']
			% \ar[rd,dashed,"\Pr^{(Y)}"description]
			\& X \\
		X \ar[r,"p"'] \& Y \ar[u, "q"']
	\end{tikzcd}, 
	\]
% \]
}%
then clearly the family $P_\theta(Y) := p\,\circ\,P_{\theta}(X)$ carries the same information about the parameters (and how to update them) as does $P_\theta(X)$.
Chentsov's theorem (\citeyear{chentsov}) tells us that the FIM 
	is the only Riemannian metric on $\Theta$ (as a function of the parameterization $\theta \mapsto P_\theta$), 
	that is the same whether derived from $P_\theta(X)$ or $P_\theta(Y)$. 
}%
---a finding that has led many to use the term \emph{natural gradient} for gradients in this geometry, and formed the basis of Information Geometry
\citep{amari1998natural,amari2000methods}.
% since we can (losslessly) convert between the two.
% $g[\Pr] = g[\Pr^{Y}]$
% \footnote{at least when $X$ and $Y$ take values in a finite set, although there have since been numerous extensions of it.}


% This allows us to compute the gradient with respect to this metric as
% At each point $\Theta$, the components of the Riemannian metric form a matrix---in this case, the Fisher information matrix $\mathcal I(\theta)$---which allow us to  compute the gradient in the natural geometry from the coordinate derivatives as
% At each point $\theta$,  the Riemannian metric form a matrix---in this case, the Fisher information matrix $\mathcal I(\theta)$---which allow us to  compute the gradient in the natural geometry from the coordinate derivatives as
% Technically,
To be rather technical for a paragraph,
a \emph{Riemannian metric} consists of an inner product $\langle\cdot,\, \cdot\rangle_\theta : T_\theta\Theta \times T_\theta\Theta \to \mathbb R$ on tangent vectors at each point $\theta \in \Theta$; 
	it can therefore be viewed as a matrix $G(\theta)$ with components $G(\theta)_{i,j} = \langle e_i, e_j \rangle_\theta$, where $\{e_i\}$ are basis vectors of the tangent space $T_\theta \Theta$.
% Thus, it can be viewed as giving a matrix $G(\theta) $
The gradient of a function $f : \Theta \to \mathbb R$ in this geometry is then given by
% \[
$
	% \NGD {\cal L}'_\phi (\theta) = -
	\nabla_\theta 
		% \mathcal L(\theta,\phi)
		f(\theta)
		% = \mathcal I(\theta)^{-1} \frac{\partial}{\partial \theta_i} U(\theta, \phi)
		:= G(\theta)^{\dagger}  \frac{\partial f}{\partial \theta}^{\mathsf T}(\theta)
		% \mathcal L(\theta, \phi),
		% f(\theta)
$
% \]
where $G(\theta)^{\dagger}$ denotes the Moore--Penrose pseudoinverse%
\commentout{%
	\footnote{Much of the literature assumes that the matrix $G$ is non-singular, and hence uses the inverse instead.  Many of our examples are non-singular for uninteresting reasons, and it suffices to use the pseudo-inverse here. 
	Much more can be said of the singular case; that is the domain of Singular Learning Theory \cite{slt}.}
}
of the matrix $G(\theta)$ (which, for the FIM, is the Fisher information matrix $\mathcal I(\theta)$)
and $\frac{\partial f}{\partial \theta} = [\frac{\partial f}{\partial \theta_1}, % \ldots, \frac{\partial U}{\partial \theta_i},
 \ldots, \frac{\partial f}{\partial \theta_n}]$ is the 
% gradient for the euclidean metric, 
(co)-vector of partials (i.e., the transpose of the gradient of $f$ in the Euclidean metric, which is sensitive to the choice of coordinates).
% which is the transpose of the derivative.
In the special case where $\Theta = \Delta W$ is itself the set of probability distributions over a finite set $W = \{1, \ldots, n\}$
and $P_\theta = \theta$,
% and  is the identity,
the simplex representation $\theta = P = (p_1, \ldots, p_n) \in \Theta$
(in which $\sum_{i} p_i = 1$ and $p_i \ge 0$),
yields
$\mathcal I(P) =  \mathrm{diag}(\frac{1}{p_1}, \ldots, \frac{1}{p_n})$.
%
%
%
% \TODO[ TODO: fix below: revisit examples; double check new \P\ below ]
%
% The previous paragraph, for those who didn't follow, explains how there is a natural way to take gradients that does not depend on the choice of representation. 
For readers who did not follow the details: we now have a representation-invariant way of calculating gradients. 
%

Let us now revisit the examples from \cref{sec:intro}.
% all of which are optimizing learners.
% \begin{itemize}[nosep,itemsep=1pt,left=0.5em]
\begin{itemize}[wide,itemsep=0pt,topsep=0pt]
\item The update process of \cref{ex:prob-simple} can be shown to be optimizing for log probability $\Bel(P, \phi) = \log P(\phi)$. In other words, it is about minimizing surprisal.

\item 
% Dempster's rule of combination \cref{ex:shafer}, 
In \cref{ex:shafer}, $\Lrn(\Bel, \alpha, \phi) = \Bel \oplus \Bel_{(\alpha,\phi)}$ is not optimizing; assuming that it is leads to a contradiction of Clairaut's theorem in the general case. 
However, in the special case where the belief state $\Bel = \Plaus \in \Theta$ is restricted to probability measures, $\Lrn$ is optimizing with objective $\Bel(\Bel, \phi) = \Bel(\phi)$, perhaps atoning for the clash of symbols.
%
This differs from \cref{ex:prob-simple} only by a strictly increasing monotone function, which is why the two update rules differ only by reparameterization.
This is also the \emph{Bayesian} objective, as we will see in \cref{sec:Bayes}.

\item The learner in \cref{ex:classifier} is, by definition, an optimizing learner for $\Bel(\theta, (x,y)) = - \ell(\theta, x,y)$ to minimize loss.

\item In \cref{ex:kalman1d},
	the field generated at $K = 0$ is the gradient of
	$\Bel((\hat x, \sigma^2), z) = \frac12 (\hat x-z)^2 + \sigma^4$.
\end{itemize}
% \subsubsection{Expected Utility Maximization Update Rules}
% \subsection{Expected-Value Optimizing Learners}
\textbf{Expected-Value Optimizing Learners.}
% \subsection{Boltzmann Update Rules}
%
% We now have a representation of $\Theta$, 
% Now that we have fixed a (Riemannian) metric on $\Theta$, 
% Now that we have a way of taking gradients (via the FIM)
Having fixed the geometry on $\Theta$, 
	there is a 1-1 correspondence between optimizing commitment flows (those that satisfy \cref{ax:lb-ascent}) 
	and loss functions $\mathcal L = -\Bel_\phi : \Theta \to \mathbb R$. 
% Among optimizing update rules for probabilistic beliefs, 
% there is a class of 
% It seems natural to the 
One class of such functions stands out as a natural starting point for our investigations:
	the linear ones 
	% $\mu \mapsto \sum_{x}^n \mu_i v_i$, 
	$P \mapsto \Ex_P[V]$, 
	that is,
	expectations of random variables $V : W \to \mathbb R$.
When $W = \{1, \ldots, n\}$, these functions are parameterized by vectors $\phi = V \in \mathbb R^n$. 
So, what learning procedure is induced by linear beliefs?


% It turns out that 
% \begin{equation}
\begin{linked}{prop}{boltz-expect-fields}
	Suppose $\Theta = \Delta W$
	and $\Phi$ consists of random variables $V: W \to \mathbb R$. 
	The flow form of the optimizing learner
	that has $\mathcal L = - \Bel(P, V) =  \Ex_{P}[V]$ 
	% $V_\phi : X \to \mathbb R$
	is 
	% the Boltzmann Learner for potential $V$. 
	% the \emph{Boltzmann Learner}. 
	\vspace{-1ex}
	\begin{align*}
		% \Boltz[U] &: (\mathbb R \times \Phi) \to \Delta\X \to \Delta\X \\
		&\Boltz
			% ^\beta_\varphi(P)
			(P, \beta, V)
			(w) :\propto
				P(w) \exp(-\beta V(w))
		% \Boltz[U](P, t, \phi)
		\commentout{\\
			&= A \mapsto \frac
				1{\Ex_{P} [ \exp(-\beta V)]}
				{\int \exp(-\beta V(x)) \mathbbm 1_A \mathrm d P(x) }
		}
		.
	\end{align*}	
\end{linked}


% We now turn to a related concept, of a \emph{Boltzmann Learner}. 
This is also known as the \emph{softmax} distribution (relative to the base measure $P$)
with logits $-V$ and temperature $1/\beta$.
Intuitively, larger confidence $\beta$
	reflects increasing certainty 
	% that the probability mass lies
	in states $w$ that have low potential $V(w)$. 
Indeed, using $\Boltz_V$ to update a distribution $P$ 
	with high confidence ($\beta \to \infty$) 
	conditions $P$ on the minimizer(s) of $V$.
% Committing to $V$ with confidence $\beta$ results in the Boltzmann distribution for the potential energy $V$ at inverse temperature $\beta$ 
% 	(relative to the base measure $\mu$ corresponding to your prior belief). 
% For this reason, we call it a \emph{Boltzmann learner}. 
So for this learner, confidence ``tempers'' the distribution 
and coincides with the concept of thermodynamic coldness.
% and Boltzmann rationality \cite{boltzmann-rationality}

% This is the additive form of a linear rule (using the confidence domain $[0, \infty]$).
% But what about the multiplicative one (using the domain $[0,1]$)? 





% If one views the function $U_\varphi : X \to \mathbb R$
% 	as a potential energy over $X$ induced by $\phi$, then
% 	$\Boltz [U]^\beta_\varphi(\Unif)$ is the Boltzmann distribution at inverse temperature 
% 		(thermodynamic coldness) 
% 		$\beta$.
	% and base measure $\mu$.
	% In this thermodynamic analogy, as temperature decreases, one becomes more certain that particles are in their most favorable states.
\commentout{%
In this thermodynamic analogy, one becomes more certain that particles are in their lowest energy states as temperature decreases. }

% $(\mu(X),\varphi) \mapsto \mu(X ~|~ \arg\min_x U(x,\varphi))$.

% Again, suppose we have a differentiable function $U : \Theta \times \Phi  \to \mathbb R$.
% Suppose further that we have a recovering a parameter that gives rise to a given probability distribution, i.e., a section of $\Pr$, or concretely,
% a function $\Pr^{-1} : \Delta\X \to \Theta$ such that $\Pr(\Pr^{-1}(\mu)) = \mu$ for all $\mu \in \Delta\X$.
%
% This time, define an update rule directly, by
%
% \begin{align*}
%     (\mathtt{Boltz} U)_\varphi^\beta(\theta)
%         &:\propto \Pr\nolimits_\theta \exp(-\beta U(\theta,\varphi))\\
%         &:= \Pr\nolimits^{-1}\bigg\{
%             A \mapsto \Pr\nolimits_\theta(A)  \, \frac
%             % {1}
%             { \exp(-\beta U(\theta,\varphi)) }
%             {\Ex_{\Pr_\theta}[\exp(-\beta U(\theta,\varphi))]}
%             % \int_A  \exp(-\beta U(\theta,\varphi)) \mathrm d\,\Pr\nolimits_\theta
%         \bigg\}
% \end{align*}
% Now suppose we have a potential function $U : X \times \Phi  \to \mathbb R$.

% Suppose, for each $\phi \in \Phi$, we have a potential function $U_\phi : X \to \mathbb R$ on the underlying set $X$.

\begin{linked}{prop}{Boltz-props}
	\begin{enumerate}[label=(\alph*), parsep=0pt,itemsep=0.5ex]
	\item $\Boltz$ satisfies \cref{ax:zero,ax:seq-for-more,ax:cont-and-smooth,ax:acyclic,ax:combinativity}.
	% \item Updates $\Boltz[U]^\beta_\phi : \Delta X \to \Delta X$ are invertible and commutative.
	\item $\Boltz$ updates commute and are invertible iff $\beta < \infty$.
	\item $\Boltz_{U \oplus V} = \Boltz_{U+V}$\ .
	\item $\Boltz_{V_1}^{\beta_1} \circ \cdots \circ \Boltz_{V_n}^{\beta_n} (P) = \Boltz(P, 1, \smash{\sum\limits_{i=1}^n \beta_i V_i})$.
    % \item Boltzman updates satisfy the symmetry axiom (\cref{ax:symmetry}) for
    %     $\mathrm{Aut}(\Delta \mathcal H, \mathcal X)
    %         := \{ 
    %             \sigma : 
    %         \}
    %     $
	% \item The pair \cref{ax:monotone,ax:truth-is-enough,ax:effectiveness,ax:lb-ascent} 
	\end{enumerate}
\end{linked}

\vnew{%
% Observe that everything is linear and commutative. 
Observe how well-behaved these learners are: any sequence of observations in any order is equivalent to a single observation of their weighted sum.
This property may come at a significant cost, however: learning in brains and artificial neural networks exhibits a recency bias, an effect which is arguably optimal for bounded agents
	\citep{wilson2014bounded,fudenberg2014learning}, or in changing environments.
% Recall, however, that learning in brains and artificial neural networks is
}


% [Theorem: correspondence with Shafer]
