%%%%%%%

\section{PROOFS OF MAIN RESULTS}

We begin with the claims of the main (i.e., numbered) results. 
For convenience, we repeat the statements of the propositions before proving them. 

\recall{prop:az-iso}
% \textbf{Proposition 1.~}{\it
% 	The fractional domain $[0,1]$ the additive domain $[0,\infty]$ are isomorphic.
% 	Furthermore, the space of isomorphisms between them is in natural bijection with $(0,\infty)$.
% 	Specifically, for each $\beta \in (0,\infty)$, there is an isomorphism $\varphi_\beta : [0,1] \to [0,\infty]$ given by $\varphi_\beta(s) = -\frac1\beta \log(1-s)$ with inverse $\varphi_\beta^{-1}(t) = 1- e^{-\beta t}$.
% }
\begin{lproof}\label{proof:az-iso}
    Clearly  $\varphi_\beta$ and $\varphi_\beta^{-1}$ are continuously differentiable, and one can verify with a few steps of simple algebra that the two are inverses. In both cases, the only possible wrinkle is at the point of highest confidence, but there are no problems there either, because:
    \[
        \lim_{s \to 1} \varphi_\beta(s) = \frac1\beta \lim_{s \to 1} \log \Big(\frac{1}{1-s}\Big) = \infty
        \qquad\text{and}\qquad
        \lim_{t \to \infty} \varphi_\beta^{-1}(t) = \lim_{t \to \infty} 1- e^{-\beta t} = 1.
    \]
    
    Next, we show that $\varphi_\beta$ preserves the structure of the confidence domain. 
    We just saw that $\varphi_\beta$ and $\varphi_\beta^{-1}$ preserve the top element $\top$ of both confidence domains.  It is even more immediate that they preserve the bottom element.
    %
    It is also easy to see that both functions preserve the order (i.e., are monotonic).  For example, $\frac{\mathrm d}{\mathrm d s} \varphi_\beta(s) = \frac1{\beta(1-s)} \ge 0$. 

    Next we show that $\varphi_\beta$ and its inverse preserve independent combination $(\cseq)$.
    For $a, b \in [0,1]$, we have
    \begin{align*}
        \varphi_\beta(a \cseq b)
            &= \varphi_\beta(a + b - ab) \\
            &= -\frac1\beta \log(1 - a - b + ab) \\
            &= -\frac1\beta \log((1 - a)(1-b)) \\
            &= -\frac1\beta \log(1 - a) - \frac1\beta\log(1-b) \\
            &= \varphi_\beta(a) + \varphi_\beta(b).
    \end{align*}
    A similar calculation shows, for all $t, u \in [0,\infty]$, that
    \begin{align*}
        \varphi_\beta^{-1}(t) \cseq \varphi_\beta^{-1}(u)
            &= 1 - e^{-\beta t} + 1 - e^{-\beta u} - (1 - e^{-\beta t})(1- e^{-\beta u}) \\
            &= 2 - e^{-\beta t} - e^{-\beta u} - 1 + e^{-\beta t} + e^{-\beta u} - e^{-\beta(u+t)} \\
            &= 1 - e^{-\beta(u+t)} \\
            &= \varphi_\beta^{-1}(u + t). 
    \end{align*}

    Finally, we must show that these are the only isomorphisms between the two confidence domains.
    For this, we refer to a standard argument that is most directly seen as the solution to Cauchy's exponential functional equation $g(x + y) = g(x) g(y)$ after the change of variables $g = 1-f$.
    
    A similar argument is provided by \citet{shannon1948mathematical} in defense of entropy, and a much more direct analogue appears in the form we need by \citet{shafer1976mathematical}, who shows directly that every continuous mapping of $[0,1]$ to $[0,\infty]$ for which multiplication becomes addition in this way must be of the form $s\mapsto - k \log(1-s)$, for some $k > 0$. 
\end{lproof}


%%%%%%%%%%%%% PROPOSITION 2 %%%%%%%%%%%%%%%%
\recall{prop:no-continuous-condition-ext}
% \textbf{Proposition 2.~}{\it
% 	There exists no continuous function $\Lrn_\phi : \Delta W \times [0,1] \to \Delta W$ 
% 	with the property that 	$\Lrn_\phi(\mu, 1) = \mu|\phi$ when $\mu(\phi) > 0$. 
% }
\begin{lproof}\label{proof:no-continuous-condition-ext}
    % In search of a contradiction, suppose otherwise.
    % That is, 
    Fix a non-empty subset $\phi \subseteq W$ and consider a function $F : \Delta W \times [0,1] \to \Delta W$
    such that $F(\mu, 0) = \mu$ and $F(\mu,1) = \mu | \phi$ whenever $\mu(\phi) > 0$. 
    Our aim is to show that $F$ cannot be continuous. 

    Fix a distribution $\mu_0 \in \Delta W$ with the property that $\mu_0(\phi) = 0$. 
    For each $\delta > 0$, consider the set
    \[
        B_\delta(\mu_0)
            := \{ \mu \in \Delta W : \mathrm{TV}( \mu,  \mu_0) < \delta \}
            = \{ (1-\delta) \mu_0 + \delta P \}_{P \in \Delta W}
    \]
    of distributions within $\delta$ total variation distance
        % \footnote{$\mathrm{TV}(P,Q) := \sup_{A \subseteq W} |P(A) - Q(A) |$}
    of $\mu_0$.
        % (Recall that the total variation distance is given by $\mathrm{TV}(P,Q) := \sup_{A \subseteq W} |P(A) - Q(A) |$.)
    %
    By assumption, $F(-, 1)$ updates by conditioning on $\phi$, which means all mass not on $\phi$ is removed, and the rest is renormalized. More precisely, this means $F((1-\delta) \mu_0 + \delta P, 1) =  P$ for all $\delta \in (0,1)$, and thus
    the image of $B_\delta(\mu_0)$ under $F$ is all of $\Delta W$. 
    Therefore, for every $\epsilon \in (0, 1)$,
    %  say $\epsilon = 0.5$ for definiteness, 
    there cannot be $\delta > 0$ such that $\mu \in B_\delta(\mu_0)$ 
        implies $F(\mu, 1) \in B_{\epsilon}( F(\mu_0, 1) )$. 
    Thus $F$ cannot be continuous. 
\end{lproof}

%%%%%%%%%%%%% PROPOSITION 3 %%%%%%%%%%%%%%%%
\recall{prop:maximal-continuous-theta}
% \textbf{Proposition 3.~}{\it
% 	For all $\phi \in \Phi$, 
% 	there is a maximal open set 
% 	$\Theta_\phi \subseteq \Theta$ such that the restriction
% 	$
% 	\Lrn_{\phi} |_{\Theta_\phi} : 
% 		[\bot,\!\top) \times \Theta_\phi \to \Theta
% 	$		
% 	of $\Lrn_\phi$ to $\Theta_\phi$ is continuous. 	
% }
\begin{lproof}\label{proof:maximal-continuous-theta}
    As noted in the main text, the observation $\phi$ is not mathematically relevant to the argument; to simplify notation, we work with the commitment function $F := \Lrn_\phi : \Theta \times [\bot, \top] \to \Theta$. 
    In this context, the belief space $\Theta$ and confidence domain $\confdom$ both implicitly have topologies. Let $\tau \subseteq 2^\Theta$ denote the topology associated with $\Theta$ 
        (i.e., the collection of all open subsets of $\Theta$). 
    Given $U \subseteq \Theta$, we use the standard notation $F|_U$ to denote the restriction of the function $F$ to domain $U \times \confdom$. 

    By assumption (L2), for each fixed $\theta \in \Theta$, the function $F_\theta : \confdom \to \Theta$ is continuous. 
    Let 
    \[
        \mathcal U :=  \Big\{ U \in \tau  ~\Big|~
            F|_U : U \times \confdom \to \Theta \text{ is continuous }
            \Big\}
    \]
    be the set of all open subsets of $\Theta$ on which the restriction of $F$ is continuous. 
    Since unions of open sets are open, 
    we know that $\Theta_\phi := \bigcup \mathcal U \subseteq \Theta$ is open.
    We now show that it is the maximal open set on which $F$ is continuous, as promised by the proposition.
   

    Recall that a function $f : X \to Y$ is continuous iff the preimage $f^{-1}(V) = \{ x \in X : f(x) \in V\}$ of an open set $V \subseteq Y$ is itself an open set. 
%
    Given $V \subseteq \Theta$, observe that
    \begin{align*}
        (\theta, \chi) \in (F|_{\Theta_\phi})^{-1}(V)
        &\iff  \quad \exists U \in \mathcal U.~ \theta \in U ~\text{ and }~ F(\theta, \chi) \in V \\
        &\iff \quad \exists U \in \mathcal U.~  (\theta, \chi) \in (F|_U)^{-1}(V)  \\
        & \iff (\theta, \chi) \in \bigcup_{U \in \mathcal U} (F|_U)^{-1}(V). 
    \end{align*}
    In other words, we have shown that $(F|_{\Theta_\phi})^{-1}(V) = \bigcup_{U\in \mathcal U} (F|_U)^{-1}(V)$.
   
%
    % From the conclusion of the previous paragraph, it
    It follows that the preimage $(F|_{\Theta_\phi})^{-1}(V)$ of an open set $V \subseteq \Theta$ is a union of open sets (since each $F|_U$ with $U \in \mathcal U$ is continuous by the definition of $\mathcal U$), and hence itself open.
    Therefore $F|_{\Theta_\phi}$ is continuous, and since $\Theta_\phi$ contains every other open set satisfying that property, it is the maximal such open set. 
\end{lproof}

% \textbf{Note.} The version stated in the text has two typos. 

We will return to \cref{theorem:add-reparam} in \cref{sec:proof-addrep}.
Previously, the following result was in the main text, but we no longer believe it important to state formally; we give it again here for completeness, as it still supports the discussion in \cref{sec:vecrep}.

%%%%%%%%%%%%%%%%%%%%  PROPOSITION 5 %%%%%%%%%%%%%%%%%%
\begin{linked}{prop}{at-most-one-flow}
	If $\Lrn$ is a commitment flow and $\phi_1, \phi_2 \in \Phi$,
	% then there is a unique function
	then there is at most one commitment flow
	$\Lrn_{\phi_1 \oplus \phi_2}
	 	: [0, \infty] \times \Theta \to \Theta$
	such that
	$\Lrn'_{\phi_1 \oplus \phi_2} = \Lrn'_{\phi_1} + \Lrn'_{\phi_2}$.
\end{linked}
% \recall{prop:at-most-one-flow}
\begin{lproof} \label{proof:at-most-one-flow}
    Most of the work is done by an important result in differential geometry: 
    
    \begin{fact}[The Fundamental Theorem on Flows]
        If $X \in \mathfrak X(\Theta)$ is a smooth vector field, then
        there is a unique function
        $f : \mathcal D \to \Theta$
        where $\mathcal D \subseteq \mathbb R \times \Theta$ is maximal,
        satisfying
        $
        % \[
            f(a, f(b, \theta)) = f(a+b,\theta)
        $
        whenever $(a+b, \theta) \in \mathcal D$,
        and 
        $
            \frac{\partial}{\partial t}
                 f(t,\theta)
                % \underset{\chi=0}|
                |_{t{=}0}
                \!\!= X(\theta)
            % \]
        $
        % for all $\theta \in \Theta$ and $a,b\ge 0$.
        for all $(t,\theta) \in \mathcal D$. 
        \label{fact:unique-integral-curves}
    \end{fact}

    The statement above is a gloss and selective restatement of the statement of the result as presented by \citet[][Theorem 9.12]{lee2013smooth}, which inlines the definition of a flow (Equations 9.6 and 9.7).
    % and bypaseses the discussion about flow domains.
    %
    A further alteration: we are interested in a minor variant in which the vector field $X$ and the function of interest are not necessarily smooth (i.e., infinitely differentiable), but rather merely twice differentiable $(C^2)$. As discussed in Appendix C of Lee and more directly treated by \citet[\S4.1]{abraham2012manifolds}, precisely the same techniques suffice to establish the analogous result without assuming smoothness.

    Applying the $C^k$ analogue of \cref{fact:unique-integral-curves} to the vector field $X = \Lrn'_{\phi_1\oplus\phi_2} = \Lrn'_{\phi_1} + \Lrn'_{\phi_2}$, we find that
    there is a unique flow $F : \mathcal D \to \Theta$ whose derivative is $X$ and whose domain $\mathcal D \subseteq \mathbb R \times \Theta$ is maximal. 
    %
    % What remains is to show that $[0,\infty) \subseteq \mathcal D$. 
    Thus, 
    there is at most one function satisfying \cref{ax:zero,ax:combinativity,ax:cont-and-smooth}, and hence at most one commitment flow.
    % The primary missing piece is that we are no longer guaranteed that there is a well-defined limit as $t \to \infty$. \qedhere 
    The primary missing piece is that the resulting flow may no longer be \emph{complete}---following the sum of the two fields may ``leave'' the manifold $\Theta$ in finite time, and, even if it stays within the manifold, it may exhibit cyclic behavior, violating \cref{ax:acyclic} or standing in the way of a well-defined continuous completion at the limit $t \to \infty$.
    \qedhere 
        
    % We also need to verify \cref{ax:acyclic} and \cref{ax:ineq-witness}. 
    % \cref{ax:acyclic} follows from being the integral of a flow, 
    % \TODO 

    % \begin{coro}
    %     If $\Lrn_{\phi_1}$ and $\Lrn_{\phi_2}: \Theta \times [0,1] \to \Theta$ are distinct,
    %     then so are $\Lrn'_{\phi_1}$ and $\Lrn'_{\phi_2}
    %     $.
    %     \label{fact:unique-flow-for-vfield}
    % \end{coro}
    
\end{lproof}


%%%%%%%%%%%%%%%%%%%%  PROPOSITION 6 %%%%%%%%%%%%%%%%%%
\recall{prop:linterleave}
\begin{lproof}\label{proof:linterleave}
    % Taking the derivative with respect to $\chi$, we have
    % \begin{align*}
    %     % \frac{\partial}{\partial \chi} \Big[ \Lrn^\chi \Big]
    %     \frac{\partial}{\partial \chi} \Big[ \lim_{n \to \infty} L_{\chi/n}^{(n)}(\theta) \Big]
    %         % &= \lim_ 
    % \end{align*}
    $\Lrn_{\phi_1\oplus\phi_2}^\chi(\theta)$ is, by definition, the result of integrating a vector field from $t=0$ to $t=\chi$. 
    That integration can be thought of as a process of taking (infinitely) many (infinitesimal) sequential steps in the direction of that field. 
    
    In the limit as $\epsilon\to 0$, 
    \[ 
    \Lrn_{\phi_1 \oplus \phi_2}^{\epsilon}(\theta_0) = \theta_0 + \epsilon \Lrn'_{\phi_1\oplus\phi_2} = \theta_0 + \epsilon \Lrn'_{\phi_1} + \epsilon \Lrn'_{\phi_2}
    \]
    can be viewed as a small linear addition to the original position (in any choice of local coordinates). 
    Yet by the same approximation, this is also what results from an infinitesimal update of $\Lrn_{\phi_1}$ followed by $\Lrn_{\phi_2}$, which equals $L_\epsilon(\theta)$!
    As $\epsilon \to 0$, the Euler integration method of the field $\Lrn'_{\phi_1\oplus\phi_2}$ starting at $\theta$ from $t=0$ to $t=\chi$ with step size $\epsilon$, which equals $\Lrn^\chi_{\phi_1\oplus\phi_2}(\theta)$, is actually calculating $\lim_{n\to \infty} L^{(n)}_{\chi/n}(\theta)$. Therefore the two quantities are equal.
\end{lproof}

%%%%%%%%%%%%%%%%%%% PROPOSITION 7 %%%%%%%%%%%%%%%%%
% \textbf{Proposition 7.~}{\it
% 	Suppose $\Theta = \Delta X$
% 	% and $\Phi$ consists of random variables $X \to \mathbb R$. 
% 	and $\Phi = \{ V : X \to \mathbb R\}$ consists of random variables over $X$. 
% 	The flow form of the optimizing learner
% 	that has $\Bel(\mu, V) =  \Ex_{\mu}[V]$ 
% 	% $V_\phi : X \to \mathbb R$
% 	is 
% 	% the Boltzmann Learner for potential $V$. 
% 	% the \emph{Boltzmann Learner}. 
% 	\vspace{-1ex}
% 	\begin{align*}
% 		% \Boltz[U] &: (\mathbb R \times \Phi) \to \Delta\X \to \Delta\X \\
% 		&\Boltz[U]
% 			% ^\beta_\varphi(\mu)
% 			(\mu, \beta, \phi)
% 			(x) :\propto
% 				\mu \exp(-\beta V(x))
% 		% \Boltz[U](\mu, t, \phi)
% 		\commentout{\\
% 			&= A \mapsto \frac
% 				1{\Ex_{\mu} [ \exp(-\beta V)]}
% 				{\int \exp(-\beta V(x)) \mathbbm 1_A \mathrm d\mu(x) }
% 		}
% 		.
% 	\end{align*}	
% }
\recall{prop:boltz-expect-fields}

\begin{lproof} \label{proof:boltz-expect-fields}
    First, we calculate the vector field given by the gradient of $\Bel(\mu, V) = \Ex_\mu[V]$ in  the natural (Fisher) geometry for $\Theta = \Delta X$.
    % the vector field form $\Lrn$ is given by
    \begin{align*}
        % X(\mu) &=
         \hat\nabla_\mu \Bel(\mu, V) 
        &= \hat\nabla_\mu \Ex_\mu[V] \\
        &= \mathcal I(\mu)^{-1} ( \nabla_\mu \Ex\nolimits_\mu[V] - \lambda \mathbf 1) \\
    \intertext{ where $\lambda$ is the Lagrange multiplier associated with the constraint $g(\mu) = \sum_{x} \mu(x) - 1 = 0$, which has gradient $\nabla_\mu g(\mu) = \mathbf 1$.  The field is therefore given by}
        &= \left[
            %  \frac{1}{\mu(x)}
            \mu(x)
            \frac{\partial}{\partial \mu(x) } \Ex_\mu[V] - \lambda \mu(x)
        \right]_{x \in X} \\
        &= 
        x \mapsto \mu(x) (V(x) - \lambda)
    \end{align*} 
    for some constant $\lambda$.  We can solve for $\lambda$ with the observation that the result must yield a vector tangent to the probability simplex, i.e., the sum across all components must equal zero; thus $\sum_{x \in X} \mu(x)( V(x) - \lambda) = \Ex_\mu[V] - \lambda = 0$, and so we must have $\lambda = \Ex_\mu[V]$. Therefore,
    \begin{align*}
        \hat\nabla_\mu \Bel(\mu, V) &= x \mapsto \mu(x) (V(x) - \Ex\nolimits_\mu[V]) \\
        &= \mu \odot ( V - \Ex\nolimits_\mu[V] ),
    \end{align*}
    where $\odot$ is used to emphasize that it is an element-wise product between vectors. 
    
    At the same time, we can calculate the path velocity of the Boltzmann update rule.
    Letting $Z := \Ex_\mu[ \exp(- \beta V) ]$ be the normalization constant, 
    % \begin{align*}
    $
        \frac{\partial Z}{\partial \beta} 
        = \Ex_\mu\left[ \frac{\partial}{\partial\beta} \exp(-\beta V)\right]
        = \Ex_\mu[ -V \exp(-\beta V) ]  
    $. 
    % \end{align*}
    Keeping that in mind, we can calculate: 
    \begin{align*}
        \frac{\partial}{\partial \beta} \Boltz[V](\mu, \beta) \,\Big|_{\beta=0}
        % &= x \mapsto  \frac{\partial}{\partial \beta} \Big[ \frac{\mu(x) \exp(-\beta V(x))}{\Ex_\mu[\exp(-\beta V)]}\Big] \\
        &= x \mapsto  \frac{\partial}{\partial \beta} \Big[ \frac{\mu(x) \exp(-\beta V(x))}{\Ex_\mu[\exp(-\beta V)]}\Big] \\
        &= x \mapsto \mu(x) \frac{\partial}{\partial \beta} \Big[ \exp(-\beta V(x)) \Big]_{\beta=0} + \mu(x) \exp(-\beta V(x)) \frac{\partial}{\partial \beta} \Big[ \frac1Z \Big]_{\beta=0}
        \\
        &=  x \mapsto \mu(x) \exp(-\beta V(x)) \Big( -V(x) +  \frac{\partial}{\partial \beta} \Big[ \frac1Z \Big]_{\beta=0} \Big) \Big|_{\beta=0}
        \\
        &=  x \mapsto \mu(x) \Big( -V(x) - \frac{1}{Z^2} \frac{\partial Z}{\partial \beta} \Big)
        \\
        &=  x \mapsto \mu(x) \Big( -V(x) - \frac{\Ex_\mu[ -V \exp(- \beta V) ]}{\Ex_\mu[ \exp(- \beta V) ]^2} \Big|_{\beta=0} \Big)
        \\
        &=  x \mapsto \mu(x) ( -V(x) - \Ex\nolimits_\mu[ -V])
        \\
        &= \mu \odot ( \Ex\nolimits_\mu[V] - V)
        .
    \end{align*}
    % As a sanity check, note that the sum over all components is
    % \[ \sum_{x \in X} ((\Boltz[V])'_\varphi\, \theta)_x
    %         = \sum_{x \in X} p(x) (\Ex\nolimits_p[g] - g(x))
    %         = \Ex\nolimits_p[ \Ex\nolimits_p [ g ]] - \Ex\nolimits_p [g] = 0,
    %     \]
    %     so indeed it lies within the tangent space.
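    Up to sign, this is the same field as before: the Boltzmann update follows the negative natural gradient of $\Ex_\mu[V]$, i.e., it descends the potential. Hence \cref{prop:at-most-one-flow} tells us that $\Boltz_V$ is the unique flow representation of the optimizing learner with potential $\Ex_\mu[V]$. 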
\end{lproof}

%%%%%%%%%%%%%%%%%%% PROPOSITION 8 %%%%%%%%%%%%%%%%%
\recall{prop:Boltz-props}
\begin{lproof}\label{proof:Boltz-props}
    (a) L1 and L2 are obvious. 
    L4 follows from the fact that (as shown in \cref{prop:boltz-expect-fields}), the field is the gradient of a potential, and so it cannot have closed integral curves. 
    L5 is actually part (c), and L3 follows from L5 and the fact that adding numbers makes them larger.

    (b) 
    % This also follows from part (c). 
    Boltzmann updates commute because 
    \[ 
        \Boltz_u^{\beta_1} \circ \Boltz_v^{\beta_2}(\mu)
            \propto \mu \exp(-\beta_1 u) \exp(-\beta_2 v)
            =\mu \exp(-\beta_2 v)\exp(-\beta_1 u) 
            \propto \Boltz_v^{\beta_2}\circ\Boltz_u^{\beta_1}(\mu).
    \]
    If $\beta < \infty$, the update $\Boltz_u^{\beta}$ can be inverted by $\Boltz_{k-u}^{\beta}$ where $k$ is any constant. 
    If $\beta = \infty$, then it amounts to conditioning, and hence is not invertible. 

    (c)
    Adding the vector fields discovered in the proof of \cref{prop:boltz-expect-fields},
    \begin{align*}
        \Boltz'_{u \oplus v} 
        &= \Boltz'_u  + \Boltz'_v \\
        &=  \mu \odot (\Ex\nolimits_\mu[u] - u) + \mu \odot (\Ex\nolimits_\mu[v] - v) \\
        &= \mu \odot (\Ex\nolimits_\mu[u+v] - (u+v)) \\
        &= \Boltz'_{u+v}.
    \end{align*}

    (d) Slightly generalizing the calculation of part (b):
    \begin{align*}
        \Boltz_{v_1}^{\beta_1} \circ \cdots \circ \Boltz_{v_n}^{\beta_n} (\mu)
            &\propto \mu \prod_{i=1}^n \exp(- \beta_i v_i) \\
            &\propto \mu \exp\Big( - \sum_{i=1}^n \beta_i v_i \Big) \\
            &\propto \Boltz^{1}_{\sum\limits_{i=1}^n \beta_i v_i}(\mu).
    \end{align*}
\end{lproof}

%%%%%%%%%%%%%%%%%%% THEOREM 9 %%%%%%%%%%%%%%%%%

\recall{prop:Boltz-Bayes}
\begin{lproof}\label{proof:Boltz-Bayes}
    One direction is easy: if $\Lrn$ is Bayesian with likelihood $P(\,\cdot\mid\cdot\,)>0$, then belief states are probability distributions, and so for $\star := \beta = 1$, a Bayesian update with likelihood $P(X \mid H)$ can be written as
    \begin{align*}
        P_{\Lrn(\theta,\star,\phi)}(h) &\propto P_\theta(h) \cdot P(\phi \mid h) \\
            &\propto P_\theta(h) \cdot \exp( \log P(\phi \mid h)),
    \end{align*} 
    and so coincides with the Boltzmann update with confidence 1 and potential $- \log P(\phi \mid h)$. This simple well-known fact is largely responsible for the prevalence of ``tempering'' and exponential families in the Bayesian literature. In effect, it just converts between the additive and multiplicative domains. 

    The opposite direction is less well-known, and considerably less intuitive.
    We cannot simply invert the construction above, because, owing to the fact that probabilities are constrained to sum to one, not every potential can be obtained by the logarithm of a conditional probability in this way.
    However, we can circumvent this by choosing a new measurable space $\mathcal X$.

    Concretely, suppose we are given a potential $u : \Phi \times \mathcal H \to [0, \infty)$. 
    In this case, define $X$ to be a variable that can take on values in $2^\Phi$, and define the likelihood $P(X | h)$ according to:
    \[
        P(X{=}A \mid h) := \prod_{\phi \in A} \exp(-u(\phi,h)) \prod_{\phi \in \bar A} (1-\exp(-u(\phi, h))).
    \] 
    It is not hard to see that this implies
    \[
        P(X{\supseteq}A \mid h) = \prod_{\phi \in A} \exp(-u(\phi,h)) = \exp(-\sum_{\phi \in A} u(\phi, h)).
    \]
    By viewing an observation $\phi$ as the event $X \supseteq \{\phi\}$, we now have an event whose (strictly positive) likelihood corresponds to the potential $u(\phi, -)$.
    This establishes the reverse direction of the theorem.
\end{lproof}


\commentout{%
    \recall{prop:continuum-seqacyc}
    \begin{lproof} \label{proof:continuum-seqacyc}
        Assume that $\confdom$ is a continuum (i.e., totally ordered, one-dimensional, and connected).
        Furthermore, assume that $F : \confdom \times \Theta \to \Theta$ satisfies \cref{ax:zero,ax:cont-and-smooth,ax:combinativity}.
        
        To establish \cref{ax:seq-for-more}, 
        suppose $\chi_1 \le \chi_2$, and choose $\theta \in \Theta$.

        By \cref{ax:combinativity}, we must find $\chi'' \in (\bot, \chi']$ such that
        $F(\chi'' \cseq \chi, \theta) = F(\chi', \theta)$. 
        
        By \cref{ax:zero}, we know that $F(\bot, \theta) = \theta$, and by \cref{ax:cont-and-smooth}, we know that $F_\theta |_{[\bot,\chi']} : [\bot,\chi']$ is a continuous path from $\theta$ to $F(\chi',\theta)$ that passes through $F(\chi,\theta)$ (since $\chi < \chi'$, and $\confdom$ is totally ordered and connected).  
    \end{lproof}
}%



\subsection{The Additive Representation Theorem}
    \label{sec:proof-addrep}
The proof of \cref{theorem:add-reparam} is a bit more technical than the others. 
We will first need a technical result about differential geometry.
In this section we assume that $\confdom$ is a continuum (a one-dimensional, totally ordered confidence domain),
and that  $F : \confdom \times \Theta \to \Theta$ is a commitment function (satisfying \cref{ax:zero,ax:cont-and-smooth,ax:seq-for-more,ax:acyclic,ax:combinativity}).

Now a few definitions. A point $p = (\chi,\theta) \in \confdom \times \Theta$
is called \emph{active} if $\frac{\partial F}{\partial \chi}|_p \ne 0$. 
$p$ is a \emph{submersion point}, or \emph{submersive}, if $d F|_p : T_p(\confdom \times \Theta) \to T_{F(p)}\Theta$ is surjective. 
(That is, if $F$ is a submersion at $p$.)

\begin{lemma}\label{lem:active-sub}
    For all $\theta \in \Theta$, if there exists an active point $p$ in the fiber $F^{-1}(\theta)$, then there also exists an active point $\hat p$ in the fiber that is a submersion point. 
\end{lemma}
\begin{proof}
    For the sake of contradiction, suppose otherwise---that $p^* = (\chi^*, \theta_0) \in F^{-1}(\theta)$ is an active point in the fiber $F^{-1}(\theta)$, but no submersion point in the fiber is active (i.e., $\pd F \chi |_{p} = 0$ for every submersion point $p$ in the fiber). 
    % Suppose, for contradiction, that the lemma is false
    % Fix $\theta \in \Theta$, and suppose that there exists is an active point
    % $p^* = (\chi^*, \theta_0) \in F^{-1}(\theta)$ in the fiber $F^{-1}(\theta)$.
    
    Select a sequence of strictly increasing confidences $(\chi_n) \in \confdom^{\mathbb N}$ that approach $\chi^*$ from below. (So $(\chi_n) \to \chi^*$.)
    %
    For each $n$, define $\theta_n := F(\chi_n, \theta_0)$. Since $F$ is continuous, $(\theta_n) \to F(\chi^*,\theta_0) = \theta$. 
    By \cref{ax:seq-for-more}, since $\chi_n < \chi^*$, we are guaranteed that there exists some $\delta_n \le \chi^*$ such that $F(\delta_n, F(\chi_n, \theta_0)) = \theta$, which we use to define the sequence $(\delta_n)_{n \in \mathbb N}$.  
    Defining $p_n := (\delta_n, \theta_n)$ gives a sequence of points, each lying in the fiber $F^{-1}(\theta)$ owing to the definitions of $\theta_n$ and $\delta_n$. Note that $(p_n) \to (\delta_{\lim}, \theta)$. 
    Since $\confdom$ is homeomorphic to an interval, it is bounded, so by the Bolzano-Weierstrass theorem, $(\delta_n)$ has a convergent subsequence; let $(\delta_m)$ be such a subsequence limiting to the smallest possible value (i.e., $\lim_{m\to\infty} \delta_m = \liminf_{n \to \infty} \delta_n =: \delta_{\lim}$).

    Define also the sequence $(q_n = (\chi_n, \delta_n))_{n \in \mathbb N}$.
    Intuitively, each $q_n = (\chi_n,\delta_n)$ is a different way of splitting up the effective total confidence $\chi_n \cseq \delta_n \cong \chi^*$.
    
    % Since $p_n \to (\delta_{\lim}, \theta)$, by continuity of $F$ (\cref{ax:cont-and-smooth}), 
    %     We know that $F(\delta_{\lim},\theta) = \theta$. 
    % By \cref{ax:zero}, $F(\bot, \theta) = \theta$.
    % Since $\bot \le \theta$,  \cref{ax:acyclic} tells us that for all $\chi \in [\bot, \delta_{\lim}]$,
    %     $F(\chi,\theta) = \theta$. This implies that $\pd F \chi = 0$ at these points, and in particular, 
    %         $\pd F \chi(\delta_{\lim}, \theta) = 0$. 
    % As $F(\delta_{\lim}$ 
    % $\pd F\chi$
    
    % \textbf{Case 1: $\delta_{\lim} = \bot$.}
    Intuitively, as $\chi_n$ approaches $\chi^*$, the remaining residual confidence $\delta_n$ required to effectively get there should decrease to $\bot$. 
    % This is the primary case, which we cover first. 
    Indeed, 
    \[
        \lim_{n\to \infty} \delta_n = \lim_{n\to \infty}s(\chi^*,\chi_n) = s(\chi^*, \lim_{n\to\infty}\chi_n)
         = s(\chi^*,\chi^*) = \bot. 
    \]
    
    The point $p_\bot = (\bot, \theta)$, which is obviously in the fiber $F^{-1}(\theta)$, is a submersion point---since $F(\bot, \,\cdot\,) = \mathrm{id}_\Theta$ is the identity map on $\Theta$, it follows that $\pd F\theta|_{p_\bot}$ is the identity map on $T_\theta \Theta$ (i.e., the identity matrix in any coordinate representation). 
    This is a sufficient condition for the differential of $F$ to be surjective at this point, even if the derivative with respect to $\chi$ is zero. 
    Furthermore, since the set of invertible matrices is open and $F$ is $C^1$ along the line $\{\bot\}\times\Theta$, it follows that any point sufficiently close to that line (i.e., with small enough value of $\chi$) will be a submersion point as well. 
    
    Define the function $H(\chi, \delta) := F(\delta, F(\chi,\theta_0)) : \confdom^2 \to \Theta$, whose utility we will see shortly. 
    The level set $H^{-1}(\theta)$ consists of confidence pairs $(\chi,\delta)$ for which $F(\delta \cseq \chi,\theta_0) = \theta$, i.e., for which sequential application leads to our target. 
    At the point $q_n$, what direction keeps us within this set? 
    Taking the differential of $H$ at the point $q_n$, by the chain rule, we find: 
    \begin{equation}
        dH|_{q_n} (v) 
            = v_\delta \Big( \pd F\chi{(\delta_n,\theta_n)} \Big) + v_\chi \Big( \pd F\theta {(\delta_n,\theta_n)}  \pd F\chi{(\chi_n,\theta_0)} \Big),
            \label{eq:dH}
    \end{equation}
    for a vector $v = v_\delta \pd{}\delta + v_\chi \pd{}\chi \in T_{q_n}\confdom^2$ tangent to $\confdom^2$ at $q_n = (\chi_n, \delta_n)$.
    We are looking for vectors in the kernel of $dH|_{q_n}$ (i.e., for which $dH|_{q_n}(v) = 0$); these are the ones that lie tangent to the level set of interest.
    \unskip\footnote{
    In more detail: since this differential has constant rank at a neighborhood of the limiting point (as we are about to show), the points lie on a smooth sub-manifold, by the constant rank level subset theorem.
    That submanifold is a one-dimensional curve the primary argument in the proof of \cref{theorem:add-reparam}---from \cref{ax:seq-for-more,ax:cont-and-smooth,ax:combinativity}, it follows that all $\pd F \chi$.
    }
        % \footnote{It can be shown that this }
    Remarkably, this relates the conditions of activeness and submersiveness at $p_n$ to activeness at the point $p^*$, which was guaranteed by assumption!
    \begin{itemize}
        \item  By our assumption that $p^* = (\chi^*,\theta_0)$ is active, the derivative
            $\pd F \chi|_{(\chi^*,\theta_0)} =: v^*$ exists and is a nonzero tangent vector;
            moreover, that nonzero value is the limit of the sequence $(\pd F\chi(\chi_n, \theta_0) )_{n \in \mathbb N}$.
            Therefore, for $\epsilon > 0$ there exists an integer $N_1$ for which $\pd F\chi(\chi_n, \theta_0)$ is within $\epsilon$ of $v^*$ (for any choice of coordinates) for all $n > N_1$.
        \item Since $\delta_{\lim} = \bot$, we know that $(p_n) = (\delta_n,\theta_n) \to (\bot,\theta)$. Therefore, there exists an integer $N_2$ for which $n > N_2$ implies $p_n$ is in a neighborhood of $p_\bot$ where $\pd F \theta$ is within $\epsilon$ of the identity matrix (say for the same choice of coordinates and $\epsilon$) and in particular invertible.
        %
        Therefore, $p_n$ is submersive; since we assumed for contradiction that there are no active submersive points in the fiber, we must conclude that $\pd F\chi(p_n) = \pd F\chi(\delta_n,\theta_n) = 0$. 
        So the first term of \eqref{eq:dH} is zero.
    \end{itemize}
    From these two observations, we deduce that, for all $n > \max(N_1, N_2)$, 
    the quantity $w_n := \pd F \theta(p_n) \pd F \chi (\chi_n,\theta_0)$ 
    on the right side of \eqref{eq:dH},
    is the product of an invertible matrix (whose trace is bounded away from zero) and a vector bounded away from zero, and hence itself a vector $w_n$ bounded away from zero. 
    This forces $v_\chi = 0$. 
    Furthermore, this same line of reasoning applies not only for the points $q_n$ and $q_{n+2}$,
        but for the entire curve they lie on. Parameterizing this curve as a path $\gamma(t)$ starting at $q_n$ and ending at $q_{n+2}$, we find that the kernel of $dH|_{\gamma(t)}$ has a zero $\chi$-component for all $t$ along this segment.
    Thus the curve $\gamma(t)$ must have zero derivative in its first component ($\chi$), and $\chi$ must be constant along it.        
    And yet $\chi_n < \chi_{n+1} < \chi_{n+2}$ are strictly increasing coordinates! This is a contradiction. 
\end{proof}




% We defer the proof of \cref{theorem:add-reparam} until the end (\cref{appsec:addrep}).
%%%%%%%%%%%%%%%%% THEOREM 4 %%%%%%%%%%%%%%%%%%%%
\recall{theorem:add-reparam}
% \textbf{Theorem 4.~}{\it
% 	Let $(D, \le, \bot, \top)$ be an ordered connected manifold with a greatest and least element,
% 	and suppose $\Lrn : \Phi \times D \times \Theta \to \Theta$ satisfies L1-L3, FC.
% 	Then, for every $d \in D$, there exists a one-dimensional submanifold $D_d \subseteq D$ containing $d$
% 	\unskip, a continuous function $g : D \times \Phi \times \Theta \to [0,\infty]$,
% 	and a commitment flow $^+\!\Lrn$ such that
% 	$
% 		\forall \theta, \phi, \chi \in D_d.~
% 		^+\!\Lrn(\phi, g(\chi,\phi,\theta), \theta) = \Lrn(\phi, \chi, \theta). 
% 	$
% }
% \end{theorem}
\begin{lproof}\label{proof:add-reparam}
    % First, we construct the one-dimensional sub-manifold $D_d$.
    % {\color{red}(the below requires that $\cseq$ be diffble)}
    % Choose a tangent vector $v \in T_{\bot}\confdom$. 
    % Consider the vector field $X \in \mathfrak X\confdom$ generated by
    % $X(\chi) := d(\chi' \mapsto \chi \cseq \chi')(v)$.
    % If $X(\chi) = 0$ for any $\chi$, 

    \def\Dir{\mathit{Dir}}
    For each $\theta \in \Theta$, let 
    \[
        \Dir(\theta) := \Big\{ \frac{\partial}{\partial\chi} F(\chi, \theta_0) : \theta_0 \in \Theta, \chi \in \confdom, F(\chi,\theta_0) = \theta \Big\} \subseteq T_{\theta} {\Theta}
    \]
    be the tangent subspace at $\theta$ spanned by derivatives of $F$ at various starting points.
    The key to proving the theorem is to show that 
    % $\mathrm{dim}\,\mathrm{span}(\Dir(\theta)) \le 1$;
    the elements of $\Dir(\theta)$ are all parallel and oriented in the same direction; 
    this will allow us to use it to define a vector field 
    % $X$ such that $\mathrm{span}(X(\theta)) = \Dir(\theta)$ 
    which locally captures updating with $F$ (up to re-scaling) regardless of the ``original'' starting belief state $\theta_0$. At this point, we can recover an additive representation from the integral curves of this vector field.
    
    Suppose $(\chi_1, \theta_1)$ and $(\chi_2, \theta_2)$ are such that $F(\chi_1,\theta_1) = F(\chi_2,\theta_2) = \theta$. 
    To show that the corresponding directions in $\Dir(\theta)$ are parallel, it suffices to show that the sub-tangent spaces of $T_{\theta} \Theta$ generated by infinitesimal perturbations of $\chi_1$ and $\chi_2$, respectively, are the same.
    For all $\chi_1' > \chi_1$, we know (by \cref{ax:ineq-witness}) that 
    \[
    % \exists \tilde \chi_1.~F^{\tilde\chi_1} F^{\chi_1'}\theta_1 = F^{\tilde\chi_1} F^{\chi_1} \theta_1 = F^{\tilde\chi_1} F^{\chi_2} \theta_2.
    % \exists \tilde \chi_1.~F^{\chi_1'} \theta_1 = F^{\tilde\chi_1} F^{\chi_1} \theta_1 = F^{\tilde\chi_1} F^{\chi_2} \theta_2.
    \exists \tilde \chi_1.~F({\chi_1'}, \theta_1) = F({\tilde\chi_1}, F({\chi_1}, \theta_1)) = F({\tilde\chi_1}, F({\chi_2}, \theta_2)).
    \]
    %TODO: remove implicit assumption that choice of \tilde \chi_1 is smooth!
    % By a second application of \cref{ax:ineq-witness},    
   % By \cref{ax:combinativity}, we can form the confidence $\tilde \chi_1 \cseq \chi_2 \ge \chi_2$ that leads to the same point, in the sense that 
    Thus, for all $\chi_1' > \chi_1$, there exists some $\chi_2' := \tilde \chi_1 \cseq \chi_2 \ge \chi_2$ such that $F(\chi_2', \theta_2) = F(\chi_1', \theta_1)$.
    % ---and vice versa, by symmetry.  
    Symmetrically, for all $\chi_2' > \chi_2$, there exists a corresponding $\chi_1' \ge \chi_1$ with the same property.  
    %
    In particular, this is true for $\chi_1'$ and $\chi_2'$ that are infinitesimally close to $\chi_1$ and $\chi_2$, and thus the rays in the tangent space $T_{\theta}\Theta$ generated by positive perturbations of $\chi_1$ and of $\chi_2$ are the same (if nonzero).
    Formally speaking, this argument establishes that either  
    \begin{equation*}
    \begin{aligned}
    % \{ d F(v, \theta_1) : v \in T_{\chi_1}\confdom\} = \{ d F(v, \theta_2) : v \in T_{\chi_2}\confdom\}.
    % \forall (\chi_1,\theta_1), (\chi_2,\theta_2) \in F^{-1}(\theta).
    % \quad \exists k \ge 0. \qquad \frac{\partial}{\partial \chi_1} F(\chi_1, \theta_1) = k         \frac{\partial}{\partial \chi_2} F(\chi_2, \theta_2) 
    \{ d F(v, \theta_1) : v \in T_{\chi_1}\confdom\} = \{ d F(v, \theta_2) : v \in T_{\chi_2}\confdom\},\\
    \text{ or one of the two equals the singleton } \{ \mat 0 \}.
    \end{aligned}
        \label{eq:sametangent}
    \end{equation*}
    % provided neither set equals the singleton $\{ \mat 0 \}$. 
    (Recall that $T_\chi\confdom$ is the tangent space at $\chi \in \confdom$, and has the same dimension as $\confdom$.)
    It follows that the dimension of $\mathrm{span}(\Dir(\theta))$ is at most the dimension of the confidence domain $\confdom$ itself---and 
    since that domain was assumed to be one-dimensional, we have shown that 
    $\mathrm{dim}\,\mathrm{span}(\Dir(\theta))$ is equal either to one or to zero. 
    %
    Moreover, we have shown that all (nonzero) tangent vectors in $\Dir(\theta)$ point in the same direction.
    % In any coordinates, $\frac{\partial}{\partial\chi} F(\chi,\theta_0)$
    % For if $v_1
    
    Define a vector field $X(\theta)$ by a continuous selection from $\Dir(\theta)$ that is nonzero whenever $\Dir(\theta) \ne \{ \mat 0 \}$. 
    Such a continuous selection exists because $F$ itself is twice continuously differentiable ($C^2$)
    \vnew{
        when restricted to $\Theta_\phi$. 
        
        For each point $\theta$:
        if $\Dir(\theta) \ne \{ \mat 0\}$, then select any $(\chi, \theta_0) \in F^{-1}(\theta)$ for which $\frac{\partial}{\partial \chi} F(\chi, \theta_0) \ne 0$. 
        Applying \cref{lem:active-sub}, this guarantees the existence of an active submersion point $\hat p$;
        in turn, by the submersion theorem, this guarantees the existence of a $C^1$ local section
            $\sigma_\theta : U_\theta \to \confdom \times \Theta$
        on some neighborhood $U_\theta \ni \theta$.
        %
        We then define a local vector field on $U_\theta$ according to
            $Y_{\theta}(\theta') := \frac{\partial F}{\partial \chi} (\sigma_\theta(\theta'))$. 
        Since $\{ U_\theta \}_{\theta \in \Theta}$ is an open cover of $\Theta$, 
        we know there exists a partition of unity $R = \{ \rho_\theta : \Theta \to [0,1] \}_{\theta \in \Theta}$ subordinate to it---meaning that this indexed family has the following properties \citep[Thm 2.23]{lee2013smooth}:
        \begin{enumerate}
            \item for all $\theta \in \Theta$, $\rho_\theta(\theta') = 0$ when $\theta' \notin U_\theta$.
            \item every point $\theta' \in \Theta$ has a neighborhood 
            % $V_\theta
                that intersects the support of $\rho_{\theta}$ for only finitely many values of $\theta$.
            \item $\forall \theta' \in \Theta. \sum_\theta \rho_\theta(\theta') = 1$. 
        \end{enumerate} 
        Finally, this allows us to define our vector field as 
        \begin{equation}
            X(\theta') := \sum_{\theta \in \Theta} \rho_\theta(\theta') Y_\theta(\theta').
        \end{equation}
        This is continuous because each $Y_\theta$ is continuous and, in a neighborhood of any given point, only finitely many of the functions $\rho_\theta$ are nonzero.

        
        
        % let 
        % $\lambda(\theta)$ be a continuous function such that $\lambda(\theta) \ge 0$ with equality iff $\Dir(\theta) = \{ \mat 0\}$. 
        % % The existence of such functions is guaranteed by the bump functions.
        % %
        % For any $\theta$, there

        % \TODO
    }%


    For $\theta \in \Theta$ and any vector field $V \in \mathfrak X(\Theta)$, we use the standard notation $\exp_\theta( V ) := y(1)$ for the unique solution to the differential equation  $\frac{\mathrm dy}{\mathrm dt} = V(y)$ with initial condition $y(0) = \theta$, evaluated at $t=1$.
    By the rescaling lemma  \cite[e.g.,][Lemma 9.3]{lee2013smooth}, $\exp_\theta(t V) = y(t)$ is the result of starting at $\theta$ and following the vector field $V$ for time $t \ge 0$. 
    %
    Since scaling a vector field by a positive scalar field results in the same (or truncated) integral curves after reparameterization, for all $\theta\in \Theta$ and  $\chi \in \confdom$, there exists some $t_{(\theta,\chi)} \in [0,\infty]$ such that $\exp_\theta( t_{(\theta, \chi)} X) = F(\chi,\theta)$. 
 
 
    % Furthermore, 
    % the value of $t$ for which following $X$
    % to be the ratio $X(\theta) / 
    With these definitions in place, we define
    % \begin{align*}
    %     g(\chi,\theta) 
    % \end{align*}
    $^+\!F(t, \theta) := \exp_\theta(t X)$
    for $t \in [0, \infty]$, 
    and $g(\chi,\theta) := t_{(\theta,\chi)}$.
    % where $\exp_\theta( X ) := y(1)$ is the unique solution to the differential equation  $\frac{\mathrm dy}{\mathrm dt} = X(y)$ with initial condition $y(0) = y_0$, evaluated at $t=1$. That is, $^+\!F(t,\theta)$ is there result of following the vector field $X$ starting at $\theta$ from time zero to time $t$. 
    \qedhere

    % The next major difficulty is that 
    % The key to proving this theorem is to 
\end{lproof}




% \TODO
% 
% \begin{prop}
%     % L1, L2, L3, L5 imply L4. 
%     \cref{ax:zero,ax:cont-and-smooth,ax:seq-for-more,ax:combinativity} imply \cref{ax:acyclic}.
% \end{prop}
% \begin{proof}
%     Suppose that $\chi_0 \le \chi \le \chi_1$, and that $F(\chi_0, \theta_0) = F(\chi_1,\theta_0) =: \theta$. 
%     In search of a contradiciton, suppose that $F(\chi, \theta_0) = \theta' \ne \theta$.
%     Applying L3 twice:
%     \def\byL3{\overset{\vphantom{\big|}\text{(\cref{ax:seq-for-more})}}}%
%     \begin{itemize}[nosep]
%         \item Since $\chi_0 \le \chi$, $\exists \tilde \chi_0$ such that 
%         $\theta' = F^{\chi}(\theta_0) \byL3= F^{\tilde \chi_0} F^{\chi_0}(\theta_0) = F^{\tilde \chi_0} (\theta )$.
%         \item Since $\chi \le \chi_1$, $\exists \tilde \chi_1$ such that 
%         $F^{\tilde \chi_1}(\theta') = F^{\tilde \chi_1} F^{\chi}(\theta_0) \byL3= F^{\chi_1}(\theta_0) = \theta$.              
%     \end{itemize}
%     So $F(\tilde\chi_0, \theta) = \theta'$ and $F(\tilde\chi_1,\theta') = \theta$.         
%     By substituting each into the other and applying L5, it follows that 
%     \[
%     F(\tilde \chi_1 \cseq \tilde \chi_0, \theta) = \theta
%     \qquad\text{and}\qquad
%     F(\tilde \chi_0 \cseq \tilde \chi_1, \theta') = \theta'.
%     \]
% 
%     Let $\theta^* := F(\top, \theta_0)$. Since $\top$ is absorbing, 
%     % it is easy to see that applying $F^\top$ projects us to $\theta^*$ at $\theta$ and $\theta'$ as well:
%     \begin{align*}
%         \theta^* = F(\top, \theta_0) 
%             &= F(\top \cseq \chi_0, \theta_0) = F(\top, \theta)  \\
%             &= F(\top \cseq \chi_1, \theta_0) = F(\top, \theta').
%     \end{align*}
% 
%     If it were the case that $\theta^* = \theta$, then we would have
%     \[
%         \theta = F(\top, \theta_0) = F(\tilde \chi_0 \cseq \top, \theta_0) = F(\tilde \chi_0, \theta) = \theta',
%     \]
%     contradicting our earlier assumption, and so $\theta^* \ne \theta$. The same argument establishes that $\theta^* \ne \theta'$.
%     % If $\theta^* = \theta'$, then
%     % \[
%     %     \theta' = \theta^* =  F(\top, \theta_0)
%     %         = F(\tilde\chi_0 \cseq \top, \theta_0) 
%     %         = F(\tilde\chi_0, \theta') = \theta.
%     % \]
% 
% 
% 
%     \TODO  
% 
% \end{proof}


\commentout{%
\subsection{Additive Representation Theorem}\label{appsec:addrep}

The most difficult math in this paper is the additive representation theorem (\cref{theorem:add-reparam}). We will need a few lemmas.

\begin{lemma}
    If $F : \confdom \times \Theta \to \Theta$ is a commitment function satisfying 
    \cref{ax:combinativity,ax:cont-and-smooth,ax:seq-for-more}
    [L1-L5] and CF, then,
    for all $\theta \in \Theta$ and $\chi_1 \le \chi_2$, 
    there exists a continuously differentiable and strictly monotone path 
    $\gamma : [0,1] \to \confdom$ such that
    $\gamma(0) = \chi_1$, $\gamma(1) = \chi_2$ 
    with the additional property that 
    $\frac{\partial}{\partial s} F(\gamma(s), \theta) \ne 0 $ 
    unless $F(\chi_1, \theta) = F(\chi_2, \theta)$.
    % $\gamma(0) = F(\chi, \theta)$,
    % $\gamma(1) = F(\top, \theta)$, and 
\end{lemma}
\begin{lproof}
    \TODO
\end{lproof}
}


\section{Deferred Calculations and Further Results}
Beyond the main numbered results of the paper, we have also deferred a few minor calculations to the appendix. 
%  by presenting the proofs of these minor minor results.

\textbf{Kalman Combinativity.} 
We claim that the pair $(K, r^2)$ forms a confidence domain.
With some simple algebra, one can show that the sequence of updates $(K_2, r_2^2) * (K_1, r_1^2)$ 
is equivalent to a single update with $(K_3, r_3^2)$, where $K_3 = K_1 + K_2 - K_1K_2$ just as in example 1 and the other examples using the $[0,1]$ domain, and
\[
    r^2_3 = \frac{K_2^2 r^2_2 + K_1^2(1-K_2)^2r_1^2}{(K_1+K_2-K_1K_2)^2}.
\]
This is the only non-commutative example in our setting.
In the case where $K$ is chosen optimally, this reduces to a single domain with inverse variance combining additively.


\begin{prop} \label{prop:free-additivity}
	If $F: \confdom \times \Theta \to \Theta$ satisfies 
    \cref{ax:zero,ax:cont-and-smooth},
    % L1-L2 
    then there exists another commitment function ${^{::}\!F}$
	% function for $\Theta$ on $\Phi$, that behaves in exactly the same way, but \emph{is} additive, but with the altered confidence domain
	(also for beliefs $\Theta$ on observations $\Phi$), that accepts confidences in an extended domain $\confdom' \supseteq \confdom$, has the same behavior as $F$ when restricted to the original confidence domain, and in addition satisfies all axioms \cref{ax:zero,ax:cont-and-smooth,ax:seq-for-more,ax:acyclic,ax:combinativity}.
\end{prop}
\begin{lproof}
Consider the new confidence domain
\[
    \Big\{
        \text{finite lists}~[c_1, \ldots, c_n]
		\text{ with each } c_i \in \confdom,
		% \text{ such that } c_i \in \confdom \text{ for all } i = 1, \ldots, n
		\quad
        \leqslant
		\quad
		% \text{list concatenation}~::,
		::\;,
		\quad
		[\,]\;,
		\quad
		[\top]\;,
		\quad
        \mathfrak g'
		\,
	\Big\}, \quad\text{where}
\]
% where:
\begin{itemize}
\item 
The operation ``$::$'' is list concatenation, except that it collapses instances of $\top$, i.e.,
\[
	[c_1, \ldots c_n] :: [d_1, \ldots, d_m]
	 := \begin{cases}
		 % [\top] & \text{ if } c_i = \top \text{ for some $i$ or $d_j=\top$ for some $j$}\\
		 [\top] & \text{ if } \top \in \{c_1, \ldots, c_n,d_1, \ldots,d_m \} \\
		 [c_1, \ldots, c_{n}, d_1, \ldots, d_m] & \text{otherwise.}
 \end{cases}
\]
Concatenating the empty list $[\,]$ on either side has no effect.
Moreover, by construction, for all $L \in \confdom'$, we have $[\top] :: L = [\top] = L :: [\top]$.
Finally, $::$ is clearly associative, so $\confdom'$ is also a confidence domain.
\item
The order is given by the prefix ordering: $[c_1, \ldots, c_n] \leqslant [d_1, \ldots, d_m]$ iff $n \le m$ 
with $d_i = c_i$ for all $i \in [n]$. 
\end{itemize}

The new update rule for this confidence domain is given by:
	\[
		{^{::}\!F}([c_1, \ldots, c_n], \theta)  :=
				(F^{c_n} \circ \cdots \circ F^{c_1}) (\theta).
	\]
${^{::}\!F}$ has the same behavior as $F$ on the elements that correspond to the original confidence domain, since
$
	{^{::}\!F}([c],\theta) = F(c,\theta),
$
when $c \in \confdom$ is a member of the original domain, 
and it satisfies \cref{ax:combinativity} by construction, since
\begin{align*}
{^{::}\!F}^{[d_1, \ldots, d_m]}_\phi ( {^{::}\!F}^{[c_1, \ldots, c_n]}_\phi (\theta) )
		&:=
			F^{d_m}_\phi \circ \cdots \circ F^{d_1}_\phi (
			F^{c_n}_\phi \circ \cdots \circ F^{c_1}_\phi (\theta))\\
		&= (F^{d_m}_\phi \circ \cdots \circ F^{d_1}_\phi \circ
		F^{c_n}_\phi \circ \cdots \circ F^{c_1}_\phi) (\theta) \\
		&= {^{::}\!F}^{[c_1, \ldots, c_n, d_1, \ldots, d_m]}_\phi (\theta) \\
		&= {^{::}\!F}^{[c_1, \ldots, c_n] :: [d_1, \ldots, d_m]}_\phi (\theta).
\end{align*}
\end{lproof}% We are primarily interested in the case where confidence can be measured as a real number,

% \begin{example}[Gaussian NGD]\label{ex:gauss-ngd}
% 	Consider the case where $\Theta  = \{ (\mu, \sigma^2) \in \mathbb R \times \mathbb R_+ \}$ is the half-space of parameters to a Gaussian over some real variable $X$, and $\Phi \cong \mathbb R$ consists of possible observations of $X$.
% 	In this case, the obvious loss function is negative log likelihood of the observation $x$ according to our belief state $(\mu, \sigma^2)$:
% 	\begin{align*}
% 		% &\mathcal L(x, \mu, \sigma^2) 
% 		\ell =  
% 		- \log \mathcal N(x \mid \mu, \sigma^2) 
%         = \frac12 \log 2\pi\sigma^2  + \frac12 \left(\frac{x-\mu}{\sigma}\right)^2 
% 		\!\!. 
% 		%%%% PDG illustration not appropriate :(  %%%
% 		% \\
%         % &\quad=
% 		% \aar*{%
% 		% \begin{tikzpicture}[center base]
% 		% 	\node[dpad0](X) at (0,0){$X$};
% 		% 	% \node[dpad
% 		% 	\draw[arr2, <<-] (X) to node[above,pos=0.7]{$x$} ++(1.0,0);
% 		% 	% \draw[arr2, <-] (X) to node[above]{$\mathcal N(X|\mu,\sigma^2)$} ++(-1.7,0);
% 		% 	\node[dpad0](m) at (-1.5, 0.3) {$\mu$};
% 		% 	\node[dpad0](s2) at (-1.5, -0.3) {$\sigma^2$};
% 		% 	\mergearr{m}{s2}{X};
% 		% 	\node[above=2pt of center-ms2X](N){$\mathcal N$};
% 		% 	\draw[arr1, <<-] (m) to node[above, pos=0.7]{$\mu$} +(-0.9,0);
% 		% 	\draw[arr1, <<-] (s2) to node[above, pos=0.7]{$\sigma^2$} +(-0.9,0);
% 		% \end{tikzpicture}}.
% 	\end{align*}
% 
% 	The FIM for a Gaussian is
% 	% \[
% 	$
% 	\mathcal I(\mu, \sigma) =
% 	% \begin{bmatrix}
% 	%     \Ex_{\mathcal N(X|\mu,\sigma^2}[ \frac{\partial mathcal N(X|\mu,\sigma^2}{\partial \mu}) ]
% 	% \end{bmatrix}
% 	% =
% 	% \begin{bmatrix}
% 	% 	\frac1{\sigma^2} & 0 \\
% 	% 	0 & \frac{1}{2 \sigma^4},
% 	% \end{bmatrix}
% 	\mathrm{diag}(
% 		\frac1{\sigma^2}, \frac{1}{2 \sigma^4})
% 	$
% 	and so the update field is given by:
% 	\begin{align*}
% 		&\Lrn'_{x}(\mu, \sigma^2)
%         = - \hat\nabla_{\mu, \sigma^2} \mathcal L(x,\mu,\sigma^2)
%         \\ &= \mathcal I(\mu, \sigma^2)^{-1}
% 		\begin{bmatrix}
% 			\frac{x-\mu}{\sigma} \\[1ex] \frac {-\sigma^2 + (x-\mu)^2}{2 \sigma^4}
% 		\end{bmatrix}
% 		=
% 		\begin{bmatrix}
% 			x-\mu \\ (x-\mu)^2 - \sigma^2
% 		\end{bmatrix}.
% 	\end{align*}
% 
% 	Note that:
% 	\begin{itemize}[wide]
% 		\item  $\Ex_{x \sim \nu} [ \Lrn'_x(\mu,\sigma^2) ] = \mat 0$ if and only if $\nu$ has mean $\mu$ and variance $\sigma^2$.
% 		% We can interpret this in two ways:
% 		% This means that if observations are drawn from a fixed distribution $\nu(X)$,
% 		Moreover, this point is the unique global attractor.
% 		This means that,
% 		\begin{enumerate}
% 			\item If observations are drawn from a fixed distribution $\nu(X)$, and we repeatedly use $\Lrn$ to update $\theta = (\nu, \sigma)$ with small confidence $\epsilon$,
% 			then $\mu$ will approach the mean $\Ex_{\nu}[X]$ of $\nu$
% 			and $\sigma^2$ will approach the variance $\Ex_{\nu}[X^2] - \Ex_{\nu}[X]^2$.
% 			% If we make f observations are drawn from a fixed distribution $\nu(X)$,
% 			% If we update our belief parameters $\theta = (\mu, \sigma)$ according to $F$ with
% 
% 			\item If we perform a single high-confidence update on the extended observation $\varphi \propto \nu$, in which each $x$ has relative confidence $\nu(x)$, the result will be a Gaussian with the mean and variance of $\nu$, i.e.,
% 			\[
% 				\forall \theta.\quad
% 				\lim_{c \to \infty} \Pr\nolimits_{\Lrn_{\nu}^c(\theta)} = \mathcal N(\Ex\nolimits_{\nu}[X], {\mathrm{Var}}_{\nu}[X])
% 			\]
% 		\end{enumerate}
% 		In this sense, relative confidence acts like probability.
% 		% So,
% 
% 		\item
%         {\color{red} THIS IS WRONG
% 		If we update with the observation $x = \mu$ of our estimate with confidence $c$,
% 		the mean is unchanged, and our estimate of the variance becomes the harmonic mean of our previous variance $\sigma_0^2$ and the inverse confidence $\frac1c$.
% 		That is,
% 		\[
% 			\Lrn^c_\mu(\mu, \sigma_0^2) =
% 			\left(\mu, \frac{1}{c + \frac{1}{\sigma_0^2}} \right).
% 		\]
% 		Equivalently, the precision of the resulting distribution is the average of the confidence $c$ and the previous precision $\nicefrac{1}{\sigma_0^2}$, which suggests that confidence is of the same type as precision.
% 		%
% 		Note that if $\sigma_0^2$ is very large, so that our initial beliefs are very uncertain, updating with confidence $c$ results in variance $\frac1t$.
% 		In this sense, the magnitude of confidence acts as the inverse of variance.}\qedhere
% 	\end{itemize}
% \end{example}
