
% Suppose that the space $\Theta$ is actually a differentiable manifold.
% In this case, we might want want $F$ to be compatible with the differentiable structure.
% \begin{CFaxioms}
%     \item $\Theta$ is a differentiable manifold.
%         For fixed $\theta$ and $\phi$, the function $\beta \mapsto F^{\beta}_\phi(\theta)$
%         is continuously differentiable.
%             \hfill \textbf{(differentiability)} \label{ax:diffble2}
% \end{CFaxioms}
% If $\Theta$ is a differentiable manifold and

%joe3: bad story. Just say that you want additivity.
%oli3: bringing up text about additivity, and reduing stress on scale.
% Recall that the number of training iterations $n$ in \cref{ex:classifier}
% and Shafer's weight of evidence $w$ in \cref{ex:shafer}
% are measurements of confidence that do not lie in $[0,1]$, but
%  rather in $[0,\infty]$.
Most quantities used in science and everyday life
can be measured additively:
if you start with six minutes/meters/gallons/votes/dollars,
and then gain seven
 % additional (distinct) ones, 
more,
then you have thirteen altogether.
% We would like a measure of confidence that also works this way.
% What is measure of confidence that works this way?
% We would like to be able to measure confidence in the same way.
% Wanting to measure confidence the same way gives us the domain $[0,\infty]$. 
To measure confidence in the same way, we must use the domain $[0,\infty]$. 
With this confidence domain, \cref{ax:combinativity} means $\Lrn$ is \emph{additive},
making it amenable to analogies of weight (e.g., the weight of evidence $w$ in \cref{ex:shafer})
and time (e.g., the number of training iterations $n$ in \cref{ex:classifier}).
Indeed, an additive learner can be implemented so that confidence really does coincide with time: imagine a machine with state space $\Theta$, controlled by buttons labeled by $\Phi$, that, while $\phi$ is pressed, evolves from initial state $\theta_0$ according to $\theta(t) = \Lrn(\phi, t, \theta_0)$. 
Crucially, this behavior is incoherent if $\Lrn$ is not additive.
If \cref{ax:combinativity} did not hold, then there would exist $t_1,t_2$ such that the configuration of the machine after holding down button $\phi$ for $t_1$ seconds and then for $t_2$ additional seconds would differ from the configuration after holding down $\phi$ for $t_1+t_2$ seconds.

% This temporal analogy may not always be appropriate 
Temporal analogies may not always be appropriate
(because they may clash with other, truer notions of time),
but they have such intuitive force that 
% \emph{flow} is the established name for generic function 
% the established name for a generic function
a function
% \emph{flow} is
% the generic term for a function
$f: [a,b] \times \Theta \to \Theta$  
	% (with $a\le 0 < b$)
	(with $0 \in [a,b] \subseteq \mathbb R$)
% satisfying  \cref{ax:diffble,ax:additivity}
satisfying \cref{ax:zero,ax:cont-and-smooth,ax:combinativity}
% is called a \emph{flow}. 
is known generically as a \emph{flow} \parencite{lee2013smooth}.
% \unskip.
% Beyond the structure of a flow,
\commentout{
    Beyond \cref{ax:diffble,ax:additivity},
    $F$ need only handle full-confidence
    appropriately (i.e., satisfy \cref{ax:idemp,ax:cont})
    in order to satisfy all of our axioms thus far.}
Due to \cref{prop:additivity-implications} and the fact that 
    \cref{ax:diffble} is stronger than \cref{ax:cont}, 
the flow axioms 
(\cref{ax:additivity,ax:diffble})
imply all of our axioms so far
(\cref{ax:zero,ax:idemp,ax:cont,ax:diffble,ax:seq-for-more,ax:nopause,ax:additivity}).
Our only non-standard requirement is that $F$ must satisfy
these properties for the upper limit of $\infty \notin\mathbb R$ as well.
Thus:
% For example, we might imagine incorporating $\phi_1$ for
% we can think of each input $\phi$ 



% In fact, there is a unique additive
\begin{defn}
	% A function satisfying
	% A \emph{flow update function}
	A \emph{flow update function}
	is a function
	$F : \Phi \times[0,\infty] \times \Theta\to \Theta$
	satisfying
	%oli3: no more heding, part 3.
	% the appropriate analogues of
	% \cref{ax:zero,ax:idemp,ax:cont,ax:seq-for-more,ax:diffble}
	%%% ax:zero implicit in additivity!
	% \cref{ax:zero,ax:idemp,ax:cont,ax:diffble}
	% \cref{ax:idemp,ax:cont,ax:diffble}
	\cref{ax:diffble}
	and \cref{ax:additivity}.
\end{defn}

% To some,
% \cref{ax:additivity} might be pallatable already,
\Cref{ax:additivity} might already be palatable,
% but it looks like it might be an assumption that significantly restricts the expressiveness of our framework.
% but it also might look like it might significantly restrict the expressiveness .
but it is clearly a nontrivial assumption, and 
% looks like it might  severely restrict the expressiveness of our update formalism.
one might worry 
% that it could limit applications of the formalism.
that it limits the applicability of our formalism.
% This is not the case.
Fortunately, this is not the case.
While \cref{ax:additivity} does significantly pin down how confidence
can be measured, it has no effect on what confidence can express.
% More concretely, for every confidence function that does not satsify \cref{ax:additivity}, we can


\begin{linked}{theorem}{add-reparam}
	If $\Lrn 
	% : \Phi \times \confdom \times \Theta \to \Theta
	$ 
	% satisfies \cref{ax:zero,ax:combinativity,ax:cont-and-smooth,??}
	satisfies \cref{ax:zero,ax:combinativity,ax:cont-and-smooth,??}
	then there exists a
	flow update function
	$^+\!\Lrn$
	and a continuous function
	$g : \Phi \times [\bot,\!\top] \times \Theta \to [0,\infty]$
	% and a function $g$
	such that
	% for all $\theta,\phi,$ and $\chi$,
	% for all $\theta\in\Theta,\phi\in\Phi,$ and $\chi\in[\bot,\!\top]$,
	% \[
	\[
		\forall \theta,\phi,\chi.\qquad
		\Lrn( \phi,
			\chi,
		 \theta )
		 =
		{^+}\!\Lrn(\phi,~
		% \beta,
		g(\phi,\chi,\theta),~
		 \theta)
		 \qquad\text{and}\qquad
		{^+}\!\Bel(\phi,\theta) = g(\phi, \Bel(\phi,\theta),\theta)
		.
	\]
	% Furthermore, $^+\!F$ and $g$ are unique up to a multiplicative factor,
	% and so
	% there is a unique choice of $(^+\!F,g)$
	% such that $^+\!F$ and $F$ handle smallconfidences in the same way,
	% i.e., $\frac{\partial g}{\partial \chi} = 1$.
	Furthermore,
	% $^+\!F$ and $g$ are unique up to a multiplicative factor
	% the pair
	 $(^+\!F, g)$ is unique up to a multiplicative factor
	% in the additve representation of confidence.
	% in the additive representation of confidence (i.e., the output of $g$).
	in the output of $g$.
	% (that may depend on $\phi$ and somewhat on $\theta$).
	% (that can depend on $\phi$, and partially on $\theta$).
	% (that can depend on $\phi$ and $\theta$).
		% \footnote{ can depend on $\phi$, 
		% and $[\theta] \in \Theta / (\theta_1 \sim \theta_2 \text {if})$}.
	\end{linked}
\begin{coro}
There is a unique choice of $(^+\!F, \beta)$
% such that $^+\!F$ and $F$ handle small confidences the same way,
% i.e., $\frac{\partial g}{\partial \chi}|_{\chi=0} = 1$.
such that $^+\!F$ and $F$ have the same effect on observations
made with sufficiently low confidence, 
i.e., $\frac{\partial \beta}{\partial \chi}\big|_{\chi=\bot} = 1$.
% that behaves like $F$ for low confience updates
% (and is also additive: \cref{ax:additivity}).
% Furthermore, there exists a function
\end{coro}

Thus, updates performed with $F$ are equivalent
to updates performed with ${^+}\!F$, except that
the degree of confidence needs to be translated appropriately (via $\beta$).
We call $^+\!F$ the \emph{additive form of $F$},
and $\beta(\phi, \chi, \theta)$ the additive form of 
confidence $\chi$. 
% This quantity might, unfortuna tely, depend on $\theta$, and $\chi$.
% Unless $F$ is strangely parameterized,
% If confidence to $F$ is meaningful independent of $\theta$ and $\phi$,
% then so too should 
% knowing 
Ideally,
the translation $g$ to an additive scale should not depend
on our current beliefs $\theta$ or observation $\phi$.

% It would be  dependence of $g$ on $\theta$ and $\phi$ is
% somewhat unsavory, and would $F$ 


\begin{defn}
We call an update function $F$ \emph{uniform} if 
the additive form
$g(\phi,\chi,\theta)$
of its confidence depends only on $\chi$
(and not on $\theta$ or $\phi$). 
\end{defn}

% If $F$ already satisfies \cref{ax:additivity},
% then $^+\!F = F$ and $g(\phi,\chi,\theta) = \chi$, so
% $F$ is uniform.
\Cref{ax:additivity} implies uniformity, as then $^+\!F = F$ 
and
$g(\phi,\chi,\theta) = \chi$.
%
There is also another important class of uniform update functions:
those whose confidences lie in $[0,1]$ and can be interpreted
as ``fraction of the way to full incorporation''. 
Let $a,b \in [0,1]$. If we go fraction $a$ of the distance to a target, 
and then fraction $b$ of the remaining way, then we have gone
$a + (1-a)b = a + b - ab$ of the total distance. 
Thus, such functions have an analogue of additivity:

\begin{CFaxioms}
	\item  \label{ax:fractionality}
	% For all $\phi \in \Phi$ and $\chi_1, \chi_2 \in [0,1]$, 
		% $F^{\chi_1}_\phi \circ F^{\chi_2}_\phi = F^{\chi_1 + \chi_2 -\chi_1\chi_2}_\phi$.
	% For all
    For 
	% $\phi$ and
	 % $\alpha,\alpha' \in [0,1]$, 
	 $\alpha_1,\alpha_2 \in [0,1]$, 
		$F^{\alpha_1}_\phi \circ F^{\alpha_2}_\phi = F^{\alpha_1 + \alpha_2 - \alpha_1\alpha_2}_\phi$.
\end{CFaxioms}

% \cref{ax:fractionality}, like \cref{ax:additivity}, implies \cref{ax:zero,ax:i
% It's a little 
This way of combining numbers in $[0,1]$ is a little more
complicated, and it's not as apparent that it has nice properties
like associativity (although it does). However,
% all of the nic
\cref{ax:fractionality} has essentially the same implications as 
\cref{ax:additivity},
and there are certainly cases where intuition for this scale
is stronger.
% and so we have analogous class of functions
% and so we have analogous class of functions
% so we name the analogous class of update functions as well.
% so we name the analogous class of update functions and 

\begin{defn}
	A path update function $F$ 
	% (i.e., with $\bot=0,\top=1$, satisfying 
	% 	\cref{ax:zero,ax:idemp,ax:cont,ax:seq-for-more,ax:diffble,ax:nopause} ) 
	% is called \emph{fractional} if it also satisfies \cref{ax:fractionality}.
    satisfying \cref{ax:diffble,ax:fractionality} is called \emph{fractional}.
\end{defn}
\begin{prop}
    Fractional update functions satisfy
    \cref{ax:zero,ax:idemp,ax:cont,ax:diffble,%
        ax:seq-for-more,ax:nopause}, and are uniform.
\end{prop}
We also get an analogous representation theorem, allowing us
to uniquely convert any update function to a fractional one 
with the same effect for low confidence.
\begin{prop}
	If $F$ satisfies \cref{%
		ax:funcform,%
		ax:zero,ax:idemp,ax:cont,ax:seq-for-more,ax:diffble,ax:nopause},
	and operates on confidences $\chi \in [\bot,\! \top]$,
	% then there is a unique pair $(^{\%}\!F, \alpha)$, where 
	then there is a unique 
    fractional update function 
	$^{\%}\!F
		% : \Phi \times [0,1] \times\Theta\to\Theta
	$,
    and continuous map
		$\alpha : \Phi \times [\bot,\!\top] \times \Theta \to [0,1]$
	such that, 
		for all $\theta\in\Theta,\phi\in\Phi,$ and $\chi\in[\bot,\!\top]$,
		\[
			% \forall \theta,\phi,\chi.~~~
			F( \phi,
			% g(\phi,\beta,\theta),
				\chi,
			 \theta )
			 =
			{^{\%}}\!F(\phi,~
			\alpha
            (\phi,\chi,\theta),~
			 \theta)
		\]
	% ${^{\%}}\!F^{a}_\phi \circ {^{\%}}\!F^{b}_\phi = {^{\%}}\!F^{a + b - ab}_\phi$
	% for all $\theta,\phi,$ and $\chi$, and
	and
	$\frac{\partial \alpha}{\partial \chi}\big|_{\chi=\bot}=1$.
\end{prop}


\paragraph{What Distinguishes This from Control Theory?}
In many ways, our setup resembles that of control theory: 
	we have a continuous manifold of states $\Theta$, and a set
	of inputs (``control signals'') $\Phi$, which cause $\Theta$ 
	to evolve ``over time''. 
\begin{itemize}
	\item Control theory does not require the analogue of a ``full-confidence'' update; there may be no limit as $t \to \infty$. This allows control theory to talk about a far more general class of dynamical systems without fixed points. 
	\item ``Time'' has a single clear and consistent interpretation in control theory, whereas the analogue here, additive confidence, is only well-defined up to a multiplicative constant. The temporal analogy breaks down in other ways as well: by chaining multiple observations together, ``time'' extends past $t=\infty$.
	It is also sometimes helpful to think in terms of the reparameterized setting of $[0,1]$.
	
\end{itemize}


% Moreover,
\commentout{
Going back through our examples:
\begin{description}
	\item[{\bf[\cref{ex:prob-simple}]}]
		$g(\mu, \alpha, \phi) = - \log(1-\alpha)$.
		This means that
		$^+\!F(\mu, \beta, \phi) = e^{-\beta} \mu + (1-e^{-\beta}) (\mu\mid \phi)$.
		
	\item [{\bf[\cref{ex:shafer}]}] 
		Weight of evidence $w$ is already additive, so
			$g(m, w, \phi) = w$, and $^+\!F = F$. 
		Meanwhile, degree of support $\alpha$ is translated
		the same way as $\alpha$ in the first example: in this case,
			$g(m, \alpha, \phi) = - \log(1-\alpha)$. 
		
		\commentout{
		As noted in the introduction,
		the restriction of this update rule to belief states
		that are probabilities, gives an update rule}
		% from the $\alpha$ of \cref{ex:prob-simple}, but they differ'
		
				
	\item [{\bf[\cref{ex:classifier}]}] 
		($n$ is already additive).
	% \item [{\bf[\cref{ex:classifier}]}] 
\end{description}
}



% Recall that \cref{ax:seq-for-more} impilies that the behavior of updates
% is generated by low-confidence updates; we saw a particularly nice
% way of doing that in \cref{ax:additivity},
% which has the feature that confidence behaves the same way no matter what your initial beliefs are.


% Even restricting to , additivity is a particularly natural.
%
% \begin{prop}
% 	If $F$ is a differentiable \cofunc\ with confidence domain $\Rplus$,then there is a unique update rule $G$ with the same confidence domain, that behaves approximately like $F$ for small increments of confidence, and is also additive (\cref{ax:additivity}).
% \end{prop}
% \input{sections/vecfield-repr}
