\section{Big List of Learning Rules}

% \begin{enumerate}
% \item \textbf{Lin Interp.} 
% 	$\quad 
% 	\Theta = \Delta(\Omega, \mathcal F)
% 	\quad
% 	\Phi = \mathcal F
% 	$
% \end{enumerate}

\begin{table*}
\centering
\small
~\hfill
\hspace{-3cm}
\def\arraystretch{1.4}
\begin{tabular}{cc|cccc|c}\toprule
	$\Theta$ & $\Phi$ & ctx
		& Path  % $F^c_\phi(\theta)$
		& Flow
        & Vector Field % $\Lrn'_\phi(\theta)$
        & Loss % $\mathcal L(\theta, \phi)$
        % & commute
	\\\midrule
		$\Delta(\Omega, \mathcal F)$ & $\mathcal F$ &
		& $(1-s) \theta + s\, (\theta|\phi)$
		&
		& $\theta|\phi - \theta$
		& $\log \theta(\phi)$ 
		% & iff $\theta(\phi_1 \cap \phi2) > 0$
	\\
		$\Delta(\Omega, \mathcal F)$ & $\mathcal F$ & $U : \Omega \times \Phi \to \mathbb R$
		&
		& 
		{\def\arraystretch{0.9}\begin{tabular}{@{}l@{}}
            $\propto
                \theta \odot \exp( - t U(\omega,\phi) )$\\
            \color{gray}
            $=
                \omega \mapsto 
					\frac{\theta(\omega) \exp( - t U_\phi(\omega)) }
						{ \Ex_{\theta}[ \exp({-t U_\phi})] }$
            \end{tabular}}
		&
		{\def\arraystretch{0.9}\begin{tabular}{@{}l@{}}
        	$\mu \odot (\mathbbm{1}_A - \mu(A))$\\
            \color{gray}
            $= \mu(A)\, (\mu|A - \mu)$ \\
			$= \mu \odot (\mathbbm{1}_A - \mu(A))$
            \end{tabular}}
		&
			$\Ex_\theta [ U_\phi ]$
	\\\bottomrule
\end{tabular}
\hspace{-3cm}
\hfill~
\end{table*}
% \section{Discussion on Incremental Confidence and Independence Assumptions}

\section{Further Discussion: Full Confidence, Incremental Confidence, and Independence}


% \subsection{Discussion on Full-Confidence Updates}
\subsection{Updating with Full Confidence}
	\label{sec:full-conf}
Since the purpose of
% $F^1_\phi$
$F^\top_\phi$
is to \emph{fully} incorporate $\phi$ into our beliefs,
two successive updates with the same information ought to have the same effect as a single one.
Intuitively, this is because if we have just updated our beliefs to be consistent with the information $\phi$, then a second observation of $\phi$ will require no further alterations of our belief state.

Full-confidence updates are not invertible,
and destroy some information about one's prior belief state;
because of this, the beliefs that they produce can be easily compressed.
Intuitively, this is the benefit of fully trusting information: you
can simplify the way you think about things.
% In this case, we call $F$ an \emph{update rule}, or more precisely, a \emph{$\Theta$-update rule on $\Phi$}, and insist that

%joe3: This is problematic, because the issue of succiessive updates is, in general, problematic. You haven’t discussed it at all. Indeed, you haven’t even hinted that there’s an issue.
%oli3: It's actually not problematic for full confidence. That's why
% it never gets discussed when people talk about conditioning. But I
% have added a section discussing this later. 
%oli3: removing definition to shorten presentation; can equally well
% do this in a more compressed way, using just "full-confidence 
% update"
\commentout{
\begin{defn}
	% A \emph{full-confidence ($\Theta$-)update rule} (for $\Phi$) is
	A \emph{full-confidence update rule} is
	a mapping $P: \Phi \times \Theta \to \Theta$ such that
	for all $\phi \in \Phi$, 
	$P_\phi = (\theta \mapsto P(\phi,\theta)): \Theta \to \Theta$ is idempotent.
	That is,	
	$P_\phi(P_\phi(\theta)) = P_\phi(\theta)$
	 for all $\phi\in\Phi$ and $\theta \in \Theta$.
\end{defn}}


\begin{CFaxioms}
	\item
	Full-confidence updates are idempotent.
	% For all $\phi \in \Phi$, the update $F^\top_\phi$ is idempotent.
    % That is, for all $\phi \in \Phi$,  $F^1_\phi \circ F^1_\phi = F^1_\phi$.
    That is, for all $\phi \in \Phi$,  $F^\top_\phi \circ F^\top_\phi = F^\top_\phi$.
    % That is, for all $\phi \in \Phi$ and $\theta \in \Theta$,  $F^1_\phi \circ F^1_\phi = F^1_\phi$.
    % (i.e., $F^1_\phi \circ F^1_\phi = F_\phi$).
	% Full-confidence updates are idempotent. 
	% Or,
	% equivalently,
	% % $F^1 = (\phi, \theta) \mapsto F(\phi,1,\theta): \Phi \times \Theta \to \Theta$ is a full-confidence
	% $F^\top = (\phi, \theta) \mapsto F(\phi,\top,\theta): \Phi \times \Theta \to \Theta$ is a full-confidence
	% update rule.
	\label{ax:idemp}
\end{CFaxioms}

% In curried form, $F : \Phi \to (\Theta \to \Theta)$.

% We now proceed with the formal details.
% \textbf{Update Rules.}
% Consider a space $\Theta$
% of possible belief states,
% and a set $\Phi$ of statements.
% % and a set $\Phi$ of ``statements'', i.e., the things one can have confidence in.
% % An \emph{update rule} (or more precisely, a \emph{$\Theta$-updating rule on $\Phi$})
% An \emph{update rule}, or more precisely, a \emph{$\Theta$-update rule on $\Phi$},
% is a function of the form
% \[
%     % F :  (\mathbb R \times \Phi) \to \Big( \Theta \to \Theta \Big)
%     F :  \Phi \to \Big( \Theta \to \Theta \Big)
% \]
% % which describes how to update beliefs about $X$, with the new information, at a certain level of trust.
% which describes how to (fully) update beliefs $\Theta$ with new information $\Phi$.
% and for $F$ to be an update rule, we require that , meaning that updating any belief with $\phi$ twice in a row is equivalent to single update.
% Having said that, one reading of this paper is a relaxation of this requirement.
% Here are some examples.
Once $\Theta$, $\Phi$, and any implicit structure in them is specified, there is often a natural choice of full-confidence update rule.
To illustrate, we now consider three different rules for different choices of $\Phi$.
In each case, we take the space of possible belief states $\Theta := \Delta W$ to be the set of all probability distributions over a finite set
 $W
  % = \{w_1, \ldots, w_n\}
  $ of ``possible worlds''.

\begin{enumerate}[wide, label=\textbf{\thesubsection.\arabic*}]
	\item %\textbf{Conditioning.}
	\textbf{Conditioning.}
	First, consider the case where observations are events, i.e., $\Phi := 2^W$.
	Here, the appropriate rule seems to be conditioning:
	% \[
	% \begin{aligned}
	% 	(-) \smash{\,\Big|\,} (\;\cdot\;) : \qquad 2^W &\to (\Delta W \to \Delta W) \\
	% 	A  &\mapsto (  ~\mu~~ \mapsto \mu \mid A ~),
	% \end{aligned}
	% \]
	% where $(\mu \mid A)(x) = \frac{\mu(\{x\})}{\mu(A)}$
	% in which learning $A$ maps
	% where the action of the conditional measure $\mu\mid A$ is given by $(\mu \mid A) \{w\} = \ifrac{\mu\{w\}}{\mu(A)}$.
	% where the action of the conditional measure $\mu\mid A$ is given by $(\mu \mid A)(B) = \ifrac{\mu(B \cap A)}{\mu(A)}$, provided $\mu(A) > 0$,
	starting with $\mu \in \Delta W$, the conditional measure 
	$\mu|A$ is given by $(\mu|A)(B) = \ifrac{\mu(B \cap A)}{\mu(A)}$, provided $\mu(A) > 0$.
	Note that $(\mu|A)|A = \mu|A$, so conditioning is idempotent. 
	% and may be defined arbitrarily otherwise.
	% and otherwise is just equal to $\mu$.
	% and is otherwise undefined, although for completeness
 	% equal to $\mu$.
	% Observe:
	% \begin{itemize}[nosep, leftmargin=1.2em]
	% 	\item Provided $\mu(A) > 0$, then $(\mu\mid A) \mid A = \mu \mid A$, so conditioning is a full-confidence update.
	% 	\item If $\mu(A \cap B) > 0$, then $(\mu \mid A) \mid B = \mu \mid (A \cap B) = (\mu \mid B) \mid A$, so the order that information is recieved does not matter (so long as it is consistent with one's beliefs).
	% \end{itemize}
	There are well-known issues with conditioning $\mu$ on $A$ when
	$\mu(A) = 0$, 
	and so typically this operation is left undefined. 
	% To satisfy \cref{ax:funcform,ax:idemp}, the result must either
	% be $\mu$ itself or 
	% give probability 1 to $A$.

	\item
	\textbf{Imaging.}
	% A second example of an update rule is the ``imaging'' 
	Our second example is the ``imaging''
	% approach of David Lewis \parencite{lewis1976probabilities}.
	approach of \textcite{lewis1976probabilities}.
	% , albeit in very different notation.
	% Once again, consider a finite set $W$, and belief states $\Theta := \Delta W$.
	Suppose that
	% , for some set $\Phi$, that
 	we already have a full-confidence update rule
	$f : \Phi \times W \to W$ on individual worlds, which we interpret as assigning each statement $\phi \in \Phi$ and $w \in W$ an element $f(\phi, w) \in W$ which is the unique ``world most similar to $w$, in which $\phi$ is true'' \parencite{gardenfors1979imaging}.
	In this case, idempotence of $f_\phi: W \to W$
	% amounts to the (very reasonable) requirement that the world most similar to $f_\phi w$ in which $\phi$ is true, is $f_\phi w$ itself.
	says the world most similar to $f(\phi,w)$ in which $\phi$ is true, is $f(\phi,w)$ itself.
	% From $f$, we can 
	We can then 
	% construct a full confidence update rule for $\Delta W$
	lift $f$ to a full-confidence update rule for $\Delta W$,
	% with the pushforward	ide
	by
	$%
	% \[
    	% \begin{aligned}
    		% F_\phi(\mu) &:=
    		F(\phi, \mu) 
				% &:=
				% f^{\sharp}(\mu)
    			% &= A \mapsto \mu( f^{-1}_\phi( A ))\\
    			= A \mapsto \mu(\{w : f(\phi, w) \in A\})
    	% \end{aligned}
		% \qquad
		% \qquad
		% \begin{tikzpicture}[center base]
		% 	\node[dpad0] (W) {$W$};
		% 	\node[dpad0, right=1 of W] (W') {$W$};
		% 	\node[dpad0, below right=0.2 and 0.2 of W] (Phi) {$\Phi$};
		% 	\mergearr{W}{Phi}{W'}
		% 	\node[above=1pt of center-WPhiW']{$f$};
		% 	\draw[arr2, <-] (W) to node[above]{$\mu$} ++(-1, 0);
		% 	\draw[arr2, <<-] (Phi) to node[below]{$\phi$} ++(-1.3, 0);
		% 	\draw[arr2, <-, dashed, gray] (W') to node[above]{$F_\phi(\mu)$} ++(2, 0);
		% \end{tikzpicture}
	% \]
	$,
	intuitively moving the probability of each  world $w$ to
	 % $f_\phi w$
	$f(\phi,w)$.	
	%$f_\phi w$, the world closest to $w$ in which $\phi$ is true.
	% is the pushforward measure of $\mu$ through $f_\phi$, which Lewis calls the ``image of $\mu$ on $\phi$''
	% And, since $f$ is idempotent, $F$ will be as well.
	Since $f$ is idempotent, so is $F$.


	\commentout{
	\item More generally, consider a measurable space $\mathcal W = (W, \mathcal A)$, where $W$ is a set and $\mathcal A$ is a $\sigma$-algebra over $W$, and let $\mathcal F \subset \mathcal A$ be closed under supersets in $\mathcal A$.
	% Now, let $\Theta$ be the set of conditional probabili$

	\TODO[Properly Use Conditional Probability Measure, to define on all events]

	Conditioning a probability distribution $\mu \in \Delta\X$ on an event $A \in \mathcal A$ also makes sense in this more general measure-theoretic setting, at least so long as $\mu(A) > 0$, and is given by
	% the Lebesgue integral
	% \[
	$$
		% (\mu \mid A) (B) = \frac{1}{\mu(A)} \int \mathbf 1_{B}(x)  \mathrm d\mu(x)
		(\mu \mid A) (B) = \frac{\mu(B \cap A)}{\mu(A)}
	$$
	}


	\item
	\textbf{Jeffrey's Rule.}
	% Once more, suppose that $W$ is a finite set and $\Theta := \Delta W$.
	% Next, consider a more general form of observation, in which observations themselves are probabilities.
	% Next, consider a more general form of observation, in which observations themselves are probabilities.
	% Our final example is a way in which 
	% 	people have historically tried to augment conditioning 
	% 	to allow for uncertain observations. 
	% Recall form the introduciton
	% that Jefrey's Rule, a widely used generalization of
	% conditioning
	Both of the previous approaches establish a single event
	with certainty. Jeffrey's rule aims to mitigate this
	by allowing for ``uncertain observations'', in the sense 
	that the observations can be probabilistic.
	
	% Suppose observations themselves are probabilities.
	Formally, let $\Phi$ be the set of pairs $(X,\pi)$,
	% Formally, suppose $\Phi$ consists of marginal distributions $\pi(X)$
	where $X : W \to S$ is a random variable taking values in a set $S$,
	% (i.e., some function of $W$),
	and $\pi \in \Delta S$ is a probability on
	% the possible values that $X$ can take.
	$S$.
	Jeffrey's rule is then:
	\begin{align*}
		% \mathrm{Jeffrey}_{(X,\pi)}
		% \mathrm{Jeffrey}_{\pi(X)}
		% \mathrm{J}_{\pi(\mskip-2muX\mskip-2mu)}
		% {J}_{(X,\pi)}(
		{J}((X,\pi),
		\mu) &:= \sum_{x \in S} \pi(X{=}x) \;  \mu \big|
            (X{=}x)
            % \{ w : X(w) = x \}
			% \\
			% &= A \mapsto \sum_{x \in S} \pi(X{=}x)\, \mu( A \mid X \!= x)
	\end{align*}

	When $\pi$ places all mass on some $x \in S$, Jeffrey's Rule amounts to conditioning on $X {=} x$.
	% and so it is sometimes thought to generalize conditioning so that 
	% absolute certainty is no longer necessary.
	For this reason, Jeffrey's Rule is often thought of
		as generalizing conditioning so as to allow
		for observations of lower confidence.
	 % but for other choices of $\pi(X)$,
	% For this reason, Jeffrey's Rule is sometimes often thought of as a generalization of conditioning that admits for less that complete certainty.
	However, 
	% $J_{(X,\pi)}$ 
	% $J$
	% is idempotent;
	% also, 
	$\pi$ has been fully incorporated into the posterior beliefs
	% since the marginal of $J((X,\pi),\mu)$ on $X$ is $\pi$,
	(since $J((X,\pi),\mu)(X) = \pi(X)$),
	while the prior belief 
	% about $X$
	$\mu(X)$ has been destroyed.
	In addition, $J_{(X,\pi)}$ is idempotent. 
	% Therefore $J$ is still a full-confidence update rule---just 
	% % one that handles observations that can be uncertain.
	% one that handles a different kind of observation.
	% So $J$ still makes updates with full-confidence---it just handles
	% 	observations that do not indicate high probability.
	
	Therefore, $J$ still establishes its observations with full confidence---%
		it's just that these observations can be probabilistic.
	%
	% This is another historical conflation between
	% We attribute this to a conflation between
	% We imagine that this mismatch has contributed to
	% the historical conflation between confidence and likelihood.
	We imagine that this mismatch is a result of a
	historical conflation of confidence with likelihood.

	% Let $\mu' := J_{\pi(X)}(\mu)$ be the result of applying Jeffrey's rule for $(X,\pi)$ to $\mu$.
	% % then $\pi$ will be fully incorporated (that is, $\mu'(X) = \pi(X)$),
	% Note that $\mu'(X) = \pi(X)$, so $\pi(X)$ has been fully incorporated into $\mu'$, while all information about the old prior belief about $X$ has been destroyed by the update.
\end{enumerate}





% We will later see that idempotence
We have seen how full-confidence updates are idempotent; is the reverse true as well?
That is, if an update is idempotent, then is it full-confidence? 
% Once we have the rest of our axioms, this will indeed be the case.
% After we place some mild restrictions on what can happen
% for intermediate values of confidence, this will indeed be the case.

% Trivially, the answer is no, because the identity $F^0_\phi$ is idempotent as well.
% Aside from the identities that arise from no confidence $F^0_\phi$,
 % we will see that 
% it will turn out that 
% as a consequence of our later axioms,
% if $\phi$ is nontrivial, then
% the update $F^\chi_\phi$ is idempotent iff $\chi\in\{0,1\}$.

\begin{prop}
	If $F$ satisfies
	\cref{ax:funcform,ax:zero,ax:idemp,ax:cont,%
		ax:diffble,%unnecessary
		ax:seq-for-more,ax:nopause},
 	% $\exists\phi,\chi,\theta.~$ 
	while
 	$\theta,\chi,\phi$ are such that 
	$F_\phi^\chi(\theta) \ne \theta$
	and
	$F^\chi_\phi \circ F^\chi_\phi = F^\chi_\phi$, then $\chi=\top$. 
\end{prop}

% Before we get to these axioms, or talk about what happens about 
% But in order to characterize how updates work for intermediate values of confidence,
% But before we go any further,
% Before we can make sense of those axioms,
% Some of these axioms, including \cref{ax:idemp}
% make involve sequential updates.  
% Before we articulate these axioms, we must
Before we can freely describe the axioms governing
intermediate degrees of confidence, we must
% revisit \cref{ax:funcform} 
% discuss a 
first clarify a point about sequential observations.


\subsection{Discussion on Incremental Confidence and Independence Assumptions 1}
	\label{ssec:indep-shafer}

Historically, \cref{ax:funcform} has not proved as anodyne as it looks.
Some might object that it's not possible to write such a function that is appropriate in all circumstances.
For example, Shafer argues for Dempster's rule of combination as a way of incorporating information, but is very careful to emphasize that it ought to be used only on \emph{independent} information, for reasons illustrated below.



\begin{example}
    % \label{ex:dupl}
	You have initial belief state $\theta_0$.
	Now, someone comes up to you and tells you that $\phi$ is true, a statement
		that you trust to some intermediate degree of confidence $c \notin\{ \bot, \top\}$.
	So, in accordance with \Cref{ax:funcform}, you use $F$ to transform your beliefs, partially incorporating the information to arrive at some belief state $\theta_1 := F^c_\phi(\theta_0)$.
	Immediately afterwards, your friend repeats what they just said: $\phi$ is true.
	Your confidence in the statement remains the same, and so according to
	\Cref{ax:funcform}, you again update your beliefs, arriving at $\theta_2 := F^c_\phi(\theta_1)$.
	Except in very special circumstances (e.g., you already know that $\phi$ is true, or $c \in \{\bot,\top\}$), typically $\theta_2 \ne \theta_1$.
	And yet, it seems your attitude towards $\phi$ ought to be the same whether you've heard it twice or only once.
\end{example}


Now, it's important to mention that we're not quite in the same position as Shafer.
Shafer was prescribing a concrete representation of $\Theta$ (a belief function) and a concrete update rule $F$ (Dempster's rule of combination), and so he needed to defend these choices.
% To take an analogous prescriptive stance,
We only need to defend something much more modest: we only need to defend the assumption that, if $\Theta$ and $\Phi$ properly model the relevant aspects of the scenario at hand, then there exists \emph{some} function $F$ which performs updates appropriately.
Descriptively speaking, we're also in good shape: for synthetic agents, it suffices to point out that learning algorithms represent functions, which given a state, an input, and a number of iterations (confidence), produce an output.
And, supposing that $\Theta$, $\Phi$, and $\confdom$ all capture the relevant respective aspects of a human's belief state, input information, and attitude towards it, how could it be that a human does otherwise?
%
% After all, if you recieve the same input twice, and your confidence in it has not changed, it would be best to only update once.
% There are essentially three ways to proceed.
In any case, keeping \cref{ex:dupl} in mind, here are three ways to proceed.

\begin{enumerate}[label={\textbf{I\arabic*.}},ref={I\arabic*}]
	\item \textbf{Accept Severe Limitations.} \label{approach:assume}
	Like Shafer, we could be careful
	% not to claim anything about how to update beliefs, except
	to claim nothing about the belief updating process except in the (unusual) case where
	information received is independent.
	% This severly handicaps the usefulness of .
	This would be a severe limitation to the theory, and much less necessary than it was for Shafer.
	Imagine that we are writing code that describes how a synthetic agent updates its beliefs. Shafer's approach is to package any such code with a warning against running it unless assured that observations will always be independent.
	But independence is notoriously difficult to establish; are we to simply accept that the code will not behave correctly in any realistic scenario?
	% Under what circumstances could we possibly be sure that all inputs we will be independent?

	In practice, many theoretical properties of standard statistical learning algorithms are heavily dependent on independence assumptions (most commonly, that one receives independent, identically distributed samples).
	% Despite this, such algorithms .
	This warning label does not seem to keep them from being applied in settings where practitioners readily admit samples are not really independent at all---nor indeed from performing well empirically in those settings \parencite{???}.

	% If we need to make a decision that depends on information that is not given, then
	% Shafer found himself in a position where he needed to qualify usage of the update rule


	\item \textbf{Appropriately Enrich Domains.}\label{approach:enrich}
	% For instace, the agent has been totally
	% We told a story wanting to avoid incorporating information twice.
	In \cref{ex:dupl}, it seems obvious that we ought to ignore the second copy of the information, because it has already been accounted for.
	However, this intuition is highly contingent on the implicit supposition that we \emph{know} the second input to be a replica of the first.
	Were we ignorant of the nature of the second piece of information, perhaps it would not be so unreasonable to incorporate it again, even without a proof of independence.
	% If we want our agent to do the same
	% In asking our agent to take such issues into account, it is only fair to give it access to the same
	% It seems unfair to criticize an agent for not behaving
	So, if we would like our agent to make the same decisions that we did, it seems only fair to give it access to the knowledge that we needed to get there.
	One way of doing this is to extend the belief state so that it also tracks what information has been incorporated.
	% For instance, suppose that every message

	For \cref{ex:dupl} to work, it is critical that we are able to discern that the two inputs were identical.
	As a result, it seems that the relevant description of the input information was not just $\phi$, but a pair $(\phi, \mathit{id})$ that also includes a description of its identity.
	It is also critical that we remember the identity of previously incorporated information, so we would also be better off with a belief space $\Theta$ that reflects this.
	With these two modifications, any \cofunc\ can be straightforwardly modified to avoid the issue in \Cref{ex:dupl}.


	We submit that it is always possible to enrich the space of beliefs and observations in this way to track the relevant information, to resolve the issue.
	With a few more assumptions later on, we will be able to formalize the construction we just alluded to (\cref{ex:dupl-enriched}).

	\item \textbf{An Incremental Interpretation of Confidence.}
		\label{approach:interperet}
	Finally, we can get around the issue by interpreting a confidence $c \in \confdom$ not as an absolute measurement of confidence, but rather an incremental one.  This means viewing $c \in \confdom$ as the degree of \emph{additional} confidence we have in $\phi$, beyond whatever we have already incorporated into our beliefs.

	% This proposal immediately raises some important questions.
	This proposal might be concerning.
	% First, how
	One might worry that it's harder to make sense of ``incremental confidence'' than an absolute notion.
	How ought we to numerically describe the confidence of an update?
	Suddenly this becomes much more subjective, for to assign a number, not only must we describe how much trust we have in the new information, but we must also take history or current belief state into account.
	Furthermore, the words ``incremental'' and ``additional'' suggest that we will need a formal description of how to aggregate confidences---%
	the very concept of which we will need to defend.
	% Indeed, such a way of combining confidences will ultimately play a large role for us.
	% It will turn out that such a way of combining confidences
	% Fortunately, it will turn out that

	Even modulo these concerns, the incremental interpretation still leaves us in a strictly better place than we were before.
	%  with \cref{approach:assume}.
	To begin, in situations where inputs are independent (i.e., the only cases where we would have been allowed to apply the \cofunc\ according to \cref{approach:assume}), the two notions coincide.
	More explicitly: if the new information $\phi$ is independent of everything we've previously seen, then an absolute measurement of our confidence in it is no different from a measurement of how much we ought to increment it from having no confidence.
	Already, though, we can do more.
	In the situation described by \cref{ex:dupl}, for instance,
	the second utterance induces no \emph{additional} confidence ($\bot$), and so applying $F$ with no confidence clearly gives the desired result of ignoring the new information (per \cref{ax:zero}).
%
%
	And even in general, the prospect of having to numerically estimate a fuzzy quantity seems more promising than red tape requiring that $F$ only be used (in good conscience) on independent information.
	% In some ways, this approach it is not so different from directly requiring independence of inputs, there are several aspects of this framing, that in our view, make it more pallatable.
	% First,
	% In cases
	% This raises some questions. Even if we already had a

\end{enumerate}

We would like to point out that readers who find it reasonable to ignore inputs you have no confidence in (per \cref{ax:zero}) have implicitly accepted either \cref{approach:assume} or \cref{approach:interperet}, as the next example shows.


\begin{example}
	Suppose you first hear $\phi$ from a partially trusted source, and incorporate it into your beliefs appropriately.
	Then, the same source sends you a second message, which is obviously spam.
	In an absolute sense, you now have no confidence ($\bot$) in anything this source tells you, including (in retrospect) both messages.
	It seems appropriate to excise $\phi$ from your belief state in response, rather than leaving your belief state unchanged, as \cref{ax:zero} would prescribe.

	Note that in this scenario, while it seems that we ultimately have no confidence in $\phi$, it does not seem to be the case that we have no incremental confidence in $\phi$.
	Rather, the incremental confidence seems to be the inverse of the original confidence.
\end{example}

We state our results with the incremental interpretation of confidence, with the understanding that all of our results also admit a more conservative reading, in which confidence is measured absolutely, and also all applications of the function $F$ are independent. 


% \subsection{Independence of Inputs}
% \subsection{Sequential Observations and Independence of Inputs}
\subsection{Sequential Observations and Input Independence}

% When does it make sense to combine observations?
Our characterization of full-confidence requires us to think 
about the effect of making updates with $F$ in sequence.
% But does it always make sense to combine information? 
Does it always make sense to apply $F$ repeatedly to model sequential observations?
\cref{ax:funcform} says yes.
But anodyne as it looks, analogues of \cref{ax:funcform} have
historically been controversial, especially for intermediate values of confidence.

% It turns out \cref{ax:funcform} is not as anodyne as it
% looks, now that we can describe intermediate confidence levels. 
While Shafer endorses Dempster's rule of combination, for example,
he is careful  
% to emphasize that it is only appropriate for combining 
% to add a qualification:
to emphasize a limitation: it only applies when combining
\emph{independent} pieces of information.
% The following example illustrates why.
% only to do endores 
% for reasons we will now see.
% for reasons illustrated by the following example.

\begin{example}\label{ex:dupl}
	We have initial belief state $\theta_0$.
	Now, a friend tells us $\phi$ is true, 
	which we trust to some intermediate degree 
	% which we 
	% $c \notin\{ 0, 1 \}$.
	% $0 < c < 1$. 
	% $c \in (0,1)$.
	$c \in (\bot,\!\top)$.
	So, in accordance with \Cref{ax:funcform}, we use $F$ to update our beliefs, partially incorporating $\phi$ to arrive at a belief state $\theta_1 := F^c_\phi(\theta_0)$.
	Immediately afterwards, the friend repeats herself: ``$\phi$ is true''.
	By \Cref{ax:funcform}
	we must again update our beliefs with $F$,
	and if our confidence remains the same,
	then we arrive at $\theta_2 := F^c_\phi(\theta_1)$.
	% Except in very special circumstances (e.g., you already know that $\phi$ is true, or $c \in \{\bot,\top\}$), typically $\theta_2$.
	Unless we were already certain of $\phi$, then $\theta_1 \ne \theta_2$,
	% And yet, it seems our attitude towards $\phi$ ought 
	% to still be $\theta_1$.
	yet it seems our beliefs ought
	% to be the same whether we've heard it twice or only once.
	% to be the same whether or not we hear the echo.
	to be $\theta_1$ whether or not we hear her the second time.
\end{example}


% So, one resolution to these problems is to follow Shafer, and
% place strict limitations on when one should use $F$ to update beliefs:
The analogue of Shafer's resolution would be to 
% significantly
restrict our endorsement of $F$,
  % would mean stating that it is only appropriate to use $F$ 
% and be clear that
so that \cref{ax:funcform} applies only when receiving information that is, in some sense, independent of our present beliefs.
% Indeed, even \cref{ax:zero} is suspect in such cases.
Unfortunately, inputs are seldom completely
	independent of our prior beliefs---%
	% worse, there is often no way to determine whether . 
	and worse, there is often no way to know
	whether they are or not. 
% But we must update our beliefs nonetheless. 
% But we still have to choose what to do with our beliefs.
Yet unknowably entangled information still comes, and we must still choose
	% whether and how to modify our beliefs.
	what to make of it.
	
In one sense, Shafer's approach gives us the answer we wanted in 
\cref{ex:dupl}, by preventing us from making the second update. 
But in another, truer sense, it is so restrictive that 
% we cannot say anything at all about our final beliefs:
it says nothing about our final beliefs: 
the second utterance is not independent, so $F$ does not apply, 
	% and we simply cannot say what beliefs we end up with. 
	and we simply cannot say anything about our beliefs in the end.
	
% The traditional approach in learning theory takes the opposite position,
% noting that in practice 
% In practice, data set curation to try to satisfy this assumption. 


% To take another example, many of the theoretical guarantees we have 
% A broader, perhaps more relevant illustration of the same phenomenon:
% Another example of this phenomenon can be found in machine learning.
% An analogous issue arises in the context of \cref{ex:classifier}.
An analogous issue about input independence
 	often arises in machine learning,
	as touched on briefly in \cref{ex:classifier}.
% Recall, from , that classifiers 
% many of the theoretical guarantees we have 
Many theoretical guarantees about the correctness of
learning algorithms assume that samples
% are drawn independently from a fixed distribution.
are drawn \emph{independently} from a fixed distribution. 
% In fact, this independence assumption underlies the
Such assumptions underlie the standard notion of
learnability \parencite{valiant1984theory},
% a major branch of statisical learning theory.
and indeed the bulk of statistical learning theory.
% , so that each observation is an independent source of information.  
%
% Many learning theorists seem to take view at the opposite extreme:
% Many learning theorists seem to take an oppostite position: 
% In practice, many learning theorists 
%
But many learning theorists take a pragmatic
stance:
sure, the guarantee only holds for independent inputs, an
assumption that is almost certainly false---but we use our learning
algorithm anyway, and find that in practice inputs are independent
enough that there is no need for concern.
% But how shall we quatify this? 
% It is obvious that it is possible for this approach to be problematic;
% It is obviously possible for this not to work out perfectly;
% Obviously, it is possible for this not to be ideal;
Obviously, this approach can go awry;
% Applying this approach 
% applying it to \cref{ex:dupl} means
in \cref{ex:dupl}, it amounts to
	adopting $\theta_2$
	% and fully accepting 
	with full acceptance
	that we will, on occasion,
	% `inappropriately' inflate information that isn't independent.
	``inappropriately'' duplicate updates.
	% taking belief $\theta_2$ without losing any sleep. 
	% and adopting $\theta_2$ without thinking too hard about it.

% We argue that, by adopting an \emph{incremental} conception
% of confidence, we can get the best of both worlds. 
We argue that, by viewing confidence as an \emph{incremental}
quantity, it is possible to
	unconditionally endorse \cref{ax:funcform}
	and use $F$ in all cases, without making silly, avoidable mistakes.
% In this case, we still update using $F$ 
In \cref{ex:dupl}, for example, we update using $F$ 
	% to update our beliefs 
	upon hearing the second utterance
	(even though it is not independent),
	but we make this update with zero (incremental) confidence,
	because it gives us no \emph{additional} information beyond
	what we already knew. 
%
If we restrict our attention to inputs that are independent of our beliefs, then
incremental confidence is no different from absolute confidence.
 % But if it is only partially 
But the real power of this approach comes when we observe an input
	that is not fully independent of our belief state. 
For example, what if the second ``$\phi$ is true'' in \cref{ex:dupl}
	is not a mistake, but communicates emphasis? 
It is clearly not independent of what we've heard
	(one can re-emphasize $\phi$ only having already articulated $\phi$),
	but perhaps we can still
	% estimate the degree to which it \emph{is} independent
	quantify how much incremental confidence it carries for us.
%
% The drawback is that it becomes harder to give a precise algorithmic
% prescription for how to choose a numerical degree of confidence,
% without essentially making the same independence assumptions. 

For the remainder of the paper, we take this incremental view 
for convenience, with the understanding that everything we say also 
admits a conservative reading by restricting \cref{ax:funcform} 
only to apply when $\theta$ and $(\phi,\chi)$ are independent.
% Either way, this 
% to independent information.
	
\commentout{
This is the approach we took implicitly
% How then, shall we make sense of
when arguing for \cref{ax:seq-for-more}: even though the second update 
($F^{\chi_3}_\phi$) is not independent of the first one 
($F^{\chi_1}_\phi$),
$\chi_3$ measures the \emph{additional} confidence in $\phi$ we wish
to incorporate, or put another way,}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Other Confidence Domains}

% \begin{phaseout}
To describe a degree of partial incorporation, we will need a domain of possible confidence values.
Mostly, we will stick to using real numbers, but it will clarify things to stay more general for now, so that we can see the properties we actually need.
% Formally, we represent the possible degrees of confidence by a group
Formally, a \emph{confidence domain} is a tuple $(\confdom, \oplus, \bot, \top)$,
where $(\confdom, \oplus, \bot)$ is a monoid with operation $\oplus$ and neutral element $\bot$, and $\top \in \confdom$ is an absorbing element---i.e., $\top \oplus c = \top$ for all $c \in \confdom$.
In terms of confidence, we interpret the components as follows:

\begin{itemize}%[]
	\item
	The elements of $\confdom$ are the possible degrees of confidence.

	\item
	The monoid operation $\oplus : \confdom \times \confdom \to \confdom$ describes how to combine two (independent) confidences in some statement, to obtain a new confidence in that statement.

	\item The neutral element $\bot \in \confdom$ indicates ``no confidence'' in an observation.
		%
		% The fact that we want to ignore information we have no confidence in
		% gives corresponds to the group identity laws: that
	The monoid identity laws, which assert that
		$\bot \oplus c = c = c \oplus \bot$ for all $c \in \confdom$,
	reflect the intuition that we should ignore untrusted information in combining confidences.
		% in which case we should ignore the information at hand,
	\item The absorbing element $\top$ indicates ``full confidence''.
	The absorption property corresponds to the intuition that definitive information that $\phi$ is true, when combined with other (perhaps less reliable) information that $\phi$ is true, is still definitive.
\end{itemize}
% \end{phaseout}

In this more general setting, the analogue of additivity (\cref{ax:additivity}) becomes:
\begin{CFaxioms}
	\item For all $c_1, c_2 \in \confdom$,~
		% $F^{c_1}_\phi \circ F^{c_2}_\phi = F^{c_1 \oplus c_2}_\phi$
		$F^{c_1}_\phi \circ F^{c_2}_\phi = F^{c_1 \oplus c_2}_\phi$
		% \hfill \textbf{(additivity)} \label{ax:additivity}
		\hfill \textbf{(combination)} \label{ax:gen-combine}
\end{CFaxioms}
\Cref{ax:gen-combine}
 % looks like it could be problematic, but it
 simply states that \cofunc s respect the combination operation.
If we fix an assertion $\phi$, then an update with confidence $c_1$ followed by an update with confidence $c_2$ is equivalent to an update with confidence $c_1 \oplus c_2$, which is, by definition, the result of combining confidences $c_1$ and $c_2$.
On its own, so long as we have the freedom to choose $\confdom$, \cref{ax:gen-combine} has no teeth.


\begin{prop} \label{prop:free-additivity}
	If $F: \Phi \times \confdom \times \Theta \to \Theta$ satisfies \cref{ax:zero,ax:idemp}, then we can construct a new update
	% function for $\Theta$ on $\Phi$, that behaves in exactly the same way, but \emph{is} additive, but with the altered confidence domain
	function for $\Theta$ on $\Phi$, that accepts confidences in an extended confidence domain $\confdom' \supseteq \confdom$, and has the same behavior as $F$ on the original confidence domain, yet also satisfies \cref{ax:gen-combine}.
\end{prop}
\begin{lproof}
Consider the new confidence domain
\[
	\confdom' := \Big(\, \big\{ \text{finite lists } [c_1, \ldots, c_n]
		\text{ with each } c_i \in \confdom \big\},
		% \text{ such that } c_i \in \confdom \text{ for all } i = 1, \ldots, n
		\quad
		% \text{list concatenation}~::,
		::,
		\quad
		[\,],
		\quad
		[\top]
		\,
	\Big),
\]
whose monoid operation ``$::$'' is list concatenation, except that it collapses instances of $\top$, i.e.,
\[
	[c_1, \ldots, c_n] :: [d_1, \ldots, d_m]
	 := \begin{cases}
		 % [\top] & \text{ if } c_i = \top \text{ for some $i$ or $d_j=\top$ for some $j$}\\
		 [\top] & \text{ if } \top \in \{c_1, \ldots, c_n,d_1, \ldots,d_m \} \\
		 [c_1, \ldots, c_{n}, d_1, \ldots, d_m] & \text{otherwise.}
 \end{cases}
\]
Concatenating the empty list $[\,]$ on either side has no effect.
By construction, for all $L \in \confdom'$, we have $[\top] :: L = [\top] = L :: [\top]$.
Finally, $::$ is clearly associative, so $\confdom'$ is also a confidence domain.

The new update rule for this confidence domain is given by:
	\[
		AF^{[c_1, \ldots, c_n]}_\phi (\theta)  :=
				(F^{c_n}_\phi \circ \cdots \circ F^{c_1}_\phi) (\theta).
	\]
$AF$ has the same behavior as $F$ on the elements that correspond to the original confidence domain, since
$
	AF^{[c]}_\phi(\theta) = F^c_\phi(\theta),
$
and it is additive by construction, since
\begin{align*}
AF^{[d_1, \ldots, d_m]}_\phi ( AF^{[c_1, \ldots, c_n]}_\phi (\theta) )
		&=
			F^{d_m}_\phi \circ \cdots \circ F^{d_1}_\phi (
			F^{c_n}_\phi \circ \cdots \circ F^{c_1}_\phi (\theta))\\
		&= (F^{d_m}_\phi \circ \cdots \circ F^{d_1}_\phi \circ
		F^{c_n}_\phi \circ \cdots \circ F^{c_1}_\phi) (\theta) \\
		&= AF^{[c_1, \ldots, c_n, d_1, \ldots, d_m]}_\phi (\theta) \\
		&= AF^{[c_1, \ldots, c_n] :: [d_1, \ldots, d_m]}_\phi (\theta).
\end{align*}
\end{lproof}% We are primarily interested in the case where confidence can be measured as a real number,
For convenience of measurement, and so that we may better study confidence as a \emph{smooth} interpolation between ignoring and fully incorporating information, we shall focus primarily on cases where confidence can be measured as a real number.
% We now define two confidence domains that are real numbers between 0 and 1.
We now consider two such confidence domains.

\begin{itemize}
	\item
	% Another confidence domain we could consider
	First, we consider the zero-one confidence domain
	\[
		\ZO := \Big(~ [0,1],
			% \quad a \star b := a b,
			\quad a \oplus b :=
					% 1- (1-a)(1-b) =
					a + b - ab,
			\quad 0,
			\quad 1 ~\Big),
	\]
	which uses the same numerical endpoints as probability;
	a value of zero represents no confidence, a value of one represents full confidence.
	For the purposes of updating, we may interpret a confidence of $a \in \ZO$ as the fraction of the way between ignoring and fully incorporating information.
	This motivates the definition of the operator $\oplus$.
	If you go $90\%$ of the way to fully incorporating some information $\phi$, and then $50\%$ of the remaining way, then in total you have gone $90\% + 50\%(100\%-90\%) = 0.9 + 0.5 - (0.9)(0.5)$ of the way to fully incorporating $\phi$.

	% In this domain
	% Clearly, 1 is neutral and 0 is absorbing element.\end{itemize}
	\item
	% First, we consider the monoid of positive extended real numbers under addition.
	We now introduce a second confidence domain based on the real numbers,
	which is mathematically cleaner, if
		% at first
		more difficult to interpret numerically in absolute terms.
	\[
		\Rplus :=
			\Big([0, \infty) \cup \{\infty\},
				\quad +,
				\quad 0,
				\quad \infty
				~\Big)
	\]

	% The use of addition as the combination operator means that independent measurements add, which in turn makes it
	The use of addition as the combination operator makes it particularly natural to speak of linear combinations of inputs.
	% Here are some examples.
	This point is best illustrated by example.

	\begin{itemize}
		\item \textbf{Voting.}
		Suppose the elements of $\Phi$ correspond to candidates in an election.
		In a sense, the number of votes a candidate receives is a measure of how much confidence the electorate has in them---a candidate who receives no votes is ignored, while a candidate who receives all of the votes should be listened to exclusively.

		It's hard to say much about the raw number of votes a candidate receives in absolute terms, in part because it depends on the number of votes received by other candidates, and also on how many votes you will receive in the future.
		% Nevertheless, it is still m
		Nevertheless, if we are collecting votes, it is especially natural to weight candidates by the total number of votes behind them.
		% Similarly, this way of counting fractional votes.
		This way of measuring confidence also applies without change to measure fractional votes.

		\item \textbf{Chemical Reactions.}
		Suppose that we have a mixture of nano-bots.
		Each nano-bot has some type $\phi \in \Phi$, and has the effect of turning matter into bots of type $\phi$.
		For every $\phi \in \Phi$, let $\beta_\phi$ be the concentration of bots of type $\phi$, say measured in number of bots per liter of solution.
		In some sense, $\beta_\phi$ is a measure of how much ``confidence'' the mixture has in $\phi$---if the concentration is zero, then that bot type may be ignored, and if all particles are of type $\phi$, then no other bot type need be considered.


	\end{itemize}

\end{itemize}


\begin{prop}
	$\ZO$ is isomorphic to $\Rplus$, but there is no canonical choice of isomorphism.
\end{prop}
\begin{lproof}
	For every $k > 0$, we can construct an isomorphism $\varphi_k: \ZO \to \Rplus$ explicitly by $\varphi_k(a) := - k \log (1 - a)$.
	It is a homomorphism, since
	\[
		\varphi_k(a \oplus b) = - k \log \big( (1-a)(1-b) \big) = - k \log (1-a) - k \log (1-b) =
			\varphi_k(a) + \varphi_k(b),
	\]
	while $\varphi_k(0) = 0$ (so it preserves the identity) and $\varphi_k(1) = \infty$ (so it preserves the absorbing element).
	The inverse mapping can also be given explicitly by $\varphi_k^{-1}(r) := 1 - \exp( - r / k)$, which is also a homomorphism for the same reasons as above.
\end{lproof}

\section{More Examples}

\subsection{Examples with Stories}


Next, an example in which we have an additive flow update rule,
that is not an optimizing update rule, and hence has no loss representation.

\begin{example}[Weighted Average]
    Suppose we receive vectors $\phi \in \mathbb R^n$, 
    say estimates of a quantity from different sources.
    Suppose further that our belief state
    $(\mat x,w) \in \Theta$ consists of a current estimate
    $\mat x$ of the quantity of interest, and a weight $w$ of the total internal confidence in the estimate.
    In other words:    
	\begin{align*}
		\Theta = \mathbb R^n \times [0,\infty];
		\qquad
        \text{and}
        \qquad
		\Phi = \mathbb R^n %; \qquad
        .
	\end{align*}

	Updating proceeds by taking a weighted average of the previous estimate and the new input, weighted by their respective confidences, which is captured by:
	\begin{align*}
		F^\beta_{\mat y}(\mat x, w) &=  \left( \frac{ w \mat x + \beta \mat y}{w + \beta} , w + \beta \right)
		\qquad\text{and}\qquad
		F^{\beta}_{\mat y}(\mat x, \infty) = (\mat x, \infty)
	\end{align*}
	It is additive, since
	\begin{align*}
		&F^{\beta_2}_{\mat y} \circ F^{\beta_1}_{\mat y}(\mat x, w) \\
		&= \left( \frac{ (w + \beta_1) \frac{ w \mat x + \beta_1 \mat y}{w + \beta_1} + \beta_2 \mat y}{(w + \beta_1) + \beta_2}, (w  + \beta_1) + \beta_2 \right) \\
		&= \left( \frac{  w \mat x + (\beta_1 + \beta_2) \mat y}{w + (\beta_1 + \beta_2)}, w  + (\beta_1 + \beta_2)\right)
		= F^{\beta_1 + \beta_2}_{\mat y}(\mat x, w).
	\end{align*}
	And it is clearly differentiable, with a simple calculation revealing that
	$ F'_{\mat y}(\mat x, w) = \left( \frac{\mat y - \mat x}{w}, 1\right) $.

	Observations:
	\begin{itemize}
		\item The update rule cannot be extended differentiably to states $\theta = (\mat x, w)$ with $w = 0$.
			Intuitively, we need to have some estimate with positive confidence to update beliefs in a differentiable way.
        This is related to the fact that plain empirical risk minimization (ERM) is unstable, but stable with even a small amount of regularization.
			% It is also similar to the fact that one needs a prior in order to do
			% In this case, the prior may be arbitrarily weak.
		\item The certainties are given by
		\[
			\lim_{\beta \to \infty} F^{\beta}_{\mat y}(\mat x, w) = (\mat y, \infty)
		\]
		% \item In this case,
		\item $F$ is commutative, invertible, and symmetric with respect to permutation of the dimensions, but it is not conservative: if we had $U(\mat x, w, \mat y)$ twice differentiable such that $\nabla_{\mat x, w} U = F'$, then we would have
        \begin{align*}
        \frac{\partial^2}{\partial w \partial x_i} U &=
			\frac\partial{\partial w} \frac{y_i - x_i}{w} = \frac{x_i - y_i}{w^2},\quad\text{but}
            \\
		\frac{\partial^2}{\partial x_i \partial w} U
			&= \frac\partial{\partial x_i} 1 = 0    
        \end{align*}
		violating Clairaut's theorem (which asserts equality of mixed partials).
		Therefore, $F'$ cannot be written as the gradient of a function,
        and so $F$ is not an optimizing update rule. \qedhere
	\end{itemize}
\end{example}

Next, a toy example that showcases an assortment of other features and themes that can be captured with our definition of confidence.

\begin{example}\label{ex:jugo}
% \def\Gpressed(#1,#2){\mathsf{G}_{#1}\text{-}\mathit{pressed}(\mathsf{G},#1,#2)}
% \def\Gpressed(#1,#2){\mathit{pressed}(#1,\mathsf{G},#2)}
\def\pressed(#1,#2,#3){\mathit{pressed}(#1,\mathsf{#2},#3)}
Jugo is an impartial juror.
Like the other jurors, she has two buttons in front of her,
labeled $\sf G$ and $\sf N$.
% The button labeled $\mathsf G$ increases the probability 
% of a guilty verdict, while the button labeled $\mathsf N$ increases
% the probability of a not-guilty verdict. 
Her instructions are to listen to evidence, and press $\sf G$ to 
increase the probability of a guilty verdict, and $\sf N$ 
to increase the probability of a not-guilty verdict.

More concretely, the system works as follows.
There are $J$ jurors, labeled $\{1, \ldots, J\}$;
let 
% $G_j(t)$ be
$\pressed(j,B,t)$ be
a variable that is equal to one if juror $j \in \{1, \ldots, J\}$
is pressing button $\mathsf B$ at time $t$, and zero otherwise.
% ; symmetrically, let $N_j(t)$ indicate whether $j$ is pressing the $\mathsf N$ at time $t$.
The ``belief state'' of this automated system is
a single number $g \in [0,1]$, representing the probability of a guilty verdict.
When a single juror presses $\mathsf G$, $g$ approaches 1
exponentially, and if they instead press $\sf N$, $g$ decays to zero.
In the first case ($\sf G$ is pressed) the system evolves according to 
$\frac{\mathrm dg}{\mathrm dt} = (1-g)$
while in the second, 
$\frac{\mathrm dg}{\mathrm dt} = -g$.
The first is the vector field associated with the $\mathsf G$ button,
and the second is the vector field associated with $\sf N$. 
The total effect is then the sum of the vector fields of all buttons, across all jurors, that are currently being pressed:
% in which case $g(t) = 1 + (g(0) - 1)e^{-t} $.
% Concretely, their dynamics are governed by:
\[
	\frac{\mathrm dg}{\mathrm dt} = 
	\sum_{j = 1}^J \Big(
		% G_j(t)
		\pressed(j,G,t)
		(1-g) 
		-
		% N_j(t)
		\pressed(j,N,t)
		(g)
	\Big)
		~,
	% \begin{cases}
	% 	(1 - p)	& \text{button $\uparrow$ pressed} \\
	% 	-p & \text{button $\downarrow$ pressed}
	% \end{cases}	
\]
so that $g$ exponentially approaches $1$ when more $\mathsf G$ buttons are pressed than $\mathsf N$ buttons,
and symmetrically, exponentially approaches $0$ when more $\mathsf N$ buttons than $\mathsf G$ buttons are pressed.
At the end of the trial, the defendant is convicted with probability
equal to the final value of $g$. 

Let $\phi$ represent a piece of evidence suggesting guilt, presented by the
prosecution from time $t_1$ to time $t_2$,
and suppose for now that only buttons labeled $\mathsf G$ are pressed
in this interval.
% Many aspects of this scenario can be understood in terms of confidence.
%
	% The amount of time that 
	% What is the confidence of the system in that evidence, as it understands it from the Jurors?
% First, suppose that only $\mathsf G$ buttons are pressed
	 % between $t_1$ and $t_2$. 
% in between times $t_1$ and $t_2$.
The system measures $j$'s confidence in $\phi$ by
\[
	w_j := \!\int_{t_1}^{t_2}\!\! \pressed(j,G,t) \,\mathrm d t 
	= \text{ total time $j$ presses $\mathsf G$ during $\phi$.}
\]
Note that $w_j = 0$ if and only if $j$ does not press any buttons,
which (a) indicates that $j$ does not trust the evidence $\phi$, 
and (b) communicates this fact to the system, by telling it to ignore 
the evidence. 
Note that this is an additive representation of confidence, since
pressing the button for four seconds, and then three more later, is
by definition the same as pressing it for seven. 
While the maximum possible confidence of $w_j$ is $(t_2 - t_1)$,
this system does not allow a juror to express \emph{full} confidence in $\phi$
because no finite amount of $\mathsf G$-pressing will result in a 
guilty verdict with probability one; it is always possible to increase
the value of $g$ through additional evidence. 

% Now, recall that ,
Altogether, the system's confidence in $\phi$ can be measured
as the unique value $W$ for which
\begin{align*}
	% \int_{t_1}^{t_2} W (1-g(t)) \mathrm d t = \int_{t_1}^{t_2} \sum_{j=1}^J G_j(t) (1-g(t)) \mathrm d t,
	\int_{t_1}^{t_2} W (1-g(t)) \mathrm d t =
	 g(t_2) - g(t_1),
	% \\
	% W = \frac{1}{t_2-t_1 - \int_{t_1}^{t_2}g(t) \mathrm dt} \int_{t_1}^{t_2} \sum_{j=1}^J G_j(t) (1-g(t)) \mathrm d t
\end{align*}
which, so long as only $\mathsf G$ buttons are pressed, equals
$W := \sum_{j} w_j$, so this measure of confidence is additive 
across jurors as well as across time. 
This is appropriate, since the jurors are independent and 
not communicating with each other.
% This is a weighted sum of $w_i's$
As before, $W = 0$ if and only if no juror presses any buttons between times $t_1$ and $t_2$,
indicating zero trust lent to $\phi$. In such a case, the system ignores $\phi$ in updating its beliefs.
And just as no individual juror can send a full-confidence update to the system,
	the system cannot receive a full-confidence update from the jurors as a whole.
	
The picture gets significantly more complicated if we consider the possibility
that jurors might press the $\mathsf N$ button. For example, if $\phi$, which was intended
as evidence of guilt, has the effect of getting jurors to press $\mathsf N$, there is a sense
in which they have \emph{negative} confidence in $\phi$, since the belief update happened in the opposite direction of what $\phi$ represents; rather than \emph{no} trust, this represents \emph{distrust}. 
Small negative updates are always possible except at the boundary of belief space, but in this paper, we focus almost entirely on positive confidence updates.

The introduction of the second button also uncovers a significant source of complexity:
unlike \cref{ex:prob-simple,ex:classifier,ex:shafer}, 
the order that evidence is presented matters, when there is more than one possible response to it.
Evidence presented later has a larger effect,
meaning that this system exhibits a recency bias.
% (This lack of commutativity corresponds to possibility that,
%	as a Lie algebra, the bracket may not be identically zero.)

Now consider a variant of this system that does
not trust all jurors equally; rather, it trusts each juror $j$
to a degree $\beta_j \in [0, \infty]$, and now $g$ evolves
according to
\[
	\frac{\mathrm dg}{\mathrm dt} = 
	\sum_{j = 1}^J \beta_j \Big( \pressed(j,G,t)\, (1-g) 
		- g \, \pressed(j,N,t) \Big).
	% \begin{cases}
	% 	(1 - p)	& \text{button $\uparrow$ pressed} \\
	% 	-p & \text{button $\downarrow$ pressed}
	% \end{cases}	
\]
In this case, the system can be said to have trust $\beta_j$ in juror
$j$, since $j$'s buttons are ignored when $\beta_j = 0$. 
When $\beta_j = \infty$ (an expression of full confidence in $j$),
$g$ immediately jumps to 0 when $j$ presses 
$\mathsf N$, or to 1 if $j$ presses $\mathsf G$ (unless canceled by
another full-confidence juror pressing the opposite button).
If all jurors have full confidence, then the verdict of this system is
a majority vote at the last moment a button was pressed. 
Thus, the weights attached to weighted combinations are (additive) expressions 
of confidence as well. 
\end{example}

\Cref{ex:jugo} illustrates how a (sufficiently nice) vector field, 
	which is simpler than a smooth path for every starting point, is enough to define
an additive notion of confidence, via its integral curves.
It may seem strange to define confidence via a vector field, which does not mention confidence at all---but in a sense, it works because a vector field captures precisely everything about the update \emph{except} for the confidence. We do this formally in \cref{sec:vecrep}. 



\begin{example}[The General Kalman Filter]
\label{ex:kalman-general}
\def\estx{\hat{\mat x}}
Suppose we are interested in modeling a
dynamical system whose state is a vector
$\mat x \in \mathbb R^n$, and we receive observations
% where $H$ is a matrix relating observations to state,
% $\mat z = H \mat x + \boldsymbol\xi \in \mathbb R^m$ where $H \in \mathbb R^{m \times n}$ is models a linear relation between states and observations,
$\mat z$ that are assumed to be a linear function of $\mat x$,
% plus independent centered Gaussian noise of known variance $R$.
plus Gaussian noise.
% $\mat z = H \mat x + \boldsymbol\xi$,
% where $H \in \mathbb R^{m \times n}$, called the \emph{observation matrix} is a linear function, and $\boldsymbol\xi \sim \mathcal N(0, R)$ models random noise
% (which we assume is drawn independently from Gaussian with mean zero and covariance $R$).
% Suppose further that we re
In many engineering disciplines, the
standard way to track this information is
% the Kalman filter \parencite{kalman1960new}.
the \citeauthor{kalman1960new} filter [\citeyear{kalman1960new}].
It prescribes belief state $(\estx, P)$,
where $\estx \in \mathbb R^{n}$ is our current estimate of
$\mat x$, and
% $P \in (-\infty,\infty]^{n\times n}$,
$P \in \mathbb R^{n\times n}$
% called a precision matrix
% is a description of our uncertainty,
% is a positive semidefinite matrix.
is a covariance matrix encoding our uncertainty about $\estx$.
% (%
% Intuitively, this corresponds to a belief
% that
% $\mat x \sim \mathcal N(\estx, P)$.
% is normally distributed with mean $\estx$ and variance $P$.)
(Intuitively, this amounts to a belief that
$\mat x \sim \mathcal N(\estx, P)$
is normally distributed with mean $\estx$ and variance $P$.)
% (symbolically: $\mat x \sim \mathcal N(\estx, P)$).

% Suppose we now recieve an observation $\mat z$.
Suppose we now receive an observation
$\mat z = H \mat x + \boldsymbol\xi \in \mathbb R^m$,
from a particular sensor,
where $H \in \mathbb R^{m \times n}$
is called the sensor's \emph{observation matrix},
and
the noise
$\boldsymbol\xi\sim \mathcal N(0,R)$
has known variance $R$.
For example, perhaps $\mat x = (x_1, x_2)$ is the location of an aircraft,
and we have a sensor that
% observes its first coordinate (plus noise), meaning that
measures its first coordinate (plus noise $\xi$).
This sensor's observation matrix
$H$ then represents
the map $(x_1, x_2) \mapsto x_1$, and we observe $\mat z = x_1 + \xi$.
How should we update our beliefs in response
% to $\mat z$
% to $z$
% to this new information
\unskip ?

Once again, the right answer depends on how much trust we have
in the sensor, and ranges from ignoring $\mat z$ to deferring entirely
to it.
% taking the measurement to be truth.
In our example, the latter means replacing $x_1$ with $z$,
% and setting
% the covariances $p_{11}, p_{21}, p_{12}$ to zero.
% every covariance involving
% % the first coordinate
% $x_1$
% (i.e., all elements of $P$ except for bottom right one)
% to zero.
% $P := \begin{matrix}
%     R & 0 \\ 0 & p_{22}
% \end{matrix}$
% and adopting a covariance $P$ with the variance of $x_2$ unchanged, the variance of $x_1$ equal to variance $R$ of the sensor's noise.
% and changing covariances to reflect that $x_1$ has the same variance as the sensor,
% but is uncorrelated with $x_2$.
and altering $P$ so that $x_1$ has the same variance as the sensor,
and is uncorrelated with $x_2$.
% The
In general,
the Kalman filter measures confidence in the observation
% with a quantity $K$ called \emph{Kalman gain},
with a matrix $K \in \mathbb R^{n \times m}$ called \emph{Kalman gain},
which is used to compute posterior beliefs
$(\estx', P')$
% as follows:
according to:
% we also have an objective quantity on which to base
% our assessment of the
\begin{align*}
\estx' &= \estx + K (\mat z - H \estx) \\
P' &= (I - K H) P (I - K H)^{\sf T} + K R K^{\sf T} .
% = (I - KH) P,
% \begin{pmatrix}
%     \estx \\ P
% \end{pmatrix}
% &\gets
% \begin{pmatrix}
%     \estx + K (\mat z - H \estx) \\
%      (I - K H)^{\sf T} P (I - K H) + K R K^{\sf T}
% \end{pmatrix}
% \\
% &\text{where}~~ K = P H^{\sf T} (HPH^{\sf T} + R)^{-1}
\end{align*}
% we now argue that $K$
% We argue that it measures confidence in $\mat z$.
% can be veiewed as a measuring our confidence in the observation.
% acts as a ``blending factor''
% Because of the first update equation, $K$
Often introduced as a ``blending factor'', $K$ is
% not so different from
similar to $\alpha$ in \cref{ex:prob-simple},
especially
% in the one dimensional case in which
% except that it might simultaneously tracks
% and interpolates between our previous beliefs and fully adopting our new observation.
% And, like $\alpha$, Kalman gain may be seen as measuring our
% 	confidence in the observation.
\def\onedvar{\sigma^2}%
% \def\onedvar{p}
for a sensor that directly measures a one-dimensional quantity
$x$ (i.e., with $H = 1$). In this case, our belief state is a pair
$(x, \onedvar) \in \mathbb R \times [0, \infty)$,
% and the equations simplify to
% and the update equations simplify to:
and our posterior after observing $z$ simplifies to:
\begin{align*}
x'
	% &\gets x + K (z - x)
	&= (1-K)\, x + (K)\, z
	% &= x + \frac{\onedvar}{\onedvar + r}(z-x)
	\\
% p &\gets (1 - k)^2 p + k^2 r .
({\onedvar})' &= (1 - K)^2 \,\onedvar + (K)^2\, R.
	% = (1-k) p.
	% &= \frac{pr}{p+r}
 % \\
% &\qquad k = \frac p {p + r}
\end{align*}
Observe how $K$ linearly interpolates between our prior estimate of the mean
and the new observation,
and in a sense ``quadratically'' between our prior variance estimate $\onedvar$ and
the variance $R$ of the noise $\boldsymbol\xi$.

% Unlike the previous examples,
% Much more directly than in our previous examples,
More directly than in previous examples,
we can also say something prescriptive about how best to select
a degree of confidence.
This is made possible by two assumptions:
\begin{itemize}
\item
% We know the stochastic process by which observations $\mat z$ are generated,
% including a number quantifying the reliability of observations (i.e., the variance $R$ of the noise $\boldsymbol\xi$ added to observations).
We know how observations $\mat z$ are generated,
and, in particular, can objectively quantify
their reliability
	% (i.e., the variance $R$ of the noise $\boldsymbol\xi$).
\unskip,
with
the variance $R$ of the added noise $\boldsymbol\xi$.
\item
% We want to choose $K$ such that the trace of $P'$ is minimal.
% (We won't fully defend this choice, but
%  note that if $n=m=1$, then the trace of $P'$ is the mean
%  square error of our estimate.)
% We would like
Our objective is to select $K$ so as to minimize
(the sum of the eigenvalues of)
our resulting variance
% $P'$
$P' =
\Ex_{\mat x \sim \mathcal N(\estx',P')}[ (\estx' - \mat x)(\estx' - \mat x)^{\sf T}]$
% the posterior (co)variance $P'$ to be as small as possible (in the L\"owner order).
% (Without fully defending this choice, we remark that if $n=m=1$, this quantity is the mean-square error of our estimate.)
\unskip, which happens to be
the multivariate analogue of mean-square error.
\end{itemize}
Under these assumptions, the optimal Kalman gain is
% $K = P H^{\sf T} (HPH^{\sf T} + R)^{-1}$
\begin{equation}\label{eq:opt-K}
% K_{\text{opt}}
K = P H^{\sf T} (HPH^{\sf T} + R)^{-1}
\end{equation}
\parencite[p. 146]{brown1997introduction}, and so
$K$ is typically chosen this way in practice
\parencite{kalmanfilter.net}.
% In principle any matrix is possible,but really
% the $(i,j)^{\text{th}}$ entry of $K$ .
% , in the sense that the posterior beliefs will minimize
% the expected mean square error
% Like before, this measure of confidence can be measured as lying between two extremes.

With this in mind, let's revisit the state update
equations in the case of extreme values of confidence.
If $K = \mat 0$, which is
prescribed by \eqref{eq:opt-K}
iff the noise $\boldsymbol\xi$ has unbounded variance (in every coordinate),
then the update leaves $(\estx, P)$ unchanged.
Intuitively, so much noise is added to observations $\mat z$
that we have no trust that incorporating them will improve our estimate
of $\mat x$.
%
At the other extreme,
if there is no noise
($R = \mat 0$),
% ($ R = \lim_{\delta\to 0^}\delta I$),
then the optimal Kalman gain
% is $K = H^{-1}$ and
% $K = H^{+}$,
% must satisfy $H K $
$K = H^+$ is the pseudo-inverse of $H$,
and
we end up with posterior beliefs
% $(H^{-1}\mat z, \mathrm{diag}(\infty,\cdots,\infty))$,
% $(H^{-1}\mat z, \mat 0)$
% $(H^{-1}\mat z + \estx^{\perp}, P^{\perp})$,
% where $\estx^\perp$ is the component of $\est x$ perpendicular to the
% column space of $H$, and $P^\perp$
$(\estx', P')$ such that
$H \estx' = \mat z$ and $H P' H^{\sf T} = \mat 0$
and so fully trust the new observation $\mat z$
(but is otherwise as close to $(\estx, P)$ as possible).
\end{example}

Just as mentioned in the discussion after \cref{ex:kalman1d}, 
the general Kalman filter (\cref{ex:kalman-general}) features
three distinct kinds of uncertainty:
\begin{enumerate}
	\item $R$, the objective (un)reliability of the observation $\mat z$
		(as measured by variance), a feature of the environment;
		\label{item:measnoise}
	\item $P$, the subjective ``internal confidence'' in the current
	 	estimate $\hat{\mat x}$, a feature of the current belief state; and
		\label{item:internal-conf}
	\item $K$, the subjective confidence in the observation $\mat z$,
		a feature of what one knows about where $\mat z$ comes from and
		how it relates to one's confidence.
			\label{item:kalman-gain}
\end{enumerate}
% Our focus is on quantities like \ref{item:kalman-gain}, which
% we call confidences, and how they relate to posterior beliefs.
% We will later see that quantities like \ref{item:internal-conf},
% which reflect the certainty present in a particular belief state,
% can be viewed as aggregating the confidences of past observations.
% We have less to say about how one ought to select a numerical value of confidence in general,
% but when one has access an objective measure of (un)reliability such as \ref{item:measnoise},
% it ought to play a major role.


% \section{Exploring Axiom Violations}
\section{Axiom Ablation}
	\label{sec:axiom-ablation}

\subsection{Nonmonotonic Updates (Dropping \ref{ax:seq-for-more})}
    % strangely, doesn't work with \cref, or \ref* or \cref*.
Without \cref{ax:seq-for-more}, we could have a situation like the one below,
in which resuming an update could take you to a completely different position
if you were interrupted in the middle of the path. 

\begin{center}
\begin{tikzpicture}
    \coordinate[label=left:$\mu$] (A) at (0,0);
    \coordinate[label=right:$\mu|\phi$] (C) at (3,0);
    \coordinate[label=above:$F_\phi^{.6}(\mu)\mid\phi$] (C') at (2,1);
    \draw (A) to[bend right] node[outer sep=0,pos=0.6](B){} (C);
    \fill 
        (A) circle (0.1)
        (B) circle (0.1)
        (C) circle (0.1)
        (C') circle (0.1);
    \draw (B.center) to (C');
\end{tikzpicture}
\end{center}


\subsection{Halting Updates (Dropping \ref{ax:nopause})}
The next two examples further explore the motivation behind \cref{ax:nopause}.

\begin{example}
Suppose $\Theta = \mathbb R \cup \{-\infty, \infty\}$, $\Phi = \{\star\}$ is a singleton, and
\[
    F^t_{\!\star}(\theta) = \theta + t + \sin(t).
\]

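This is interesting because it ``pauses'': its velocity $1 + \cos(t)$ vanishes at $t = \pi$ (and at every odd multiple of $\pi$), even though it is nonzero at $t = 0$.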
Clearly $F$ satisfies \cref{ax:zero,ax:idemp,ax:cont,ax:diffble}.
It also satisfies \cref{ax:seq-for-more}, since it is increasing in $t$
(but it is not additive \cref{ax:additivity}).
%
Its vector field is given by 
\[
    \frac{\partial}{\partial t} F^t_{\!\star}(\theta) \Big|_{t=0}
     = \big( 1 + \cos(t) \big) \Big|_{t=0} = 2,
\]
so its unique additive representation is
$
    {^+}F^t_{\!\star}(\theta) = \theta + 2 t.
$
\end{example}

\begin{example}
Now suppose $\Theta$ and $\Phi$ are as before,
but now
\[
    F^t_{\!\star}(\theta) = \theta + t - \sin(t).
\]
As before, it satisfies \cref{ax:zero,ax:idemp,ax:cont,ax:diffble,ax:seq-for-more},
but now its vector field is
\[
    \frac{\partial}{\partial t} F^t_{\!\star}(\theta) \Big|_{t=0}
     = \big( 1 - \cos(t) \big) \Big|_{t=0} = 0,
\]
so its unique additive representation does nothing:
$
    {^+}F^t_{\!\star}(\theta) = \theta. 
$
\end{example}

\subsection{}




\section{Extra Properties of Update Rules}

\subsection{Invertible Update Rules}

\begin{CFaxioms}
	\item For all $\phi\in\Phi$, and $\beta \in \mathbb R$, the update
	$F^{\beta}_{\phi}: \Theta \to \Theta$ is invertible.
	\hfill\textbf{(Invertibility)} \label{ax:invert}
\end{CFaxioms}


This effectively partitions $\Theta$ into two


\begin{prop}
	If $F$ is a differentiable and invertible update rule (i.e., satisfies \cref{ax:zero,ax:additivity,ax:invert,ax:diffble}), then for all $\beta \in \mathbb R$, $\phi \in \Phi$, the function
	% $F^\beta_\phi : \Delta\X \to \Delta\X$
	$F^\beta_\phi : \Theta \to \Theta$
	is a diffeomorphism, and its inverse is given by $F^{-\beta}_\phi$, in the sense that
	\[
		F^{-\beta}_\phi( F^{\beta}_\phi (\mu) ) = \mu = F^{\beta}_\phi( F^{-\beta}_\phi (\mu) ).
	 \]
\end{prop}


% Together with strong additivity, we get
% \begin{prop% We are primarily interested in the case where confidence can be measured as a real number,
%     If $F$ is an update rule satisfying \cref{ax:additivity,ax:invert},
%     then any update prescribed by $F$ (or sequence thereof) takes positive distributions to positive distributions.
%     %
%     Concretely, for all $\beta$ and $\phi$,   $\mu \in \mathrm{Int}(\Delta\X)$ if and only if $F^{\beta}_\phi(\mu) \in \mathrm{Int}(\Delta\X)$.
%     If $F$ further satisfies \cref{ax:sufficiency, ax:diffble}, then
%     % \[
%         $F^\beta_\phi$ is a diffemorphism of $\mathrm{Int}(\Delta\X)$.
%     % \]
% \end{prop}


As a consequence,
\begin{coro}
	If for any $\beta < \infty$ there exist $\mu, \phi, A$ such that
	$\mu(A) > 0$  but $F^{\beta}_\phi(\mu)(A) = 0$, then $F$ is not invertible.
\end{coro}


\subsection{Truth}
% We have seen how idempotence seems to be a feature of full-confidence updates. 
Implicit in a full-confidence update rule is a notion of truth:
given a full-confidence update rule $F^\top$, we define
a binary relation ${\models_{F^\top}} \subseteq \Theta \times \Phi$ 
between belief states and propositions, by 
$\theta \models_{F^\top} \phi$
(read: $\phi$ is true in $\theta$)
iff $F^\top_\phi(\theta) = \theta$.
%
It is easy to verify that in our previous examples,
this gives the appropriate notion of truth.

\begin{itemize}
	\item \textbf{Conditioning.}
	$\mu \models_{(|)} \phi \iff \mu(\phi) = 1$.
	% {\color{red}%
	% or $\mu(\phi) = 0$. 
	% }
	\item \textbf{Imaging.}
	For $w \in W$, $w \models_f \phi$ iff $\phi$ is true in $w$. 
	Meanwhile, $\mu \models \phi \iff \mu(\phi) = 1$.
	\item \textbf{Jeffrey's Rule.}
	$\mu \models_J \pi(X) \iff \mu_X = \pi$.
	That is, if the marginal of $\mu$ on the variable $X$ equals $\pi$. 
\end{itemize}


\subsection{Enriching Belief Space to Track Confidence}

\begin{example}\label{ex:dupl-enriched}
Suppose $F$ is an additive update rule. Then, we can explicitly construct a resolution to the problem posed in \cref{ex:dupl} by defining enriched spaces
\begin{align*}
	\Phi' &:= \Phi \times \Big\{ \text{ identities }~ \mathit{id}~ \Big\}\\
	\Theta' &:= \Theta \times
		\Big\{ \text{histories } L = [(\phi_1, \mathit{id}_1, \beta_1), \ldots, (\phi_n, \mathit{id}_n, \beta_n)] \Big\}
\end{align*}
and new \cofunc\ $G$ by
\begin{align*}
	 G^{\beta}_{(\phi,\mathit{id})}(\theta, L) & :=
		\begin{cases}
		\Big( F^{\beta- \sum_{i}\beta_i \mathbbm1[(\phi_i,\mathit{id}_i) = (\phi, \mathit{id})]}_{\phi}(\theta),~
			 L :: (\phi,\mathit{id}, \beta)
		 \Big)
			 &\text{ if } \beta \ne \bot \\
		(\theta, L) &
			   \text{ if } \beta = \bot
	\end{cases}
\end{align*}
\end{example}


\subsection{More on Path Update Rules}
Since each $\phi$ corresponds to a path

\begin{defn}[Homotopic update rules]
	$\phi \sim_F \psi$  iff 
	they behave the same way for full confidence (that is, 
	$F^1_{\phi}(\theta) = F^1_{\psi}(\theta)$ for all $\theta \in \Theta$)
	and  there exists a continuous function
	$H : \Theta \times [0,1] \times [0,1] \to \Theta$ such that,
	for all $\theta \in \Theta$ and $\chi \in [0,1]$,
	\begin{enumerate}[nosep]
		\item $H(\theta, \chi, 0)= F(\theta, \chi, \phi)$,
		\item $H(\theta, \chi, 1)= F(\theta, \chi, \psi)$
	\end{enumerate}
	and for all $s \in [0,1]$,
	\begin{enumerate}[nosep,resume]
		\item $H(\theta, 0, s) = \theta$;
		\item $H(\theta, 1, s) = F^1_{\phi}(\theta) = F^1_{\psi}(\theta)$,
		the last two of	which are the same by assumption. \qedhere
	\end{enumerate}
\end{defn}

As usual, homotopy is an equivalence relation.

For example, the Dempster-Shafer update rule \eqref{eq:ds-prob} is homotopic 
to the linear update rule from \cref{ex:prob-simple}. 

\subsection{Linear Update Rules}

% In some sense, ALL update rules are linear in $\bar\Phi$ by definition.

There are many definitions of linear update rules:
\begin{defn}\label{ax:linear}
Let $F$ be a differentiable update rule on $\Theta$. We say that $F$ is \textellipsis
\begin{itemize}
\item \emph{linear} if $\Theta$ is a vector space over $\mathbb R$, and the
vector field $F'_\phi$ is a linear operator, i.e., for all $a, b \in \mathbb R$, we have that
\[ F'_\phi(a \theta_1 + b \theta_2) = a F'_\phi(\theta_1) + b F'_\phi(\theta_2). \]

\item \emph{cvx-linear} if $\Theta \subset \mathbb R^n$ is a convex set, and, for all $a \in [0,1]$, we have that
\[ F'_\phi(a \theta_1 + (1-a) \theta_2) = a F'_\phi(\theta_1) + (1-a) F'_\phi(\theta_2). \]

\item \emph{$\mathcal L$-cvx-linear} if $\Theta \subset \mathbb R^n$ and $F$ is an optimizing update rule with a loss representation $\mathcal L$ linear in its first argument, i.e.,
\[
    \mathcal L(a \theta_1 + (1-a) \theta_2, \varphi) = a \mathcal L(\theta_1, \varphi) + (1-a) \mathcal L(\theta_2, \varphi).
\]
\end{itemize}
% $F'_\phi(\theta) = \mathrm{V}_\phi \theta$ for some linear operator $V_\phi \in \mathbb R^{n \times n}$.
% $F'_\phi(\theta) = \mathrm{V}_\phi \theta$ for some linear operator $V_\phi$.
\end{defn}

\begin{prop}
If $F$ is $\mathcal L$-cvx-linear, then it is also cvx-linear.
\end{prop}

In fact, the first condition is much stronger;
\begin{prop}
If $F$ is a nontrivial $\mathcal L$-cvx-linear optimizing UR, then $\Theta$ equals the cone generated by the rays $\{ F'_\varphi\theta : \theta \in \Theta, \varphi \in \Phi \}$. In particular, if there is some $\theta$ such that $0$ is in the interior of the convex hull $\mathrm{conv}(\{F'_\phi\theta\}_{\phi \in \Phi})$, then $\Theta = \mathbb R^n$.
\end{prop}

% Implicit in this definition is the supposition that the integral curves generated by the differential equations, started at any point $\theta \in \Theta$, are

\begin{prop}
% If $F$ is a  differ
Every linear update rule is of the form
$
    F^{\beta}_\phi(\theta) =  \theta^{T} \exp(\beta V)
$,
where $\exp(\beta V)$ is the matrix exponential.%
    \footnote{Concretely, if $V = U^T \mathrm{Diag}(\lambda_1, \ldots, \lambda_n) U$ is an eigendecomposition of $V$, then $\exp(\beta V) = U^T \mathrm{Diag}(e^{\beta\lambda_1}, \ldots, e^{\beta\lambda_n}) U$.}
\end{prop}

\begin{prop}
A linear update rule $F$ is commutative iff, for every pair of statements  $\phi, \phi' \in \Phi$, the
matrices $V_\phi$ and $V_{\phi'}$ commute.
\end{prop}




\clearpage
\section{Proofs}

\begin{linked}{prop}{synthetic-bel}
	For every learner $\Lrn$, there exists a 
	believer $\Bel$ such that the pair $(\Lrn, \Bel)$ satisfy 
	\cref{ax:monotone,ax:truth-is-enough,ax:effectiveness}.
\end{linked}
% \recall{prop:synthetic-bel}
\begin{lproof}\label{proof:synthetic-bel}
	Given $\Lrn: \Phi \times \confdom \times \Theta \to \Theta$,
	define 
	\[
	\Bel(\theta,\phi) := 
	\begin{cases}
		\top & \text{ if } \exists \chi.~\Lrn(\phi,\chi,\theta) = \theta \\
		\bot & \text{ otherwise } 
	\end{cases}
	\]
\end{lproof}


\recall{theorem:add-reparam}
\begin{lproof}\label{proof:add-reparam}
    
	
\end{lproof}





\begin{linked}{lemma}{Boltz-fields}
	% Fix $\varphi$.
	% Let $f(X) := \exp(-\beta U(X,\varphi))$, and $g(X) := U(X,\varphi)$.
	% With the metric on $\Delta\X$ induced by its embedding as a simplex in $\mathbb R^{|X|}$, we have that
	% When parameterized as a simplex,
	The associated vector field is given by
	%
	% \begin{align*}
	$
		% (\mathrm{Boltz}\,U'_\varphi p)_{x} = p(x) (\Ex_p[U(X,\varphi)] - U(x,\varphi) )
		% (\mathrm{Boltz}\,U)'_\varphi p = p (\Ex_p[U_\varphi] - U_\varphi )
		\Boltz[U]'_\phi (\mu) = \mu \odot (\Ex_\mu[U_\phi] - U_\phi )
	$.
	% \end{align*}
\end{linked}

\recall{prop:bolz-props}
\begin{lproof}\label{proof:bolz-props}
	\textbf{Commutativity.}
	For some normalization factors $Z, Z', Z''$, we have:
	\begin{align*}
		 F^\beta_\phi( F^{\beta'}_{\phi'}(\mu))
		 &= F^\beta_\phi \Big( \frac{1}{Z} \,\mu\, \exp(- \beta' c_{\phi'}) \Big) \\
		 &= \frac{1}{Z'} \frac{1}{Z} \,\mu\, \exp(- \beta' c_{\phi'}) \exp(- \beta c_{\phi}) \\
		 &= \frac{1}{Z''} \,\mu\, \exp(-\beta' c_{\phi'} - \beta c_\phi)
	\end{align*}
	which is the same expression when we exchange $(\phi, \beta)$ and $(\phi', \beta')$.
\end{lproof}

\recall{prop:bolz-fields}
\begin{lproof}\label{proof:bolz-fields}
	Let $f(X) := \exp(-\beta U(X,\varphi))$, and $g(X) := U(X,\varphi)$.
	\begin{align*}
		\mathrm{Boltz}'_\varphi(p) &= \frac{\partial}{\partial \beta} \mathrm{Boltz}^\beta_\varphi(p) \Big|_{\beta=0} \\
	\intertext{\TODO[TODO: finish typesetting algebra]}
		&= x \mapsto
			p(x) \frac{f(x)}{\Ex_p[f]}
				\left(\Ex_p\left[ \frac{f}{\Ex_{p}[f]} g\right] - g(x) \right)
				% {\exp(-\beta U(x,\varphi))}
				% {\Ex_{}}
				\Big|_{\beta=0}
				\\
		&= \frac{pf}{\Ex_p[f]^2}
			\left(\Ex\nolimits_p\left[ f g\right] - g \Ex\nolimits_{p}[f] \right)
			\Big|_{\beta=0} \\
		&= x \mapsto p(x) (\Ex\nolimits_p[g] - g(x)) &
			\text{since $f(X) = 1$ when $\beta=0$}
	\end{align*}
	As a sanity check, note that the sum over all components is
	\[ \sum_{x \in X} ((\mathrm{Boltz}\,U)'_\varphi\, p)_x
		 = \sum_{x \in X} p(x) (\Ex\nolimits_p[g] - g(x))
		 = \Ex\nolimits_p[ \Ex\nolimits_p [ g ]] - \Ex\nolimits_p [g] = 0,
	 \]
	 so indeed it lies within the tangent space.
\end{lproof}

\section{Dumping Ground}

\subsection{Continuity in Priors Discussion}

Ideally, the posterior would be continuous in our initial
beliefs as well, which suggests
% similar priors typically result in similar posterior beliefs.
% This would allow us to strengthen \cref{ax:cont} to something simpler:
% This suggests
a simpler strengthening of \cref{ax:cont-and-smooth}:
% \begin{LrnAxioms}[nosep]
% 	\item
% 	[L{\the\numexpr\value{LrnAxiomsi}\relax}${^\prime}$]
% 	$\Lrn_\phi :\confdom \times \Theta \to \Theta$ 
% 	is continuous
% 		(resp. differentiable)
% 	for all $\phi \in \Phi$.
% 	\label{ax:cont-strong}
% \end{LrnAxioms}
that
% $\Lrn_\phi :\confdom \times \Theta \to \Theta$ 
$\Lrn_\phi$ 
also be continuous (resp. differentiable) as a function of $\theta$. 
%
% Unfortunately,
% % \cref{ax:cont-strong} 
% that is too strong to handle our examples at full confidence.
% In the probabilistic case, for instance:
Yet this is too much to ask for, in the probabilistic case, at high confidence. 

% Axiom \cref{ax:cont-strong} says more---it says that the posterior 
% belief is also continuous in the prior beliefs, 
% which also seems appropriate. But this assumption has significant bite.
% \commentout{%
% actually, this shows a problem with defining 
%
% \begin{example}
% 	Again let $W$ be a finite set, and choose disjoint non-empty subsets
% 	$A, B \subset W$ with $A \cap B = \emptyset$.
% 	Let $p\ne q$ be two distinct distributions over $W$ supported
% 	on $A$, and $d$ be one suppoerted on $B$. Now, consider 
% 	% and consider a sequence $(\mu_i)_{i \in \mathbb N}$ of positive probability 
% 	% distributions over $W$ whose limit 
% 	% is the point mass $\delta_w$ on a particular world $w \in W$.
% 	% $\mu^*$ has support $A \subsetneq W$ (i.e., $\mu^*(A)=1$).
% 	the two sequences of probability distributions
% 	\[
% 		\Big(p_n= (1-e^{-n}) d + (e^{-n}) p \Big)_{n \in \mathbb N}
% 		,
% 		\qquad
% 		\Big(q_n = (1-e^{-n}) d + (e^{-n}) q \Big)_{n \in \mathbb N},
% 	\]
% 	both of which have limit $d$. But every $p_n | A = p$ while every $q_n |A = q$, so
% 	now \cref{ax:cont-strong} implies that 	
% \end{example}%
\begin{prop}
	% \label{prop:no-continuous-condition-ext}
	%joe3: I don’t understand what you mean by this.
	% There is no continuous extension of conditioning to a function
	% $F$ satisfying \cref{ax:cont-strong}.
	There is no extension of conditioning that satisfies \cref{ax:cont-strong}.
	%
	% That is, if $(W, \mathcal F)$ is a measurable space $\Phi = \mathcal F$,
	% and $\Theta$ consists of all probability measures on $(W, \mathcal F)$, then
	% there is no continous function $F : $
	% In particular, 
	That is,
	% if $\Theta = \Delta W$ and $\phi\subset W$ is an event,
	for $\phi\subsetneq W$,
	there is no continuous function
	$F_\phi : \Delta W \times [0,1] \to \Delta W$
	such that $F_\phi(\mu, 1) = \mu|\phi$ when $\mu(\phi) > 0$. 
	% nor even one whose restriction to  $ [0,\epsilon) \times \Phi \times \Theta \to \Theta$
	% is continuous, for $\epsilon > 0$.
\end{prop}
% This is a consequence of the fact that there's no 
% continuous extension of conditioning that handles
% observations of events that have probability zero.
%
Intuitively, though, this is just an edge case; we can still get continuity
if we never observe an event we believe has probability zero. 
% So, rather than insist that updates are always continuous in our priors, 
% Rather than insist that updates always be continuous in our priors,  we simply take note of a set of priors for which 
Rather than insisting on this stronger axiom or giving up on it entirely, we can get something in between 
% with a proposition instead of an axiom
% with a proposition in place of the axiom: 
% learning a particular $\phi$ is continuous. 
with the following proposition.


\begin{linked}{prop}{maximal-continuous-theta}
	% For all $\phi \in \Phi$,
	% there is a maximal open set $\Theta_\phi \subseteq \Theta$ such that
	% the restriction 
	%
	% Given $\phi \in \Phi$, 
	% let $\Theta_\phi \subseteq \Theta$ be the maximal set 
	For all $\phi \in \Phi$, 
	there is a maximal open set 
	$\Theta_\phi \subseteq \Theta$ such that
	the restriction
	$
	% F_{\phi} |_{\Theta_\phi} : 
	\Lrn_{\phi} |_{\Theta_\phi} : 
		% [0,1) \times \Theta_\phi \to \Theta
		[\bot,\!\top) \times \Theta_\phi \to \Theta
	$		
	of 
	% $F_\phi$
	$\Lrn_\phi$
	to $\Theta_\phi$ is continuous. 	
\end{linked}
% \begin{linked}{defn}
% 	Let $\Theta_\phi$ 
% \end{linked}
In each of our examples, $\Theta_\phi$ consists of those
belief states that do not flatly contradict $\phi$.
In \cref{ex:prob-simple}, 
for instance, \cref{prop:no-continuous-condition-ext,prop:maximal-continuous-theta}
imply that $\Theta_\phi = \{ \mu \in \Delta W : \mu(\phi) > 0\}$
is the set of distributions $\mu$ on which conditioning on $\phi$ is well-defined.
% \footnote{
% For those familiar with the basic anatomy of an ML system: 
% in \cref{ex:classifier}, if $\phi=(x,y)$, then $\Theta_{\phi}$ is the set of weights for which the gradients $\nabla_{\theta}\ell(f_\theta(x), y)$ of the loss function $\ell$ are finite.
In \cref{ex:classifier}, if $\phi=(x,y)$, then $\Theta_{\phi}$ is the parameter space in which the gradients $\nabla_{\theta}\ell(f_\theta(x), y)$ of the loss function $\ell$ are finite.
 % }
 

\subsection{Useful Facts}
Updating with confidence $\alpha$ with LIN (as in \cref{ex:prob-simple}) starting from prior $P$ is equivalent to updating with additive confidence $t$ with the Boltzmann update rule (with $V_A = \mathbbm 1_A$), if and only if 
\begin{align*}
	\alpha &= \frac{e^{t}-1}{e^{t}-1 + \frac{1}{P(A)}}
		= \frac{\chi}{\chi + \frac{1-\chi}{P(A)}}
		= \frac{1}{1 + \frac{1-\chi}{\chi  P(A)}}
	% 		= \frac{c}{c - } 
	% \alpha &= 1 - \frac{1}{\Ex_{P}[ \exp(t \mathbbm 1_A)]} \\
	% 	&= 1 - \frac{1}{P(A) e^{t} + 1 - P(A)}
\end{align*}
where $\chi = 1-e^{-t}$ is the multiplicative form of the Boltzmann confidence. 
For symmetry, let $\beta = -\log(1-\alpha)$ be the additive form of $\alpha$, so that $\alpha = 1-e^{-\beta}$.
Then the two updates are equivalent iff 
\begin{align*}
	\frac{1-\alpha}{\alpha} = \frac{1}{P(A)} \cdot \frac{1-\chi}{\chi}
		\quad\iff\qquad
	e^{\beta} - 1 = P(A) \Big( e^{t} - 1\Big).
\end{align*}
