

% , at a certain level of trust.
% We are interested in . A \emph{}
% Each of these update rules takes its
If we take a step back, fully incorporating information is really quite extreme.
% For agents that uses conditioning, for instance, incorporation is permanent.
% Observing information
An agent that updates with conditioning, for instance, is forever committed to fully believing $A$ and, consequently, learns nothing from observing $A$ again in the future.
% Humans don't work this way. The effectiveness of flash cards as a learning tool demonstrates this clearly: if we were using an update rule, two cycles through a deck of flash cards would be no different from one.
Clearly humans are not like this.
Similarly, artificial neural networks are trained with many incremental updates, and cycle through training data more than once.
Indeed, this is one of the biggest differences between modern machine learning techniques and older rule-based ones: modern algorithms update parameters little-by-little, rather than fully incorporating input information.
% Once an agent that uses conditioning incorporates $A$, it is forever committed to believing $A$, and as a side effect, there is no point to making
How shall we alter our picture to account for less extreme belief alterations, in which information is only partially incorporated?
This is where confidence comes in.
% This is where other value confidence comes in.
% Humans don't always update our beliefs with update functions.
% Often there seems to be value in learning the same thing more than once---or, put another way---in updating with confidence.


% To that end, we now consider a domain of possible values of confidence, which describes a degree of incorporation.
% To that end,


% We now define a \emph{\cofunc} to be a function

Let $\confdom$ be the set of possible confidences, which, for now, we will take to be the interval $[0, 1]$.
% We are now in a position to consider confidence when updating.
We are now in a position to take confidence into account in our updates.
As before, our first axiom is that we can capture the updating process in functional form.

\begin{CFaxioms}
	\item[CF0]
	There exists some function
	% $F : \Theta \times \Phi \to \Theta$,
	\[
		F : \confdom \to ( \Phi \to ( \Theta \to \Theta) )
	\]
	which, given a confidence $c$ and new information $\phi$, in addition to a prior belief state $\theta$, produces the belief state $F^c_\phi\theta$ that corresponds to the result of observing $\phi$ with confidence $c$ in state $\theta$. \label[CFaxiomsi]{ax:funcform}
\end{CFaxioms}

% Although not incontrovertable, this seems like a reasonable requirement.
% After all, if
% Although not incontrovertable, this seems like a reasonable requirement.
% It is not so important that the function be deterministic---we can make it a non-deterministic or probabilistic without too much hassle---the real purpose of \cref{ax:funcform} is to assert that all of the information we need to compute the next belief state is contained in either
% % (1) our previous belief state, (2) the new information, or (3) our confidence in it.
% in our prior belief state $(\theta)$, the new information $(\phi)$, or our attitude towards it ($c$).



% We submit that any information that is relevant to that final belief state ought to be present in one of these places. For instance, if we've already heard this same information before, this fact should either be present in our beliefs $(\theta)$, in the observation $(\phi)$, or in the description of our confidence in it ($c$).


% which, given an incremental confidence $c \in \mathbb \confdom$, returns an ``incremental'' update rule, i.e., a function with the same type as an update rule, except possibly non-idempotent.
% which, given a confidence $c \in \mathbb \confdom$, returns an ``incremental'' update rule,
% i.e., a function with the same type as an update rule, except possibly not idempotent.
% i.e., a function $F^c : \Phi \to (\Theta \to \Theta)$ that may not be idempotent.
Given a confidence $c \in \confdom$ and a statement $\phi \in \Phi$, we write
 % Given a piece of information $\phi \in \Phi$, , we write
% $F^\beta_\phi : \Delta\X \to \Delta X$
$F^c_\phi : \Theta \to \Theta$
for the update prescribed by the \cofunc\ $F$.
% Furthermore, we will insist that \cofunc s .
Furthermore, we will insist that \cofunc s respect our interpretation of confidence at the two extremes.
% , especially the following two.
\begin{CFaxioms}
	% \item \textbf{(zero)} $F^{0}_A(\Pr) = (\Pr)$
	% \item  $F^{0}_A  = {1}_{\Delta\X}$. (That is, $F^{0}_A(\Pr) = \Pr$ for all $\Pr \in \Delta\X$.)
	% \item  $F^{0}_\phi  = {1}_{\Delta\X}$. (That is, $F^{0}_\phi(\Pr) = \Pr$ for all $\Pr \in \Delta\X$.)
	%     \hfill \textbf{(zero)} \label{ax:zero}
	\item
		% $F^{\bot}_\phi  = {\mathrm{Id}}_{\Theta}$.\\
		% (That is, $F^{\bot}_\phi(\theta) = \theta$ for all $\theta \in \Theta$.)
		For all $\theta \in \Theta$ and $\phi \in \Phi$, $F^{\bot}_\phi(\theta) = \theta$.
		% \hfill \textbf{(no confidence)} \label{ax:zero}
		% \hfill \textbf{(zero)} \label{ax:zero}
		\hfill \textbf{(neutrality)} \label{ax:zero}
	% \item $F^{\beta_1}_A \circ F^{\beta_2}_A = F^{\beta_1 + \beta_2}_A$
	% \item $F^{\top} : \Phi \to (\Theta \to \Theta)$ is an update rule, i.e.,
	%     the funciton $F^{\top}_\phi : \Theta \to \Theta$ is an idempotent.
	\item
		% For all $\phi$,
		For all $\phi$,
		% $F^\top_\phi$
		$F^\top_\phi : \Theta \to \Theta$
		is an idempotent update.\\
		Equivalently, $F^\top: \Phi \to (\Theta \to \Theta)$ is an update rule.
		% the funciton $F^{\top}_\phi : \Theta \to \Theta$ is an idempotent.
% If $F$ is a \cofunc, then by \cref{ax:idemp}, $F^{\top}$ is an update rule, and we call $F$ a ``refinement'' of the update rule $F^{\top}$.
		\hfill \textbf{(certainty)} \label{ax:idemp}\\
		We call $F$ a \emph{refinement} of the update rule $F^\top$.
% \end{CFaxioms}
% The next axiom,
% \begin{CFaxioms}
	% \item For all $\beta_1, \beta_2 \in \mathbb R_{\ge 0}$,~
	%
	% \item For all $c_1, c_2 \in \confdom$,~
	%     $F^{c_1}_\phi \circ F^{c_2}_\phi = F^{c_1 \oplus c_2}_\phi$
	%     % \hfill \textbf{(additivity)} \label{ax:additivity}
	%     \hfill \textbf{(combination)} \label{ax:additivity}
\end{CFaxioms}
\Cref{ax:zero} captures the intuition that we should ignore information in which we have no confidence, while \cref{ax:idemp} formalizes the intuition that full-confidence updates act as we imagined.




% \Cref{ax:additivity} states that, for all $\theta$, the function $c \mapsto F^c_\phi\theta : \confdom \to \Theta$ is a group homomorphism.
 % a confidence of $\top$ indicates that we are fully incorporating information into our beliefs.




\begin{phaseout}
and so for most of this paper, we take $\confdom := \Rplus$ to be the group of extended nonnegative real numbers under addition.
With this choice of confidence domain, \cref{ax:additivity} begins to have more bite, although, as we will see, the effect is more to pin down a coherent system of measurement, and does not appear to restrict modeling expressiveness.
%
%
% Below is a concrete representative example of a \cofunc\ with our standard confidence domain $\mathbb R_+$.


Here are some more abstract examples of \cofunc s, with confidence domain
$\confdom := \mathbb R_+$.
\begin{enumerate}
\item
Once again, suppose $W$ is a finite set,
$\Theta := \Delta W$, and $\Phi := 2^W$.
Here are two natural \cofunc s for this scenario, both of which are refinements of conditioning.
\begin{itemize}
	\item
	$\displaystyle
		(F1^c_A \mu)(B) = (1-e^{-c}) \mu(B|A) +  e^{-c} \mu(B)
	$
	\item
	$\displaystyle
		(F2^c_A \mu)(B) \propto \mu(B|A)^{(1-e^{-c})} \mu(B)^{e^{-c}}
	$
\end{itemize}
The first \cofunc, $F1$, linearly interpolates between the result of ignoring the information contained in the event $A$ (i.e., leaving the belief state $\mu$ unchanged) and conditioning on $A$.
By contrast, $F2$ does a similar interpolation, but multiplicatively.

\item
	% \textbf{Neural Networks Updates.}
	% \textbf{Machine Learning.}
Suppose that $\Theta$ is the set of possible parameter settings for a neural network, which aims to predict an element of $Y \subset \mathbb R^{m}$ given an input from $X \subset \mathbb R^{n}$.
So, for each $\theta \in \Theta$, we have a function $f_\theta : X \to Y$, and for fixed $x \in X$, the function $\theta \mapsto f_\theta(x) : \Theta \to Y$ is differentiable.

% $\{ f_\theta : X \to Y \}_{\theta \in \Theta}$;
% perhaps it is the set of possible weights of a neural network.


 % suppose $\Theta$ is a set of parameter settings ,

% WE DO NOT WANT TO TALK ABOUT DS FUNCTIONS HERE BECAUSE THEY ARE VARIABLE CONFIDENCE.
\item
Again consider a finite set $W$ and suppose $\Theta$ consists of all Dempster--Shafer belief functions over $W$.
\end{enumerate}



% By currying we
% \begin{prop}
%
% \end{prop}



We are particularly interested in the setting where $\Theta$ parameterizes a family of probability distributions.
To that end, suppose that $\X = (X, \mathcal A)$ is a measurable space, so that $X$ is a set and $\mathcal A$ is a $\sigma$-algebra over it; let $\Delta \X$ denote the set of probability measures over $\X$,
and keep in the back of our heads an indexed family
% $\{ p_\theta ( X_\theta ) : \theta \in \Theta\}$.
$
	\mathcal P =
	\{ p_\theta \in\Delta\X \mid \theta \in \Theta \}
$ of probability distributions.
% If we take
\end{phaseout}

% For instance, starting with a distribution $\mu_0 \in \Delta \X$, we can \emph{compose} updates.
%
% % \[
% %     \mu_0
% %         \xmapsto{\displaystyle F^{.3}_{\mathit{Height}=5'11''}} \mu_1
% %         \xmapsto{\displaystyle F^{.6}_{\Pr(Y=1|X=3)=0.4}} \mu_2
% %         \xmapsto{\displaystyle F^{2.1}_{K_i(\varphi)}} \mu_3.
% % \]
