\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnatnourl}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{xcolor}
\newcommand{\red}[1]{\textcolor{red}{#1}}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{graphicx}
%\usepackage[ruled,vlined]{algorithm2e}
\usepackage{algorithm,algorithmicx,algpseudocode}
\usepackage{subcaption}
\usepackage{bbm}

\usepackage{xr}
\externaldocument{feng_69}

\renewcommand{\theequation}{S.\arabic{equation}}

\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{example}{Example}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{condition}{Condition}


\DeclareMathOperator{\init}{init}
\DeclareMathOperator{\Ss}{\texttt{S}}
\DeclareMathOperator{\C}{\texttt{C}}
\DeclareMathOperator{\M}{\texttt{M}}
\DeclareMathOperator{\N}{\texttt{N}}
\DeclareMathOperator{\adapt}{adapt}
\DeclareMathOperator{\approved}{approved}
\DeclareMathOperator{\prespec}{pres}
\DeclareMathOperator{\Var}{Var}
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Sequential algorithmic modification with test data reuse: Supplementary material}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Jean Feng}
\author[2]{Gene Pennello}
\author[2]{Nicholas Petrick}
\author[2]{Berkman Sahiner}
\author[3]{Romain Pirracchio}
\author[2]{Alexej Gossmann}
% Add affiliations after the authors
\affil[1]{%
    Department of Epidemiology and Biostatistics\\
    University of California, San Francisco
}
\affil[2]{%
    U.S. Food and Drug Administration
}
\affil[3]{%
    Department of Anesthesiology\\
    University of California, San Francisco
  }

 \begin{document}
\maketitle

\onecolumn

\appendix

\section{Proofs}

\begin{lemma}
	The adaptive SRGP in Algorithm~\ref{algo:graph_update} with a fixed strategy is equivalent to a prespecified SRGP.
	\label{lemma:equiv_prespec}
\end{lemma}
\begin{proof}
	We define a filtration over approval histories up to the maximum number of iterations $T$.
	That is, define sample space $\Omega$ as the set of approval histories over $T$ iterations, i.e. $\Omega = \{0,1\}^{T - 1}$, and $\sigma$-algebras $\mathcal{F}_t$ for $t = 1,...,T$ over approval histories up to iteration $t$.
	To show that the adaptive SRGP is equivalent to a prespecified SRGP, we need to show that the adaptive procedure defines a set of hypotheses, node weights, and edge weights for the initial set of hypotheses $I_0$, the hypotheses and weights are $\mathcal{F}_1$-measurable functions, and the weight constraints are satisfied.
	First, we note that eliciting the edge weights at iteration $t$ in Algorithm~\ref{algo:graph_update} is equivalent to eliciting the edge weights for the initial set of hypotheses $I_0$, i.e.\ $g_{a_{t'}, a_t} = g_{a_{t'}, a_t}(I_0)$ in Algorithm~\ref{algo:graph_update}.
	This is because we only elicit the edge weight $g_{a_{\tau_t}, a_t}$ if there has been no approval since time $\tau_t$ so the edge weights being elicited are never updated via the edge-weight renormalization step in SRGPs.
	As such, the adaptive SRGP in Algorithm~\ref{algo:graph_update} for a model developer with a fixed strategy for selecting hypotheses and weights can be described to have a fixed hypothesis testing tree structure with
	\begin{itemize}
		\item $\mathcal{F}_t$-measurable hypotheses $H_{a_{t}}(I_0)$ for all $a_{t}$
		\item $\mathcal{F}_1$-measurable node weights $w_{a_t}(I_0)$ for all $a_t \in \{0,1\}^{T - 1}$ that satisfy the constraint that they sum to one,
		\item and $\mathcal{F}_t$-measurable edge weights $g_{a_{t'}, a_t}(I_0)$ for all valid edges $(a_{t'}, a_t)$ in the graph that satisfy the constraint that all outgoing edge weights sum to one.
	\end{itemize}
	Although the hypotheses and edge weights are $\mathcal{F}_t$-measurable, they can also be viewed as $\mathcal{F}_1$-measurable functions over the input space $a_t$ and $(a_{t'}, a_t)$, respectively.
	Moreover, the edge weights satisfy the edge weight constraints by design.
	Thus the adaptive SRGP satisfies the node and edge weights constraints with respect to $\mathcal{F}_1$.

\end{proof}

\begin{lemma}
	If the adaptive SRGP in Algorithm~\ref{algo:graph_update} controls the FWER for any fixed strategy, then the adaptive SRGP in Algorithm~\ref{algo:graph_update} controls the FWER for any stochastic strategy.
	\label{lemma:stochastic}
\end{lemma}

\begin{proof}
	Let $\mathcal{S}$ be the set of all fixed strategies. The stochastic adaptive strategy is a probability distribution over $\mathcal{S}$; let $S$ denote the strategy drawn from this distribution.
	Its FWER is
	\begin{align*}
		\Pr\left(\text{incorrectly reject some } H_{t}^{\adapt} \right)
		= \sum_{s \in \mathcal{S}} \Pr(S = s) \Pr\left(\text{incorrectly reject some } H_{t}^{\adapt} \mid S = s \right)
	\end{align*}
	where the latter probability on the right hand side is the FWER for a fixed strategy $s$.
	As such, the FWER of the stochastic strategy is properly controlled as long as the FWER of any fixed strategy is properly controlled.
\end{proof}

\begin{corollary}
	Algorithm~\ref{algo:graph_update} with the significance thresholds defined per
	\begin{equation}
		 c_{a_{t}}(I_{t}) = w_{a_{t}}(I_{t}) \alpha
		 \label{eq:srgp_bonf_crit}
	\end{equation}
	controls the FWER at level $\alpha$.
\end{corollary}
\begin{proof}
	Per Lemmas~\ref{lemma:equiv_prespec} and \ref{lemma:stochastic}, it suffices to show that the fully prespecified SRGP controls the FWER.
	Recall that \eqref{eq:srgp_bonf_crit} is a closed weighted Bonferroni test in \citet{Bretz2011-hd}.
	As such, any fixed or stochastic adaptive strategy would control FWER.
\end{proof}

\begin{proof}[Proof for Theorem~\ref{thrm:ffs}]
	Per Lemmas~\ref{lemma:equiv_prespec} and \ref{lemma:stochastic}, it suffices to show that the fully prespecified SRGP controls the FWER.

	First, per the proof in \citet{Bretz2009-bt}, we note that the node weights for any intersection hypothesis $I$ calculated using Algorithm~\ref{algo:graph_update} are well-defined, in that they do not depend on the order in which we remove nodes from the graph.

	We begin by proving that for any intersection hypothesis $I$, the critical values calculated using \eqref{eq:sig_thres_corr} control the Type I error.
	First we show that for any $a_t$ ending with success (i.e.\ $a_{t, t - 1} = 1$) and any $I$, the calculated critical values for testing the intersection hypothesis $\bigcap_{a_k \in G_{a_t} \cap I} H_{a_k}$ control the Type I error at level $\left ( \sum_{a_{k} \in G_{a_t} \cap I} w_{a_k}(I)\right) \alpha$.
	% 	That is,
	% 	\begin{align}
		% 	\Pr\left(p_{k} > c_{k} \forall k \in K, p_{j} < \tilde{c} | \cap_{k \in K} H_{k} \cap H_j \right)
		% 	\le \frac{w_{j}}{\sum_{k\in K}w_{k} + \sum_{j'=j}^\infty w_{j'}} \alpha
		% 	\qquad
		% 	\forall K \subseteq\{1,...,j-1\}
		% 	\end{align}
	% 	Suppose hypotheses with indices $I$ are true.
	% 	Then
	% The critical values for rejecting hypotheses in $I$ assigned via $G$ are strictly smaller than the crit values for each element in $I$ if we knew $I$.
	% Under $I$, we have
	Per the definition of the critical values in \eqref{eq:sig_thres_corr}, we have that
	\begin{align*}
		& \Pr \left (\text{we reject for some } a_j \in G_{a_t} \cap I
		\  \middle\vert
		H_{G_{a_t} \cap I}
		\right)\\
		= &
		\sum_{a_{j} \in (G_{a_t} \cap I)} \Pr\left(p_{a_k} > c_{a_k}(I) \forall a_{k} \in G_{a_t} \cap I, k< j, p_{a_j} < c_{a_j}(I)
		\  \middle\vert
		\bigcap_{a_{k} \in G_{a_t} \cap I, k\le j} H_{a_{k}} \right)\\
		\le & \left ( \sum_{a_{j} \in (G_{a_t} \cap I)} w_{a_j}(I)\right) \alpha.
	\end{align*}
	Therefore, as long as the total node weight across $I$ is no more than one, we control the Type I error at level $\alpha$.
	Because Type I error control holds for all intersection hypotheses $I$, we have established that this procedure is a valid closed test.

	Next, per the proof  in \citet{Bretz2009-bt}, we must show that the critical values satisfy the monotonicity condition to prove that our procedure is a valid consonant, shortcut procedure.
	More specifically, we require the following to hold for all $t = 1,...,T$:
	\begin{equation}
		c_{a_t}(I) \le c_{a_t}(J) \qquad \forall J\subseteq I.
		\label{eq:monotonic_spec}
	\end{equation}
	The proof is by induction.
	It is easy to see that \eqref{eq:monotonic_spec} holds for $t = 1$.
	Suppose \eqref{eq:monotonic_spec} holds for $1,...,t - 1$.
	Now consider any history $a_{\tilde{t}}$ that ends with an approval.
	Consider any $a_t$ and subset $J \subseteq I$ such that $a_t \in G_{a_{\tilde{t}}} \cap J$.
	We have that
	\begin{align*}
		& c_{a_t}(J)\\
		=& \sup\left\{
		\tilde{c}:
		\Pr\left(p_{a_k} > c_{a_k}(J) \forall a_k \in K, p_{a_t} < \tilde{c} \mid H_{K \cup \{a_t\}} \right)
		\le \left[\sum_{\substack{a_k \in ((G_{a_{\tilde{t}}} \cap J) \setminus K) \\ k \le t}}
		w_{a_k} (J)\right] \alpha
		\forall K \subseteq \{a_k: a_k \in G_{a_{\tilde{t}}} \cap J, k < t \}
		\right\}\\
		\ge & \sup\left\{
		\tilde{c}:
		\Pr\left(p_{a_k} > c_{a_k}(I) \forall a_k \in K, p_{a_t} < \tilde{c} \mid H_{K \cup \{a_t\}} \right)
		\le \left[
		\sum_{\substack{a_k \in ((G_{a_{\tilde{t}}} \cap J) \setminus K) \\ k \le t}}
		w_{a_k} (J)\right] \alpha
		\forall K \subseteq \{a_k: a_k \in G_{a_{\tilde{t}}} \cap J, k < t \}
		\right\}\\
		\ge&  \sup\left\{
		\tilde{c}:
		\Pr\left(p_{a_k} > c_{a_k}(I) \forall a_k \in K, p_{a_t} < \tilde{c} \mid H_{K \cup \{a_t\}} \right)
		\le \left[
		\sum_{\substack{a_k \in ((G_{a_{\tilde{t}}} \cap I) \setminus K) \\ k \le t}}
		w_{a_k} (I)\right] \alpha
		\forall
		K \subseteq \{a_k: a_k \in G_{a_{\tilde{t}}} \cap I, k < t \}
		\right\}\\
		= &\   c_{a_t}(I)
	\end{align*}
	where the first inequality follows by induction and the second inequality is because the weights are monotonic.
\end{proof}

\begin{proof}[Proof for Theorem~\ref{thrm:parallel}]
	Per Lemmas~\ref{lemma:equiv_prespec} and \ref{lemma:stochastic}, it suffices to show that the fully prespecified SRGP controls the FWER.

	We first prove that the critical values per \eqref{eq:adapt_err} control the Type I error for any intersection hypothesis $I$.
	For any $I$, define $\tilde{I}$ as the union of $I$ and all prespecified nodes.
	Then the Type I error can be bounded using a sequence of union bounds:
	\begin{align*}
		& \Pr\left(
		\exists (t, a_t) \in I \text{ s.t. } p_{a_t} < c_{a_t}({I}) \mid H_{I}
		\right)\\
		\le & \Pr\left(
		\exists t \text{ s.t. } \xi_{t,n}^{\prespec} \le z_{t}^{\prespec}({I}) \text{ OR }  \exists (t, a_t) \in I \text{ s.t. } p_{a_t} < c_{a_t}({I}) \mid H_{I}
		\right)\\
		\le &
		\sum_{t = 1}^\infty
		\left[
		\Pr\left(
		\xi_{t',n}^{\prespec} > z_{t'}^{\prespec}({I}) \forall t' \le t - 1,
		\xi_{t,n}^{\prespec} \le z_{t}^{\prespec}({I})
		\  \middle\vert
		H_{I}
		\right)
		+
		\sum_{a_t \in I}
		\Pr\left(
		\xi_{t',n}^{\prespec} > z_{t'}^{\prespec}({I}) \forall t' \le t,
		p_{a_t} < c_{a_t}({I})
		\  \middle\vert
		H_{I}
		\right)\right]\\
		\le &
		\left (\sum_{t = 1}^\infty \left(w_{t}^{\prespec}\left ( \tilde{I}\right)
		+ \sum_{a_t \in I} w_{a_t}\left(\tilde{I}\right ) \right) \right) \alpha\\
		=&\alpha.
	\end{align*}
	% 	\red{[AG: In the currently first line of the equation shouldnt it be\\ $\Pr\left(
		% 	\exists t \text{ s.t. } \xi_{t,n}^{\prespec} \le z_{t}^{\prespec}({I}) \text{ OR } \left[ (\forall t): \xi_{t,n}^{\prespec} > z_{t}^{\prespec}({I}) \text{AND} \exists (t, a_t) \in I \text{ s.t. } p_{a_t} < c_{a_t}({I}) \right] \mid H_{I}
		% 	\right)$ ? (maybe I'm misunderstanding something)]}\\
	Because the weights are nondecreasing in Algorithm~\ref{algo:graph_update}, the critical values defined in \eqref{eq:adapt_err} satisfy the monotonicity condition.
	As such, Algorithm~\ref{algo:graph_update} is a consonant, short-cut procedure for the above closed test.

\end{proof}

\section{Hypothesis test details}

\subsection{Testing for an improvement in AUC}
\label{sec:auc_compare}

In Section~\ref{sec:sim}, we decide whether or not to approve a modification by testing the adaptively-defined null hypothesis \eqref{eq:null_delta} at each iteration $j$, which compares the AUC between the $j$th adaptively proposed model and the initial model.
Per Algorithm~\ref{algo:graph_update}, we test the adaptive hypotheses by treating them as pre-specified hypotheses from a bifurcating tree, i.e.
\begin{equation}
	H_{0,a_j}: \psi\left (\hat{f}_{a_j}, P_0 \right) \le \psi\left (\hat{f}_0, P_0 \right) + \delta_{a_j}
	\label{eq:null_delta_fix}
\end{equation}
for approval histories $a_j$.
We now describe how the test statistics and significance thresholds are constructed.


Recall that the AUC is equal to the Mann-Whitney U-statistic for comparing ranks across two populations, i.e.
\begin{align}
	\psi(f, P_0) = P_0 \left (f(X_1) > f(X_2) \mid Y_1 = 1, Y_2 = 0 \right),
\end{align}
where $(X_1,Y_1)$ and $(X_2,Y_2)$ represent independent draws from $P_0$.
The empirical AUC is defined as
\begin{equation}
	\psi(f, P_n) = \frac{1}{n_0 n_1} \sum_{i=1}^{n} \sum_{j=1}^{n} \mathbbm{1}\left\{f(X_j) > f(X_i)\right\} \mathbbm{1}\left\{Y_j = 1, Y_i = 0\right\},
	\label{eq:auc_emp}
\end{equation}
where $n_0$ is the number of observations with $Y = 0$ and $n_1 = n - n_0$.
To test \eqref{eq:null_delta_fix}, we characterize the asymptotic distribution of \eqref{eq:auc_emp} by analyzing its influence function.
Given IID observations from $P_0$, \eqref{eq:auc_emp} is an asymptotically linear estimator of the model's AUC \citep{LeDell2015-qz}, in that
\begin{equation}
	\psi(f, P_n) - \psi(f, P_0)
	= \frac{1}{n} \sum_{i=1}^n \phi(f, P_0)(X_i, Y_i) + o_p(1/\sqrt{n})
	\label{eq:auc_diff}
\end{equation}
with influence function
\begin{align*}
	\begin{split}
	\phi(f, P_0)(X_i, Y_i) =&
	\frac{\mathbbm{1}\{Y_i = 1\}}{P_0(Y = 1)} P_0\left(f(X) < c \mid Y= 0; c = f(X_i)\right)\\
	&\ + \frac{\mathbbm{1}\{Y_i = 0\}}{P_0(Y = 0)} P_0\left(f(X) > c \mid Y= 1; c = f(X_i)\right)\\
	&\ - \left\{
	\frac{\mathbbm{1}\{Y_i = 1\}}{P_0(Y = 1)}
	+ \frac{\mathbbm{1}\{Y_i = 0\}}{P_0(Y = 0)}
	\right\} \psi\left(f, P_0\right).
	\end{split}
\end{align*}
Per the Central Limit Theorem, we have that
\begin{equation}
\sqrt{n}\left(\psi(f, P_n) - \psi(f, P_0) \right) \rightarrow_d N\left(0, \sigma(f,P_0)^2 \right)
\end{equation}
where $\sigma(f,P_0)^2 = \Var(\phi(f, P_0)(X,Y))$.
We can then test the null hypothesis $H_0: \psi(\hat{f}_0, P_0) \le c $ for some constant $c$ based on the asymptotic normality of \eqref{eq:auc_emp}.
In addition, we can test \eqref{eq:null_delta_fix} by deriving the asymptotic distribution of $\psi\left (\hat{f}_{a_j}, P_n \right) - \psi\left (\hat{f}_0, P_n \right)$ based on the difference of the influence functions $\phi(\hat{f}_{a_j}, P_0)(X,Y) - \phi(\hat{f}_0, P_0)(X,Y)$.
To run \texttt{fsSRGP}, we can extend the above derivations to construct a flexible fixed sequence test for testing a family of null hypotheses \eqref{eq:null_delta_fix} across multiple iterations $j$ by analyzing the joint asymptotic distribution of the test statistics $\psi\left (\hat{f}_{a_j}, P_n \right) - \psi\left (\hat{f}_0, P_n \right)$ and computing the significance thresholds defined in \eqref{eq:sig_thres_corr}.
Similar logic can be used to derive the critical values \eqref{eq:pres_ci} and significance thresholds \eqref{eq:adapt_err} in \texttt{presSRGP}.

\subsection{Testing model discrimination and calibration}

Section~\ref{sec:eci} considers the more complex hypothesis test \eqref{eq:eicu_hypo}, which checks for an improvement in AUC and calibration-in-the-large.
We implement this by performing three individual hypothesis tests using sequential gatekeeping.
First, we test that the difference between the average risk prediction and the observed event rate is no smaller than $-\epsilon$.
Next, we test that this difference is no larger than $\epsilon$.
Finally, we test for an improvement in AUC using the procedure described in Section~\ref{sec:auc_compare}.
To control the Type I error for rejecting the overall null hypothesis, we perform alpha spending across the individual hypotheses.
% For the empirical analyses, we allocated 0.05 of the total alpha wealth to the first hypothesis, 0.05 to the second, and 0.9 to the last.

\section{Additional Experiments}

\subsection{Sensitivity analysis to risk tolerance of model developer}

In Section~\ref{sec:sim_power}, we simulated a model developer who submits a refitted model for testing only if the power exceeds a threshold of 50\%.
This threshold is a reflection of the model developer's risk tolerance.
A model developer who selects a higher threshold is more likely to have their modifications approved, but the time between each model submission is also longer.
To understand how a more conservative model developer would affect the results in Section~\ref{sec:sim_power}, we rerun the same simulation except with a threshold of 80\%.
As seen in Figure~\ref{fig:power_risk}, the overall rate of model improvement is slower.
For example, \texttt{presSRGP} previously required 180 observations to reach an AUC of 0.80 when the power threshold was 50\%.
In comparison, it requires nearly 250 observations when the power threshold is set to 80\%.
Also, the performance of the different MTPs is now more similar, particularly between the different SRGPs.
This is also unsurprising, as the power to approve these modifications is much higher in this simulation; the additional power gain from employing \texttt{fsSRGP} and/or \texttt{presSRGP} as compared to \texttt{bonfSRGP} is now much smaller.
Finally, a more conservative model development strategy decreases the variability of the approval histories, as evidenced by the narrower error bars.

\begin{figure}
	\centering
	\includegraphics[width=0.7\textwidth]{images/improve_196_power80.png}
	\caption{Comparison of MTPs for approving modifications when the model developer is highly risk averse. The simulation is the same as that for Figure~\ref{fig:power}, except the modifications are submitted only if the calculated power exceeds 80\%.}
	\label{fig:power_risk} 
\end{figure}


\subsection{Sensitivity analysis of SRGP with hypothetical prespecified model updates}
The power of SRGP with hypothetical prespecified model updates (\texttt{presSRGP}) depends on the similarity between the prespecified and adaptive model updates.
As their correlation increases, the power of \texttt{presSRGP} will increase, all other things being equal.
Here we present a simulation study where we investigate the sensitivity of \texttt{presSRGP} to the similarity of the model updates, using the same data stream as that in Section 3.2.
We have carefully designed three model developers such that their adaptively generated model updates have different correlations with the prespecified model updates but the rate of improvement in AUC is the same.
To do so, the prespecified updating procedure trains on only observations with \textit{even} indices.
The first model developer (\texttt{Even}) generates updates as close as possible to the prespecified rule: they refit the model using only even indices and adaptively submit modifications if the calculated power exceeds 50\%.
The second model developer (\texttt{Odd}) generates updates in a very different manner: they refit using only observations with odd indices.
Finally, the third model developer (\texttt{Even/Odd}) deviates moderately from the prespecified model updates: they train modifications on observations with indices that are 0, 3, and 5 mod 6.
Figure~\ref{fig:power_alignment} shows that the power for approving modifications depends on how much the model developer deviates from their prespecified updating procedure.
The moderate deviations in \texttt{Even/Odd} lead to a small drop in the approval rate and very slight drop in AUC.
The drop in performance is more obvious in \texttt{Odd}, where the adaptive strategy does not align with the prespecified updating procedure at all.

\begin{figure}
	\centering
	\includegraphics[width=0.7\textwidth]{images/sensitivity_prespec}
	\caption{
		We analyze the sensitivity of \texttt{presSRGP} to the similarity between the adaptive and prespecified model updates.
		We simulate three model developers who increasingly deviate from the prespecified updates: \texttt{Even}, \texttt{Even/Odd}, and \texttt{Odd}, ordered from lowest to highest deviation from the prespecified updating procedure.
	}
	\label{fig:power_alignment}
\end{figure}


\subsection{Revising detection algorithm for intracranial hemorrhages}

Here we present a second data analysis.
We analyze data from the RSNA 2019 Brain CT Hemorrhage Challenge \citep{Flanders2020-jp}, where the prediction task was to detect and classify intracranial  hemorrhages (ICH)  based  on  head  CT  scans.
We follow nearly the same pre-processing procedure outlined in \citet{Gossmann2021-qk}: we extract two axial slices from each subject's head CT scan and then apply a pre-trained ResNet50 model (without any training or fine-tuning on the medical images) to extract 2048 features from each image.
We will consider the binary classification task of detecting the presence of any ICH subtype.
The adaptive testing setup is similar to that outlined in Section~\ref{sec:eci}.
For each simulation replicate, we randomly select 100 subjects to train the initial GBT model, 900 subjects to generate model updates, and 500 subjects for adaptive test data reuse.
The 900 subjects are randomly ordered to construct a data stream, in which data from 10 patients arrive at each time point.
At each time point, we refit the GBT on all previously collected data.
The model developer is allowed $T=10$ adaptive tests.
Figure~\ref{fig:rsna_exp} shows the results for 20 replicates. \texttt{presSRGP} and \texttt{fsSRGP} performed the best; \texttt{bonfSRGP} performed very similarly.

\begin{figure}
	\centering
	\includegraphics[width=\textwidth]{images/rsna}
	\caption{Approving refitted gradient boosted trees for detecting intracranial hemorrhages in head CT scans. Models are approved if the calibration-error-in-the-large is close to the ideal value of zero and the AUC is improving.}
	\label{fig:rsna_exp}
\end{figure}

\bibliography{notes}

\end{document}
