\section{\rev{Data Splits}}
\begin{table}[ht]
    \centering
    \begin{tabular}{lrrrrrrr}
        % TODO get subject counts to avoid questions for "largest" vendor
        \toprule
        \multicolumn{8}{c}{M\&M Dataset} \\
        % \midrule
                 & Sym. (train/val) & Trio & Avanto & HDxt & Excite & Explorer & Achieva \\
        Cases & 172              &  5   &  42    &  25  &  27    &  1       & 88 \\
        Slices   & 2699/299         & 94   & 695    & 426  & 459    & 18       & 1422 \\
        \midrule
        \multicolumn{7}{c}{PMRI Dataset} \\
        % \midrule
                 & RUNMC (train/val) & BMC & I2CVB & UCL & BIDMC & HK \\
        Cases &    30             & 30  & 19    & 13  & 12    & 12 \\ 
        Slices   &    378/41         & 324 & 505   & 171 & 197   & 157 \\
        \bottomrule
    \end{tabular}
    \caption{\rev{Case and slice counts for the M\&M and PMRI datasets. Training and validation splits are consistent across U-Net and predictor training.}}
    \label{tab:dataset_cases}
\end{table}

\clearpage
\section{\rev{Gradient Visualization}}

\begin{figure}[!htb]
    \centering
    \includegraphics[width=\textwidth]{figures/grad_vis.png}
    \caption{\rev{Example from the M\&M dataset comparing the behavior of confidence predictors trained with and without our adversarial perturbation scheme on volumetric Dice prediction. In both cases, adversarial perturbations affect the whole image, but are difficult to visually discern. Both perturbations decrease predicted confidence ($0.94 \rightarrow 0.87$ top, $0.92 \rightarrow 0.70$ bottom). However, only the perturbation from our method actually results in a degradation of the segmentation quality ($0.90 \rightarrow 0.81$), while the one from a predictor without adversarial training has little effect on the segmentation ($0.90 \rightarrow 0.90$).}}
    \label{fig:grad-vis}
\end{figure}

\clearpage
\section{Ablation Study}
\begin{table}[!htb]
    \centering
    \begin{tabular}{lrrrrrr}
        \toprule
        \multicolumn{7}{c}{Pearson Correlation} \\
        Method  & Achieva & Avanto & Excite & Explorer & HDxt & Trio \\
        $\Delta_s$           &               &               &              &                &               &               \\
        0.05           & $.675\pm.018$ & $.736\pm.018$ & $.662\pm.032$ & $.847\pm.019$ & $.608\pm.035$ & $.624\pm.035$ \\
        0.1            & $.718\pm.024$ & $.749\pm.015$ & $.700\pm.013$ & $.856\pm.033$ & $.619\pm.012$ & $.654\pm.015$ \\
        \;fine-tuned   & $.730\pm.033$ & $.754\pm.013$ & $.716\pm.017$ & $.858\pm.036$ & $.632\pm.014$ & $.668\pm.011$ \\
        0.2            & $.719\pm.037$ & $.779\pm.023$ & $.695\pm.016$ & $.850\pm.041$ & $.631\pm.019$ & $.616\pm.044$ \\
        \vspace*{0.01cm} \\
        \midrule
        \multicolumn{7}{c}{eAURC} \\
                 &               &               &               &               &               &               \\
        0.05         & $.019\pm.003$ & $.012\pm.001$ & $.023\pm.003$ & $.004\pm.001$ & $.025\pm.002$ & $.018\pm.002$ \\
        0.1          & $.017\pm.002$ & $.012\pm.001$ & $.021\pm.002$ & $.005\pm.001$ & $.024\pm.002$ & $.016\pm.003$ \\
        \;fine-tuned & $.016\pm.002$ & $.012\pm.001$ & $.020\pm.002$ & $.005\pm.001$ & $.023\pm.002$ & $.015\pm.001$ \\
        0.2          & $.017\pm.001$ & $.012\pm.001$ & $.022\pm.002$ & $.007\pm.004$ & $.023\pm.002$ & $.017\pm.001$ \\
        \vspace*{0.01cm} \\
        \midrule
        \multicolumn{7}{c}{MAE} \\
                 &               &               &               &               &               &               \\
        0.05         & $.076\pm.002$ & $.064\pm.004$ & $.088\pm.004$ & $.078\pm.005$ & $.078\pm.006$ & $.071\pm.002$ \\
        0.1          & $.074\pm.002$ & $.061\pm.002$ & $.086\pm.002$ & $.075\pm.006$ & $.078\pm.003$ & $.068\pm.003$ \\
        \;fine-tuned & $.075\pm.003$ & $.061\pm.002$ & $.086\pm.002$ & $.074\pm.008$ & $.076\pm.004$ & $.074\pm.004$ \\
        0.2          & $.073\pm.003$ & $.060\pm.002$ & $.083\pm.003$ & $.075\pm.006$ & $.077\pm.004$ & $.067\pm.003$ \\
        \bottomrule
    \end{tabular}
    \caption{Ablation exploring different values of $\Delta_s$, as well as fine-tuning of the segmentation network's parameters for confidence prediction (with $\Delta_s=0.1$) for volumetric Dice prediction on test domains of the M\&M dataset. Correlation and eAURC improve for our default setting $\Delta_s=0.1$ compared to a reduced $\Delta_s=0.05$, but increasing it to $\Delta_s=0.2$ does not always yield further improvement. Overall, results are stable with respect to variations in $\Delta_s$. Fine-tuning leads to a small benefit in correlation and eAURC, but also to higher complexity and running times. MAE benefits from larger $\Delta_s$, but less clearly from fine-tuning.}
    \label{tab:ablation}    
\end{table}

%%% Local Variables:
%%% mode: latex
%%% TeX-master: "../submission"
%%% End:
