\section{Experiments}
\input{4_Experiments}

\section{Ablation Studies}
\input{Tables/ablation_tables}

We ablate key architectural choices of the Transolver block along three orthogonal axes: the number of clusters used for tokenization ($M$), the number of attention heads ($H$), and the MLP expansion ratio. Experiments are conducted on CAMELYON16, with full sweeps reported in Figures~\ref{fig:ablation_clusters_cam16}, \ref{fig:ablation_heads_cam16_0}, \ref{fig:ablation_heads_cam16_1}, \ref{fig:ablation_hdims_cam16}, and \ref{fig:ablation_mlp_cam16}.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Times, Params, FLOPs
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{table}[h]
    \centering
\begin{tabular}{lcccc}
\toprule
Model & Training (s) & Inference (s) & Params (M) & FLOPs (G) \\
\midrule
ABMIL~\cite{ilse_attention-based_2018}      & $\mathbf{5.5}$ & $\mathbf{0.8}$ & $0.660$ & $1.31$ \\
CLAM~\cite{lu_data-efficient_2021}       & $7.0$         & $0.9$         & $0.920$ & $1.84$ \\
TransMIL~\cite{shao_transmil_2021}   & $13.4$        & $1.2$         & $2.67$  & $85.02$ \\
DGRMIL~\cite{zhu_dgr-mil_2025}    & $16.7$        & $1.5$         & $4.34$  & $79.88$ \\
PAMIL~\cite{pamil} & $6.2$ & $0.9$ & $0.796$ & $1.32$ \\ 
BayesMIL~\cite{cui_bayes-mil_2023}& $9.5$         & $1.1$         & $1.32$  & $2.63$ \\
% MixMIL     & $11.0$        & $1.5$         & $1.57$  & $1.05$ \\
% AGP        & $31.0$        & $6.0$         & $1.21$  & $2.86$ \\
SGPMIL~\cite{lolos_sgpmil_2025}    & $9.0$         & $1.0$         & $1.21$  & $2.43$ \\
\ours & $6.3$     & $\mathbf{0.8}$         & $\mathbf{0.314}$ & $\mathbf{0.628}$ \\
\bottomrule
\end{tabular}

    \caption{Training and inference times (in seconds), model sizes (number of trainable parameters in millions, M), and computational cost (FLOPs in billions, G).
    Training times are averaged over 30 epochs, while inference times correspond to processing the full test set of $129$ slides.}
    \label{tab:timing_comparison}
\end{table}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ABLATIONS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Number of clusters vs performance for different 
% number of heads
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
\begin{figure}[p]
\centering

\resizebox{0.625\textwidth}{!}{%
\begin{minipage}{\textwidth}

\includegraphics[width=\linewidth]{ablations_gathered_friday/cluster_cam16_heads1.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/cluster_cam16_heads2.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/cluster_cam16_heads4.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/cluster_cam16_heads6.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/cluster_cam16_heads8.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/cluster_cam16_heads10.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/cluster_cam16_heads12.png}

\end{minipage}}
\caption{
Ablation on the number of clusters for CAMELYON16.
For each row, the number of attention heads is fixed while the number of clusters is varied.
From top to bottom: 1, 2, 4, 6, 8, 10 and 12 heads.
}
\label{fig:ablation_clusters_cam16}
\end{figure}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Number of heads vs performance for different 
% number of clusters
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\clearpage
\begin{figure}[p]
\centering

\resizebox{0.68\textwidth}{!}{%
\begin{minipage}{\textwidth}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster2.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster4.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster6.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster8.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster10.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster12.png}

\end{minipage}}
\caption{
Ablation on the number of attention heads for a single \ours\ block on CAMELYON16.
For each row, the number of clusters is fixed while the number of heads is varied.
From top to bottom: 2, 4, 6, 8, 10, and 12 clusters.
}
\label{fig:ablation_heads_cam16_0}
\end{figure}


\clearpage
\begin{figure}[p]
\centering

\resizebox{0.9\textwidth}{!}{%
\begin{minipage}{\textwidth}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster16.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster32.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster64.png}
\vspace{0.1em}

\includegraphics[width=\linewidth]{ablations_gathered_friday/heads_cam16_cluster128.png}

\end{minipage}}
\caption{
Ablation on the number of attention heads for a single \ours\ block on CAMELYON16.
For each row, the number of clusters is fixed while the number of heads is varied.
From top to bottom: 16, 32, 64, and 128 clusters.
}
\label{fig:ablation_heads_cam16_1}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Initial MLP hidden dims for one model instantiation,
% namely 8 heads and 16 clusters
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\clearpage
\begin{figure}
    \centering
    \includegraphics[width=0.75\linewidth]{ablations_gathered_friday/hdims_cam16.png}
\caption{
    Ablation on the dimensionality of the input projection layer on CAMELYON16.
    We vary the number of hidden units while keeping the number of clusters (16)
    and attention heads (8) fixed.
}
    \label{fig:ablation_hdims_cam16}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.75\linewidth]{ablations_gathered_friday/mlp_cam16.png}
\caption{
    Ablation on the MLP expansion ratio in the \ours\ block on CAMELYON16.
    We vary the expansion factor of the feed-forward network
    (MLP ratio $\in \{1, 2, 4\}$) while keeping all other components fixed (16 clusters, 8 heads).
}
    \label{fig:ablation_mlp_cam16}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[width=1.0\linewidth]{Figures/flops_vs_M.png}
    \caption{\textbf{Effect of the number of clusters $M$ on computational cost and performance.}
    \textbf{Left:} Inference FLOPs of \ours\ as a function of $M$, measured for a bag of 1{,}000 patch embeddings. The reference FLOPs of ABMIL (1.31G) are shown for comparison.
    \textbf{Right:} CAPRMIL classification performance (mean AUC $\pm$ standard deviation) under 10-fold cross-validation for different values of $M$. FLOPs increase with $M$, while performance saturates beyond small numbers of clusters.}
    
    \label{fig:flops_v_M}
\end{figure}

\clearpage

\begin{table}[t]
\centering
\setlength{\tabcolsep}{6pt}
\resizebox{\textwidth}{!}{\begin{tabular}{l cc cc cc cc}
\toprule
 & \multicolumn{2}{c}{\textbf{CAMELYON16}} & \multicolumn{2}{c}{\textbf{TCGA-NSCLC}} & \multicolumn{2}{c}{\textbf{PANDA}} & \multicolumn{2}{c}{\textbf{BRACS}} \\
\cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7}\cmidrule(lr){8-9}
\textbf{Init} 
& \textbf{AUC} & \textbf{ACE} 
& \textbf{AUC} & \textbf{ACE} 
& $\boldsymbol{\kappa}$ & \textbf{ACE} 
& \textbf{AUC} & \textbf{ACE} \\
\midrule
Orthogonal 
& $0.975_{0.007}$ & $0.028_{0.005}$
& $0.978_{0.016}$ & $0.033_{0.021}$
& $0.953_{0.044}$ & $0.019_{0.023}$
& $0.850_{0.031}$ & $0.189_{0.026}$ \\
Random 
& $0.964_{0.011}$ & $0.032_{0.012}$
& $0.975_{0.018}$ & $0.029_{0.024}$
& $0.952_{0.046}$ & $0.021_{0.024}$
& $0.851_{0.027}$ & $0.186_{0.020}$ \\
\bottomrule
\end{tabular}}
\caption{Ablation of the initialization of $W_{\text{cluster}}$ (slice/cluster projection). Results are reported as mean$_{\text{std}}$ over cross-validation folds for CAMELYON16, TCGA-NSCLC, and PANDA, and over $4$ random seeds for BRACS.}
\label{tab:init_ablation_compact}
\end{table}


\begin{table}[t]
\centering
\begin{tabular}{c cc cc}
\toprule
 & \multicolumn{2}{c}{\textbf{PANDA}} & \multicolumn{2}{c}{\textbf{BRACS}} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
$M$ & $\kappa$ & ACE & AUC & ACE \\
\midrule
4  & $0.944_{0.053}$ & $0.021_{0.024}$ & $0.850_{0.031}$ & $0.189_{0.026}$ \\
6  & $0.945_{0.049}$ & $\mathbf{0.020_{0.022}}$ & $0.852_{0.032}$ & $0.182_{0.019}$ \\
12 & $\mathbf{0.955_{0.049}}$ & $0.021_{0.026}$ & $0.870_{0.016}$ & $\mathbf{0.153_{0.019}}$ \\
16 & $0.944_{0.053}$ & $0.021_{0.024}$ & $\mathbf{0.871_{0.024}}$ & $0.183_{0.010}$ \\
\bottomrule
\end{tabular}
\caption{Effect of the number of clusters $M$ on performance and calibration for PANDA and BRACS. Results are reported as mean$_{\text{std}}$ over 5 cross-validation folds for PANDA and 4 seeds for BRACS.}
\end{table}




\section{Pooling operators}
\label{app:aggregation}

Given context-aware patch representations
$\mathbf{H}^{(T)} = \{\mathbf{h}_1,\dots,\mathbf{h}_N\}$ with
$\mathbf{h}_n \in \mathbb{R}^{D}$,
we consider the following MIL aggregation operators $\mathcal{A}(\cdot)$:

\paragraph{Mean pooling.}
\[
\mathbf{z}_{\text{mean}} = \frac{1}{N} \sum_{n=1}^{N} \mathbf{h}_n
\]

\paragraph{Max pooling.}
\[
\mathbf{z}_{\text{max}} = \max_{n \in \{1,\dots,N\}} \mathbf{h}_n,
\]
where the maximum is taken element-wise across instances for each feature.

\paragraph{Attention pooling.}
\[
a_n =
\frac{\exp\!\left(\mathbf{w}^\top \tanh(\mathbf{V}\mathbf{h}_n^\top)\right)}
{\sum_{m=1}^{N} \exp\!\left(\mathbf{w}^\top \tanh(\mathbf{V}\mathbf{h}_m^\top)\right)},
\qquad
\mathbf{z}_{\text{attn}} = \sum_{n=1}^{N} a_n\, \mathbf{h}_n,
\]
where $\mathbf{w}$ and $\mathbf{V}$ are learnable parameters.

\paragraph{Gated-attention pooling.}
\[
a_n =
\frac{\exp\!\left(
\mathbf{w}^\top
\left(\tanh(\mathbf{V}\mathbf{h}_n^\top)\odot\sigma(\mathbf{U}\mathbf{h}_n^\top)\right)
\right)}
{\sum_{m=1}^{N} \exp\!\left(
\mathbf{w}^\top
\left(\tanh(\mathbf{V}\mathbf{h}_m^\top)\odot\sigma(\mathbf{U}\mathbf{h}_m^\top)\right)
\right)},
\qquad
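\mathbf{z}_{\text{gated}} = \sum_{n=1}^{N} a_n\, \mathbf{h}_n,
\]
where $\mathbf{w}$, $\mathbf{V}$, and $\mathbf{U}$ are learnable parameters, $\sigma(\cdot)$ is the element-wise sigmoid, and $\odot$ denotes element-wise multiplication.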

All aggregation operators map $\mathbf{H}^{(T)}$ to a fixed-dimensional
slide-level embedding $\mathbf{z} \in \mathbb{R}^{D}$ and can be used
interchangeably without modifying the \ours\ blocks.

\clearpage






\section{Clustering Behavior and Token Specialization}
\label{app:cell_analysis}

\subsection{Quantitative Analysis of Cluster Utilization}
\label{app:quantitative_cluster}

To quantitatively assess cluster utilization and verify that CAPRMIL does not suffer from cluster collapse, we analyze both the uncertainty of patch-to-cluster assignments and the distribution of cluster usage across attention heads. Patch-to-cluster assignment scores are explicitly regularized through three mechanisms: (i) normalization over the $M$ clusters such that $\sum_{m=1}^{M} W_{b,h,n,m} = 1$, where $W_{b,h,n,m}$ denotes the assignment weight of patch $n$ to cluster $m$ for head $h$ in batch $b$; (ii) softmax scaling over the cluster dimension, which sharpens confident assignments; and (iii) a learnable, head-specific temperature parameter that directly controls the entropy of the assignment distribution (Section~3.1.3).

\textbf{Assignment entropy.} To quantify whether patches are preferentially routed to specific clusters rather than uniformly distributed, we compute the normalized entropy of the soft cluster assignment vector for each patch and attention head,
\[
H(\mathbf{w}_n^{(h)}) = -\frac{1}{\log M}\sum_{m=1}^{M} w_{n,m}^{(h)} \log w_{n,m}^{(h)} ,
\]
where $H=1$ corresponds to uniform assignment and $H=0$ to a deterministic assignment. Entropy is computed over 20k randomly sampled patches per dataset (to ensure tractability and reproducibility) and distributions are plotted per head. As shown in Appendix~D, Figures~\ref{fig:appendix_entropy_usage1}--\ref{fig:appendix_entropy_usage2} (violin plots), entropy values remain consistently below $0.5$ across heads and datasets, indicating confident and non-uniform patch-to-cluster assignments.

\textbf{Cluster occupancy.}
Complementarily, we measure per-head cluster occupancy by assigning each patch to its most likely cluster via $\arg\max_m w_{n,m}^{(h)}$. For each slide, we compute the proportion of patches assigned to each cluster per head and then average these proportions across slides. The resulting heatmaps (Appendix~D, Figures~\ref{fig:appendix_entropy_usage1}--\ref{fig:appendix_entropy_usage2}) reveal clear head specialization: individual heads predominantly activate distinct subsets of clusters while still utilizing secondary clusters, with this behavior consistent across binary and multiclass datasets.

Together, these entropy and occupancy analyses provide quantitative evidence that CAPRMIL maintains diverse, head-specialized, and non-collapsed cluster assignments, supporting the interpretability and stability of the learned morphology-aware tokens.


\begin{figure}[h!]
    \centering
    \includegraphics[width=1.0\linewidth]{Figures/Cluster_analysis_001.png}
    \caption{\textbf{Left:} Distribution of normalized per-patch cluster-assignment entropy for each attention head, computed as
    $H(\mathbf{w}_{n}^{(h)}) = - \sum_{m=1}^{M} w_{n,m}^{(h)} \log w_{n,m}^{(h)} \,/\, \log M$,
    where $w_{n,m}^{(h)}$ denotes the soft assignment of patch $n$ to cluster $m$ in head $h$.
    \textbf{Right:} Per-head cluster usage, measured as the percentage of patches assigned to each cluster via hard assignment (argmax over clusters).
    Across all datasets, entropy distributions remain well below the maximum value, indicating confident, non-uniform assignment of patches to specific clusters, while cluster usage maps show that heads actively utilize different clusters with distinct specialization patterns rather than collapsing to a single cluster.}
        \label{fig:appendix_entropy_usage1}
    \end{figure}



\clearpage
\begin{figure}[h!]
    \centering
    \includegraphics[width=1.0\linewidth]{Figures/Cluster_analysis.png}
    \caption{\textbf{Left:} Distribution of normalized per-patch cluster-assignment entropy for each attention head, computed as
    $H(\mathbf{w}_{n}^{(h)}) = - \sum_{m=1}^{M} w_{n,m}^{(h)} \log w_{n,m}^{(h)} \,/\, \log M$,
    where $w_{n,m}^{(h)}$ denotes the soft assignment of patch $n$ to cluster $m$ in head $h$.
    \textbf{Right:} Per-head cluster usage, measured as the percentage of patches assigned to each cluster via hard assignment (argmax over clusters).
    Across all datasets, entropy distributions remain well below the maximum value, indicating confident, non-uniform assignment of patches to specific clusters, while cluster usage maps show that heads actively utilize different clusters with distinct specialization patterns rather than collapsing to a single cluster.}
        \label{fig:appendix_entropy_usage2}
    \end{figure}



\clearpage

\subsection{Cell-Level Morphological Characterization of Cluster Assignments}
\label{app:cell_specific_subsection}

In the absence of direct expert pathologist assessment, we evaluate the morphological coherence of the learned clusters using two complementary sources of evidence: pixel-level tumor annotations available for CAMELYON16 and cell-level composition analysis using established pretrained models such as HoVer-Net~\cite{hovernet}. These analyses are reported in \textbf{Appendix D, Figures~\ref{fig:cells_test001}--\ref{fig:cells_test021}}.

First, on CAMELYON16, we leverage the available tumor-versus-normal ground-truth masks as a proxy for morphological relevance. By assigning each patch, for one attention head, to the cluster with the highest assignment score (argmax over clusters) and mapping these assignments back to slide space, we observe that clusters form spatially coherent regions that align closely with annotated tissue types. As illustrated in \textbf{Figure~\ref{fig:cells_test001}} for a representative slide, Cluster~0 predominantly corresponds to normal tissue, Clusters~1 and~3 concentrate on tumor regions, while Cluster~2 is sparsely activated in areas consistent with adipose tissue. This spatial alignment indicates that clusters capture meaningful histological structure rather than arbitrary patch groupings.

Second, to further characterize these regions at a finer scale, we perform a cell-level analysis using HoVer-Net on the top~10\% highest-scoring patches per cluster. The resulting cell-type distributions and nuclei overlays (\textbf{Appendix D, Figures~\ref{fig:cells_test001}--\ref{fig:cells_test021}}) reveal distinct and consistent cellular compositions across clusters: tumor-associated clusters are dominated by neoplastic cells, while others exhibit various proportions of inflammatory, connective, necrotic, or acellular (adipose) tissue. While clusters may partially overlap at tissue boundaries, their cellular composition profiles and visual appearance remain clearly differentiated, indicating that clusters do not collapse onto identical patch sets. Together, these results demonstrate that CAPRMIL clusters correspond to coherent, morphologically meaningful regions and capture biologically interpretable variation in tissue and cellular organization.


\begin{figure}[h]
    \centering
    \includegraphics[width=1.0\linewidth]{Figures/test_001_cluster_analysis_cell.png}
    \caption{\textbf{Cluster-level cellular analysis.}
    \textbf{Left:} Slide-level cluster maps where each patch is assigned to the cluster with the highest soft-assignment score (argmax over clusters). Yellow contours indicate expert annotations.
    \textbf{Middle:} Cell-type distributions computed by applying HoVer-Net~\cite{hovernet} to the top $10\%$ highest-confidence patches for each cluster, illustrating cluster-specific cellular composition.
    \textbf{Right:} Representative patches per cluster with HoVer-Net cell segmentation and classification overlays, highlighting characteristic cellular patterns captured by each cluster.}
    
    \label{fig:cells_test001}
\end{figure}

\clearpage

\begin{figure}[h]
    \centering
    \includegraphics[width=1.0\linewidth]{Figures/test_016_cells.png}
    \caption{\textbf{Cluster-level cellular analysis.}
    \textbf{Left:} Slide-level cluster maps where each patch is assigned to the cluster with the highest soft-assignment score (argmax over clusters). Yellow contours indicate expert annotations.
    \textbf{Middle:} Cell-type distributions computed by applying HoVer-Net~\cite{hovernet} to the top $10\%$ highest-confidence patches for each cluster, illustrating cluster-specific cellular composition.
    \textbf{Right:} Representative patches per cluster with HoVer-Net cell segmentation and classification overlays, highlighting characteristic cellular patterns captured by each cluster.}
    \label{fig:cells_test016}
\end{figure}

\clearpage

\begin{figure}[h]
    \centering
    \includegraphics[width=1.0\linewidth]{Figures/test_021_cells.png}
    \caption{\textbf{Cluster-level cellular analysis.}
    \textbf{Left:} Slide-level cluster maps where each patch is assigned to the cluster with the highest soft-assignment score (argmax over clusters). Yellow contours indicate expert annotations.
    \textbf{Middle:} Cell-type distributions computed by applying HoVer-Net~\cite{hovernet} to the top $10\%$ highest-confidence patches for each cluster, illustrating cluster-specific cellular composition.
    \textbf{Right:} Representative patches per cluster with HoVer-Net cell segmentation and classification overlays, highlighting characteristic cellular patterns captured by each cluster.}
    \label{fig:cells_test021}
\end{figure}


% \begin{table}[h]
% \centering
% \small
% \setlength{\tabcolsep}{6pt}
% \resizebox{\textwidth}{!}{\begin{tabular}{lccccccccc}
% \toprule
%  & \multicolumn{5}{c}{\textbf{Efficiency}} 
%  & \multicolumn{2}{c}{\textbf{CAMELYON16}} 
%  & \multicolumn{2}{c}{\textbf{PANDA}} \\
% \cmidrule(lr){2-6} \cmidrule(lr){7-8} \cmidrule(lr){9-10}
% \textbf{Model} & Params (k) & FLOPs (G) & Peak GPU (GiB) & Train (s) & Val (s) & AUC & ACE & $\kappa$ & ACE \\
% \midrule
% Transformer & $330$ & $1.180$ & $17.2$ & $12.3$ & $1.2$ & $.977_{.010}$ & $.028_{.012}$ & $.950_{.050}$ & $.022_{.026}$ \\
% \ours\ & $315$ & $.628$ & $.263$ & $6.2$ & $0.8$ & $.975_{.006} $& $.028_{.006} $ & $.944_{.053}$ & $.021_{.024}$ \\
% \bottomrule
% \end{tabular}}
% \caption{Comparison between CAPRMIL and a parameter-matched full self-attention Transformer. Performance metrics are reported as mean $\pm$ standard deviation.}
% \label{tab:caprmil_vs_transformer}
% \end{table}

\begin{table}[h]
\centering
\small
\setlength{\tabcolsep}{6pt}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccccccccc}
\toprule
 & \multicolumn{5}{c}{\textbf{Efficiency}} 
 & \multicolumn{2}{c}{\textbf{CAMELYON16}} 
 & \multicolumn{2}{c}{\textbf{PANDA}} 
 & \multicolumn{2}{c}{\textbf{BRACS}} \\
\cmidrule(lr){2-6} 
\cmidrule(lr){7-8} 
\cmidrule(lr){9-10}
\cmidrule(lr){11-12}
\textbf{Model} & Params (k) & FLOPs (G) & Peak GPU (GiB) & Train (s) & Val (s) & AUC & ACE & $\kappa$ & ACE & AUC & ACE \\
\midrule
Transformer+Mean & $330$ & $1.180$ & $17.2$ & $12.3$ & $1.2$ & $.977_{.010}$ & $.028_{.012}$ & $.950_{.050}$ & $.022_{.026}$ & -- & -- \\

\ours+Mean & $315$ & $.628$ & $.263$ & $6.2$ & $0.8$ & $.975_{.006}$ & $.028_{.006}$ & $.944_{.053}$ & $.021_{.024}$ & $.850_{.031}$ & $.189_{.026}$ \\
\bottomrule
\end{tabular}}
\caption{Comparison between \ours+Mean and a parameter-matched (within $5\%$) full self-attention Transformer+Mean. The full self-attention Transformer could not be evaluated on BRACS due to prohibitive memory requirements caused by large bag sizes (4k--20k).}
\label{tab:caprmil_vs_transformer}
\end{table}


