\documentclass{midl}  
\jmlryear{2026}
\jmlrworkshop{Full Paper -- MIDL 2026}
\jmlrvolume{-- nnn}
\editors{Accepted for publication at MIDL 2026}

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{soul}
\usepackage{xcolor}

\title[Hide-and-Seek Attribution]{Hide-and-Seek Attribution: Weakly Supervised Segmentation of Vertebral Metastases in CT}


\midlauthor{
\Name{Matan Atad\nametag{$^{1,2,3}$}} \orcid{0000-0001-6952-517X}
\Email{matan.atad@tum.de}\\
\Name{Alexander {W. Marka}\nametag{$^{4}$}} \orcid{0000-0002-2111-8177}
\Email{alexander.marka@tum.de}\\
\Name{Lisa Steinhelfer\nametag{$^{2}$}} \orcid{0000-0003-2452-4389}
\Email{lisa.steinhelfer@tum.de}\\
\Name{Anna {Curto-Vilalta}\nametag{$^{2,9}$}} \orcid{0000-0002-6625-3639} \Email{anna.curto-vilalta@tum.de}\\
\Name{Yannik Leonhardt\nametag{$^{4}$}} \orcid{0000-0003-0028-6654}
\Email{yannik.leonhardt@tum.de}\\
\Name{Sarah {C. Foreman}\nametag{$^{1}$}} \orcid{0000-0001-9140-0162}
\Email{sarah.foreman@tum.de}\\
\Name{{Anna-Sophia} {Walburga Dietrich}\nametag{$^{1,8}$}} \orcid{0009-0004-4341-4032}
\Email{anna-sophia.dietrich@unimedizin-ffm.de}\\
\Name{Robert Graf\nametag{$^{1,2}$}} \orcid{0000-0001-6656-3680}
\Email{robert.graf@tum.de}\\
\Name{Alexandra {S. Gersing}\nametag{$^{5}$}} \orcid{0000-0003-1687-5541}
\Email{alexandra.gersing@med.uni-muenchen.de}\\
\Name{Bjoern Menze\nametag{$^{6}$}} \orcid{0000-0003-4136-5690}
\Email{bjoern.menze@uzh.ch}\\
\Name{Daniel Rueckert\nametag{$^{2,3,7}$}} \orcid{0000-0002-5683-5889}
\Email{daniel.rueckert@tum.de}\\
\Name{Jan {S. Kirschke}\nametag{$^{1}$}} \orcid{0000-0002-7557-0003}
\Email{jan.kirschke@tum.de}\\
\Name{Hendrik Moeller\nametag{$^{1,2}$}} \orcid{0009-0001-1978-5894}
\Email{hendrik.moeller@tum.de}\\
\addr $^{1}$ Institute of Neuroradiology, TUM University Hospital, School of Medicine and Health, Technical University of Munich (TUM), Munich, Germany\\
\addr $^{2}$ Chair for AI in Healthcare and Medicine, TUM and TUM University Hospital, Munich, Germany\\
\addr $^{3}$ Munich Center for Machine Learning (MCML)\\
\addr $^{4}$ Institute of Diagnostic and Interventional Radiology, TUM University Hospital, School of Medicine and Health, Technical University of Munich (TUM), Munich, Germany\\
\addr $^{5}$ Department of Neuroradiology, University Hospital Munich (LMU), Munich, Germany\\
\addr $^{6}$ Department of Quantitative Biomedicine, University of Zurich, Switzerland\\
\addr $^{7}$ Department of Computing, Imperial College London, UK\\
\addr $^{8}$ Department of Diagnostic and Interventional Radiology, University Hospital Frankfurt, Germany\\
\addr $^{9}$ Department of Orthopedics and Sports Orthopedics, TUM University Hospital, School of Medicine and Health, Technical University of Munich (TUM), Munich, Germany\\
}


\begin{document}

\maketitle

\begin{abstract}
Accurate segmentation of vertebral metastasis in CT is clinically important yet difficult to scale, as voxel-level annotations are scarce and both lytic and blastic lesions often resemble benign degenerative changes. We introduce a 2D weakly supervised method trained solely on vertebra-level healthy/malignant labels, without any lesion masks. The method combines a Diffusion Autoencoder (DAE) that produces a classifier-guided healthy edit of each vertebra with pixel-wise difference maps that propose suspect candidate lesions. To determine which regions truly reflect malignancy, we introduce Hide-and-Seek Attribution: each candidate is revealed in turn while all others are hidden, the edited image is projected back to the data manifold by the DAE, and a latent-space classifier quantifies the isolated malignant contribution of that component. High-scoring regions form the final lytic or blastic segmentation. On held-out radiologist annotations, we achieve strong blastic/lytic performance despite no mask supervision (F1: 0.91/0.85; Dice: 0.87/0.78), exceeding baselines (F1: 0.79/0.67; Dice: 0.74/0.55). These results show that vertebra-level labels can be transformed into reliable lesion masks, demonstrating that generative editing combined with selective occlusion supports accurate weakly supervised segmentation in CT.
\end{abstract}

\begin{keywords}
Weakly supervised segmentation, spinal metastases, CT imaging
\end{keywords}

\section{Introduction}

Bone metastases are among the most frequent complications of advanced cancer, with the skeleton being the third most common metastatic site, typically arising from breast, prostate, and lung primary sites \cite{brenson2006pathophysiology}. Spinal involvement is particularly consequential: vertebral lesions can cause severe pain, mechanical instability, fractures, and spinal cord compression \cite{brenson2006pathophysiology}. Reliable monitoring is therefore essential to detect progression early and guide interventions to prevent structural failure and neurological compromise.

\begin{figure}[t]
  \centering
  \includegraphics[width=0.9\linewidth]{figures/graphical_abstract_ws_seg.pdf}
  \caption{
  Weakly supervised vertebral lesion segmentation from image-level labels. (1) Classifier-guided healthy edit (yellow) is generated in DAE latent space from the original (green). Difference maps produce suspect lesions. (2) Hide-and-Seek isolates each candidate (green contours), occludes the others (yellow), and computes normalized $\Delta$-scores. Thresholding these scores yields the final masks.
  }
  \label{fig:method_overview}
\end{figure}

Vertebral metastases are routinely evaluated on CT in standard oncologic staging, together with MRI and nuclear medicine imaging \cite{shah2011imaging}. Interpretation is difficult because malignant lesions vary in appearance and may resemble age-related findings such as trabecular rarefaction, Modic endplate changes, or Schmorl nodes \cite{oh2022diff}. Assessment is mostly qualitative because many lesions have diffuse margins and lack clearly measurable boundaries with adjacent normal tissue, degenerative changes, and benign pathologies \cite{weber2022tumors}. Metastases manifest as lytic (bone loss), blastic (bone formation), or mixed, with primary tumors showing characteristic but non-exclusive patterns (e.g., myeloma often lytic, prostate cancer often blastic, breast cancer frequently mixed) \cite{Macedo2017BoneMetastases}. On CT, lytic components appear hypodense and blastic components hyperdense \cite{brenson2006pathophysiology}. Automated segmentation can provide reproducible delineation of metastatic lesions, which is particularly important when several lesions occur in the same vertebra. Accurate masks enable downstream analyses, such as estimating vertebral stability or measuring progression, that are difficult to perform from qualitative CT assessment.

Existing methods for automated vertebral lesion segmentation are fully supervised and require costly voxel-level annotations \cite{chmelik2018deep,chang2022sclerotic,motohashi2024spine,edelmers2024ai}, which are difficult to obtain reliably even among experts. Weakly supervised strategies could reduce this annotation burden but remain limited in their ability to localize and validate malignant lesions in the presence of confounders. CAM-based methods \cite{selvaraju2017gradcam,chattopadhay2018gradcamplus,jiang2021layercam,BanyMuhammad2021EigenCAM} highlight the most discriminative pixels yet often miss multiple distinct lesions. Medical adaptations of Segment Anything (SAM) \cite{cheng2023sammed2d,ma2024medsam} require precise bounding box prompts and cannot determine pathology without finetuning. Pseudo-healthy reconstruction approaches \cite{wolleb2022diffad,guo2025unsup} detect deviations from normal anatomy, but may suppress malignant lesions that resemble degenerative changes. Classifier-guided healthy reconstructions \cite{jiang2025oct,shvetsov2024coin} provide more complete residuals, though they lack a mechanism to evaluate candidate lesions in isolation and reject correlated, non-causal deviations.

Occlusion-based attribution methods hide image regions to quantify their influence on a classifier’s output and are typically used for explainability \cite{gandomkar2022OcclusionCT,Uzunova2019interpretable,Agarwal2020explaining}. We use occlusion as a candidate-wise decision rule for segmentation: for each suspected region, all alternatives are suppressed, a pseudo-healthy image is reconstructed using a generative model, and the region’s independent contribution to malignancy is evaluated. Only regions with substantial contributions are retained. Our hypothesis is that malignant evidence is localized and that, although healthy edits introduce spurious artifacts, these artifacts are unrelated to malignancy and will therefore receive low scores and be suppressed. This enables weakly supervised, lesion-level segmentation without requiring dense ground-truth masks.

Our contributions are as follows:
(1) We introduce the first 2D weakly supervised approach for lesion-level segmentation for vertebral metastases, a clinically important and technically challenging task, achieving Dice scores of 0.87 (blastic) and 0.78 (lytic).
(2) We propose \emph{Hide-and-Seek Attribution}, a method that converts classifier-guided pseudo-healthy reconstructions into lesion masks by explicitly testing candidate regions and retaining only those that independently contribute to the malignancy score.
(3) We benchmark our method against multiple weakly supervised baselines and analyze its limitations. Our code is available at \url{https://github.com/matanat/hide_and_seek}.

\section{Related Work}

\paragraph{Weakly supervised segmentation in medical imaging.}
Segmentation based on \allowbreak image-level labels has been widely explored as an alternative to tedious voxel-level annotation in medical imaging. Most methods rely on classifier-derived Class Activation Maps (CAMs) \cite{selvaraju2017gradcam,chattopadhay2018gradcamplus,jiang2021layercam,BanyMuhammad2021EigenCAM} to generate coarse pseudo-masks, which are then refined and used to train fully supervised segmentation models \cite{chen2022ccam,viniavskyi2020wssx,yoon2024difws}. Nevertheless, CAMs remain intrinsically sparse, tending to highlight only the most discriminative pixels rather than all occurrences of a visual concept of interest. 

Adaptations of Segment Anything (SAM) \cite{kirillov2023sam} have also been applied for medical tasks \cite{cheng2023sammed2d,ma2024medsam}. These models perform well when provided with tight bounding-box prompts, but cannot determine whether a region is pathological without finetuning, restricting their utility.

Generative anomaly detection approaches reconstruct pseudo-healthy images and detect abnormalities via residuals \cite{wolleb2022diffad,guo2025unsup,jiang2025oct}. While effective for highlighting deviations from normal anatomy, they cannot distinguish the target pathology from benign co-occurring changes. \citet{shvetsov2024coin} employ conditional generative edits to derive kidney tumor masks from reconstruction residuals, assuming direct correspondence between residuals and lesions. In contrast, our method does not interpret residuals as lesions, but treats them as candidates that must be explicitly validated.
Recently, \citet{mehta2025cf} applied edits to increase robustness in fully supervised organ segmentation. Their setting differs fundamentally from ours, as our goal is to segment the metastatic tissue itself.

Multiple-instance learning (MIL) offers another paradigm by optimizing bag-level classification to highlight multiple regions \cite{nedra2024milb,pan2025mamba}. However, MIL infers instance relevance only in the context of the full bag, so scores do not reflect a candidate’s standalone contribution. Our method evaluates each candidate in isolation using healthy edits, enabling per-candidate attribution that MIL does not provide.

\paragraph{Spinal lesion segmentation in CT.}

Existing work on spinal metastasis segmentation is exclusively based on full supervision. Multiple studies report only moderate Dice scores \cite{chmelik2018deep,motohashi2024spine,chang2022sclerotic,edelmers2024ai}.\footnote{\citet{chang2022sclerotic} reported a Dice of 0.83 for sclerotic (blastic) lesions. \citet{edelmers2024ai} reported Dice of 0.71 for lytic and 0.61 for blastic lesions with fully supervised training.} These results suggest that vertebral lesions remain difficult to segment reliably even under strong supervision. Longitudinal approaches \cite{onoue2021temporal,sanhinova2024register} detect interval changes but require multi-timepoint imaging and cannot distinguish malignancy from other developed disease. Methods occasionally described as ``weakly supervised'' \cite{sheng2024weakly} still rely on manually corrected masks and therefore do not correspond to the setting considered here. To our knowledge, weakly supervised segmentation of spinal metastases has not been previously explored.

\section{Dataset}

CT scans from 440 patients were collected at the TUM University Hospital (mean age $67.5 \pm 12.9$ years; 211 female). Each scan was reviewed by a radiologist with expertise in spinal imaging (A.S.W.D., 10 years of experience) who assigned vertebra-level labels (healthy vs.\ malignant)\footnote{Labeling protocol described in \citet{foreman2024deep}.}. Vertebrae with fractures were excluded, as lesion delineation in fractured vertebrae is highly challenging on CT \cite{foreman2024deep}. Because cervical levels appeared only in a minority of scans, this study focuses on the thoracic and lumbar spine. 

The dataset was divided into three parts (\tableref{tab:dataset}): 
(1) The generative model was trained on 2D sagittal slices extracted from all 5,644 vertebrae remaining after exclusions, without using any labels.
(2) A classifier-training subset of 565 vertebrae (300 healthy, 265 malignant) was used to learn the latent healthy–malignant direction. For malignant cases in this subset, a radiologist confirmed the presence of a lesion in the central sagittal slice.
(3) Quantitative evaluation was performed on a held-out test set of 17 patients (94 vertebrae with a malignant lesion visible in the central sagittal slice: 50 blastic, 16 lytic, and 28 mixed vertebrae). Two radiologists (L.S. and A.W.M. with 6 and 4 years of experience, respectively; A.W.M. supervised by Y.L., a board-certified radiologist) independently segmented all malignant lesions within these vertebrae: the first set (A) served as ground truth for all evaluations, and the second (B) was used for inter-rater agreement.
\begin{table}[htbp]
\floatconts
  {tab:dataset}%
  {\caption{Data usage in this study}}%
  {%
  \resizebox{\linewidth}{!}{%
  \begin{tabular}{lccc}
  \bfseries Feature 
      & \bfseries Full dataset
      & \bfseries Classifier subset
      & \bfseries Test set (held-out) \\ \hline
  Vertebrae 
      & 5,644 
      & 565 
      & 94 \\
  Healthy / malignant 
      & 4,972 / 672 
      & 300 / 265 
      & 0 / 94 \\
  Blastic / lytic / mixed
      & unknown
      & unknown
      & 50 / 16 / 28 \\
  Used annotations 
      & none
      & Image-level (healthy/malignant)
      & Pixel-level (lytic/blastic) \\
  Purpose 
      & Train DAE 
      & Train classifier 
      & Evaluation \\
  \end{tabular}}
  }
\end{table}

\section{Methods}

We propose a weakly supervised method that segments vertebral lesions in 2D by testing the malignant contribution of each suspect region with a classifier (\autoref{fig:method_overview}). As input, we use the central sagittal slice extracted from each CT volume. We first reconstruct a healthy approximation of this slice with a DAE and derive lesions as residual differences. Each candidate region is then evaluated independently using Hide-and-Seek Attribution: all other candidates are masked in the original image, the edited image is reconstructed and re-encoded, and the resulting latent is scored by the classifier. The derived malignancy score reflects how strongly the candidate elevates the malignant signal, enabling retention of only true lytic or blastic regions without pixel-level supervision.

\paragraph{Unsupervised pretraining.}

A Diffusion Autoencoder (DAE) \citep{preechakul2021diffusion} is used to obtain a semantic latent representation of input CT slices. The model consists of a semantic encoder that maps an input image to a latent vector $\vec{z}_{\text{sem}}$ and a diffusion-based decoder that reconstructs the image from this latent; both are trained jointly end-to-end. The resulting latent space has been shown to capture anatomical structure in an approximately linear and interpretable form \citep{preechakul2021diffusion}. Training is fully unsupervised and uses 2D mid-sagittal vertebral CT slices without lesion labels or masks.

\paragraph{Healthy edit.}

A logistic regression fitted to $\vec{z}_{\text{sem}}$ provides a semantic direction separating healthy from malignant vertebrae. Its output $c(\cdot)\rightarrow[0,1]$ represents the malignancy probability. Because the classifier is linear, its decision boundary is the hyperplane $\vec{n}\cdot\vec{z} + b = 0$, where $\vec{n}$ is the learned normal vector and $b$ is a bias term. The signed distance of a latent $\vec{z}$ to this hyperplane is given by 
$\text{dist}(\vec{z}) = (\vec{n}\cdot \vec{z} + b) / \|\vec{n}\|$. To obtain a \emph{healthy reconstruction}, we compute a closed-form edit of the latent that moves it to a predefined logit value. We convert a small target probability $p_{\text{target}}\!\approx\!0$ into its corresponding logit $\ell_{\text{target}} = \log\bigl(p_{\text{target}} / (1 - p_{\text{target}})\bigr)$, express this logit as a signed distance $d_{\text{target}} = \ell_{\text{target}} / \|\vec{n}\|$, and then move the original latent to that distance along the classifier’s normal direction:
\begin{align}
\label{eq:healthy_recon}
\vec{z}_{\text{healthy}}
= \vec{z}
 - \bigl(\text{dist}(\vec{z}) - d_{\text{target}}\bigr)
   \frac{\vec{n}}{\|\vec{n}\|}\ .
\end{align}
This transformation suppresses the malignancy-associated component encoded in the latent representation and potentially other entangled factors \cite{atad2024counterfactual} while preserving the vertebra’s anatomical identity.

\paragraph{Hide-and-Seek Attribution.}

Given a malignant vertebral slice $I$ with latent $\vec{z}$, Hide{-}and{-}Seek Attribution begins by generating a healthy reconstruction $I_{\text{healthy}}$ from the edited latent $\vec{z}_{\text{healthy}}$\footnote{Algorithm pseudocode is provided in appendix~\ref{appendix:alg}.}. Candidate abnormalities are identified from the residual map $D = I_{\text{healthy}} - I$,
which is decomposed into $D^{+} = \max(D, 0)$ and 
$D^{-} = \max(-D, 0)$, capturing regions that the healthy edit brightens (lytic-like, hypodense in $I$) and darkens (blastic-like, hyperdense in $I$), respectively. Each difference map is binarized using the per-image mean intensity. Connected-component analysis is then applied to the binary result to obtain candidate lesion regions, whose boundaries are used as mask proposals.

For each suspected component $M$, we isolate its effect by hiding all other candidates. Let $\Omega_M$ be the pixel set of $M$ and $\Omega_{\text{others}}$ the union of all remaining suspected regions. Then:
\begin{align}
\label{eq:hide}
I_{\text{hide}}(M)(x) =
\begin{cases}
I(x),                & x \in \Omega_M,\\[4pt]
I_{\text{healthy}}(x), & x \in \Omega_{\text{others}},\\[4pt]
I(x),                & \text{otherwise},
\end{cases}
\end{align}
i.e., the appearance of $M$ is preserved while all other suspected regions are replaced by their healthy reconstruction. We further reconstruct and encode $I_{\text{hide}}(M)$ into $\vec{z}_{\text{hide}}(M)$, projecting the edited image back onto the data manifold, to obtain an anatomically plausible version of the occluded slice. 

Finally, the malignancy score of component $M$ is defined as:
\begin{align}
\Delta(M)
&=
\frac{
c\!\left(\vec{z}_{\text{hide}}(M)\right)
-
c\!\left(\vec{z}_{\text{healthy}}\right)
}{
c(\vec{z}) - c\!\left(\vec{z}_{\text{healthy}}\right) + \varepsilon},
\label{eq:delta_prob}
\end{align}
where $\varepsilon$ ensures numerical stability. The score $\Delta(M)\ge 0$ quantifies the fraction of the original malignancy probability linked to component $M$: values near $1$ indicate that $M$ explains most of the malignant evidence, whereas values near $0$ reflect negligible influence, which gives the score a direct semantic interpretation. Components satisfying $\Delta(M)\ge\tau$ for some $\tau>0$ are retained, with $D^{+}$ yielding the lytic mask and $D^{-}$ the blastic mask. For mixed vertebrae, both $D^{+}$ and $D^{-}$ components are retained and their corresponding masks are merged. Because $D^{+}$ and $D^{-}$ have disjoint supports by construction, the resulting masks do not overlap.

\paragraph{Implementation details.}
All CT scans were resampled to 0.8 mm isotropic resolution. Vertebral structures were segmented using SpineR (Bonescreen GmbH\footnote{\url{https://www.bonescreen.de}}), providing both vertebral masks and anatomical level labels. For each vertebra, a $64\times64\times64$ crop centered on the vertebral body was extracted, and a single sagittal slice from the central five was used as input to the DAE. The DAE was trained for 2206 epochs (93h 41m) on a single NVIDIA A40 GPU using the implementation of \citet{preechakul2021diffusion}. Healthy edits were obtained using a logistic regression classifier trained on semantic latents (F1: 0.90, AUC: 0.97). During inference, the segmentation pipeline is applied only to vertebrae predicted by the classifier to contain a malignant lesion in the central slice. This requires $69.6 \pm 22.2$ seconds on average per vertebra on the same hardware.

Difference maps are binarized using a per-image mean threshold, and $\Delta$ is thresholded at a fixed value of $\tau = 0.5$, chosen \emph{a priori} and not tuned. Additional preprocessing and training details are provided in appendix~\ref{appendix:hyperparam}. For fair comparison across methods, predictions are restricted to the vertebral body: the corpus mask from preprocessing is eroded to avoid boundary spillover, and only pixels within this region are retained. Finally, predicted components smaller than 5 pixels are removed uniformly across methods to suppress isolated noise.

\paragraph{Evaluation metrics.}

Predictions are evaluated using Panoptica \cite{kofler2023panoptica}. Predicted and reference lesions are matched using positive-Dice assignment with a many-to-one scheme to avoid penalizing over-segmentation. \textbf{Detection F1} (RQ) quantifies lesion detection: a lesion is considered detected if any prediction overlaps it ($\text{Dice} > 0$), and the F1-score summarizes precision and recall across all lesions. \textbf{Instance Dice} (SQ) is computed only for matched lesion pairs and reports the mean Dice overlap of correctly detected lesions. \textbf{Panoptic Dice} ($\text{PQ}_D$) integrates detection and segmentation as $\text{PQ}_D = \text{RQ} \times \text{SQ}$. \textbf{ASSD} is computed per matched pair to assess boundary accuracy and averaged across lesions. \textbf{Global Dice} is a pixel-wise Dice over the entire vertebral body, obtained by merging all predicted components and comparing them to the merged ground-truth mask. Metrics are reported separately for blastic, lytic, and mixed lesions, with mixed vertebrae evaluated as their own category since most baselines cannot distinguish lesion subtypes. For statistical comparison, per-vertebra metrics were used for paired analyses between our method and the strongest baseline. Median differences (Ours $-$ Baseline) are reported together with 95\% confidence intervals estimated via non-parametric bootstrap resampling, and statistical significance was assessed using paired Wilcoxon signed-rank tests. We define statistical significance as $p < 0.01$.

\section{Results}
We evaluate performance across baseline families, inter-rater agreement, and lesion-wise behavior of the proposed method.

\paragraph{Baseline methods.}
The proposed method was compared to representative weakly supervised segmentation methods spanning intensity-based, attribution-based, foundation-model, and anomaly-detection approaches. Specifically, we include naive Otsu thresholding, GradCAM / GradCAM++ / LayerCAM / EigenCAM \cite{selvaraju2017gradcam,chattopadhay2018gradcamplus,jiang2021layercam,BanyMuhammad2021EigenCAM}, MedSAM \cite{ma2024medsam}, and pseudo-healthy anomaly detection (AD). Baseline implementation details are provided in appendix~\ref{appendix:baselines}.

\paragraph{Inter-rater agreement.}
To characterize the inherent difficulty of the task, annotations from a second radiologist (B) were compared against the primary rater (A) (Tables~\ref{tab:interrater_metrics} and \ref{tab:interrater_counts} in the appendix). Agreement was high for blastic lesions (Detection F1 0.84, Instance Dice 0.81) but substantially lower for lytic lesions (Detection F1 0.51, Instance Dice 0.51), with B missing 25\% of A’s lesions and marking 45\% additional findings. ASSD remained low for both phenotypes (1.19 and 1.00, respectively), indicating that when both raters identified the same lesion, their boundary placement was largely consistent.

\paragraph{Overall segmentation performance.}
Segmentation accuracy across lesion phenotypes is summarized in Table~\ref{tab:main_metrics}. For blastic lesions, the proposed method reaches a Detection F1 of $0.91 \pm 0.17$ (vs.\ $0.79 \pm 0.23$ for the best CAM baseline and $0.72 \pm 0.28$ for AD), with Instance Dice increasing from $0.74 \pm 0.11$ (Otsu) to $0.87 \pm 0.16$ and Global Dice from $0.74 \pm 0.08$ to $0.88 \pm 0.14$, alongside the lowest ASSD (all improvements with $p < 0.001$). For lytic lesions, Detection F1 rises from $0.67 \pm 0.23$ (MedSAM) to $0.85 \pm 0.21$, and Instance Dice from $0.55 \pm 0.23$ (Otsu) to $0.78 \pm 0.25$, with the lowest ASSD and highest Global Dice among all methods (all improvements with $p < 0.01$). Mixed lesions are challenging for all approaches, show lower performance differences, and none reach statistical significance.\footnote{Full confidence intervals and p-values are in Table~\ref{tab:stat_significance} in the appendix.}

\begin{table}[htbp]
\floatconts
  {tab:main_metrics}%
    {\caption{Instance-level and global segmentation metrics (mean $\pm$ SD) for Otsu, CAM-based baselines, MedSAM, anomaly detection (AD), and our method. Arrows indicate the preferred direction; bold values denote the best score per metric. Asterisks indicate paired Wilcoxon signed-rank tests against the strongest baseline ($^{**}: p<0.01$, $^{***}: p<0.001$).}}%
  {
  \resizebox{\linewidth}{!}{%
  \begin{tabular}{l cccccccc}
    \bfseries Metric 
      & \bfseries Otsu 
      & \bfseries GradCAM 
      & \bfseries GradCAM++ 
      & \bfseries LayerCAM 
      & \bfseries EigenCAM
      & \bfseries MedSAM
      & \bfseries AD
      & \bfseries \textbf{Ours} \\ \hline

    % ============================================================
    \multicolumn{9}{l}{\textit{Blastic lesions}} \\[2pt]

    Detection F1 (RQ) $\uparrow$             
      & 0.68 $\pm$ 0.24 
      & 0.79 $\pm$ 0.23 
      & 0.77 $\pm$ 0.28 
      & 0.75 $\pm$ 0.24 
      & 0.68 $\pm$ 0.34
      & 0.64 $\pm$ 0.28
      & 0.72 $\pm$ 0.28
      & \textbf{0.91 $\pm$ 0.17}$^{***}$\\

    Instance Dice (SQ) $\uparrow$           
      & 0.74 $\pm$ 0.11 
      & 0.43 $\pm$ 0.19 
      & 0.43 $\pm$ 0.20 
      & 0.39 $\pm$ 0.18 
      & 0.38 $\pm$ 0.22
      & 0.44 $\pm$ 0.22
      & 0.53 $\pm$ 0.22
      & \textbf{0.87 $\pm$ 0.16}$^{***}$ \\

    Panoptic Dice (PQ$_D$) $\uparrow$
      & 0.49 $\pm$ 0.18 
      & 0.34 $\pm$ 0.19 
      & 0.35 $\pm$ 0.20 
      & 0.28 $\pm$ 0.13 
      & 0.31 $\pm$ 0.21
      & 0.31 $\pm$ 0.21
      & 0.38 $\pm$ 0.20
      & \textbf{0.80 $\pm$ 0.24}$^{***}$ \\

    ASSD $\downarrow$              
      & 1.36 $\pm$ 0.63 
      & 4.50 $\pm$ 2.12 
      & 4.71 $\pm$ 2.02 
      & 4.35 $\pm$ 1.78 
      & 4.68 $\pm$ 1.94
      & 4.07 $\pm$ 1.98
      & 2.75 $\pm$ 1.75
      & \textbf{0.89 $\pm$ 1.17}$^{***}$ \\

    Global Dice $\uparrow$         
      & 0.74 $\pm$ 0.08 
      & 0.42 $\pm$ 0.18 
      & 0.41 $\pm$ 0.20 
      & 0.36 $\pm$ 0.15 
      & 0.37 $\pm$ 0.20
      & 0.45 $\pm$ 0.22
      & 0.54 $\pm$ 0.22
      & \textbf{0.88 $\pm$ 0.14}$^{***}$ \\

    \hline
    % ============================================================
    \multicolumn{9}{l}{\textit{Lytic lesions}} \\[2pt]

     Detection F1 (RQ) $\uparrow$             
      & 0.64 $\pm$ 0.23 
      & 0.56 $\pm$ 0.33 
      & 0.53 $\pm$ 0.31 
      & 0.54 $\pm$ 0.25 
      & 0.43 $\pm$ 0.32
      & 0.67 $\pm$ 0.23
      & 0.32 $\pm$ 0.38
      & \textbf{0.85 $\pm$ 0.21}$^{***}$ \\

    Instance Dice (SQ) $\uparrow$           
      & 0.55 $\pm$ 0.23 
      & 0.25 $\pm$ 0.17 
      & 0.30 $\pm$ 0.21 
      & 0.32 $\pm$ 0.16 
      & 0.27 $\pm$ 0.22
      & 0.54 $\pm$ 0.19
      & 0.24 $\pm$ 0.28
      & \textbf{0.78 $\pm$ 0.25}$^{***}$ \\

    Panoptic Dice (PQ$_D$) $\uparrow$
      & 0.35 $\pm$ 0.18 
      & 0.14 $\pm$ 0.10 
      & 0.17 $\pm$ 0.15 
      & 0.17 $\pm$ 0.08 
      & 0.16 $\pm$ 0.21
      & 0.35 $\pm$ 0.15
      & 0.17 $\pm$ 0.19
      & \textbf{0.69 $\pm$ 0.33}$^{***}$ \\

    ASSD $\downarrow$              
      & 2.96 $\pm$ 2.29 
      & 4.35 $\pm$ 1.54 
      & 6.70 $\pm$ 2.85 
      & 4.16 $\pm$ 1.28 
      & 5.24 $\pm$ 1.42
      & 3.01 $\pm$ 2.13
      & 1.89 $\pm$ 0.69
      & \textbf{0.97 $\pm$ 1.19}$^{**}$ \\

    Global Dice $\uparrow$         
      & 0.56 $\pm$ 0.19 
      & 0.21 $\pm$ 0.16 
      & 0.33 $\pm$ 0.18 
      & 0.27 $\pm$ 0.13 
      & 0.31 $\pm$ 0.22
      & 0.52 $\pm$ 0.16
      & 0.19 $\pm$ 0.23
      & \textbf{0.75 $\pm$ 0.27}$^{***}$ \\

    \hline
    % ============================================================
    \multicolumn{9}{l}{\textit{Mixed lesions}} \\[2pt]

    Detection F1 (RQ) $\uparrow$             
      & 0.48 $\pm$ 0.06 
      & \textbf{0.74 $\pm$ 0.21} 
      & 0.58 $\pm$ 0.33 
      & 0.68 $\pm$ 0.31 
      & \textbf{0.74 $\pm$ 0.21}
      & 0.66 $\pm$ 0.28
      & 0.53 $\pm$ 0.08
      & 0.64 $\pm$ 0.19 \\

    Instance Dice (SQ) $\uparrow$           
      & 0.33 $\pm$ 0.13 
      & 0.46 $\pm$ 0.17 
      & 0.35 $\pm$ 0.23 
      & 0.36 $\pm$ 0.21 
      & 0.46 $\pm$ 0.21
      & 0.37 $\pm$ 0.18
      & 0.42 $\pm$ 0.14
      & \textbf{0.62 $\pm$ 0.18} \\

    Panoptic Dice (PQ$_D$) $\uparrow$
      & 0.20 $\pm$ 0.06 
      & 0.35 $\pm$ 0.18 
      & 0.25 $\pm$ 0.19 
      & 0.27 $\pm$ 0.17 
      & 0.35 $\pm$ 0.21
      & 0.25 $\pm$ 0.22
      & 0.24 $\pm$ 0.10
      & \textbf{0.56 $\pm$ 0.15} \\

    ASSD $\downarrow$              
      & 3.63 $\pm$ 1.69 
      & 5.11 $\pm$ 1.57 
      & 4.68 $\pm$ 1.74 
      & 4.67 $\pm$ 1.71 
      & 4.75 $\pm$ 1.93
      & 3.72 $\pm$ 1.35
      & 2.88 $\pm$ 1.14
      & \textbf{1.02 $\pm$ 0.53} \\
      
    Global Dice $\uparrow$         
      & 0.25 $\pm$ 0.14 
      & 0.48 $\pm$ 0.16 
      & 0.33 $\pm$ 0.23 
      & 0.35 $\pm$ 0.20 
      & 0.46 $\pm$ 0.19
      & 0.32 $\pm$ 0.21
      & 0.29 $\pm$ 0.01
      & \textbf{0.62 $\pm$ 0.20} \\

  \end{tabular}}}
\end{table}


\paragraph{Qualitative comparison.}
Four representative cases are shown in Fig.~\ref{fig:qualitative}: two where the method performs well and two that expose its limitations and typical failure modes of baselines\footnote{Additional examples and metrics appear in appendix~\ref{appendix:additional}.}. Otsu produces large intensity-driven masks, useful with strong contrast (2) but prone to oversegmenting bright or noisy tissue (1, 3, 4). CAM-based methods yield coarse, discriminative activations, often capturing only part of a lesion (2, 4), with GradCAM++ highlighting small fragments (2) or responses across the vertebra (4). LayerCAM permits multiple components, but secondary responses reflect classifier bias toward benign findings, such as vertebral endplates or curvature. MedSAM frequently segments dark marrow or the entire vertebral body. AD produces fragmented residuals due to reconstruction errors and misses lytic lesions resembling fatty marrow conversion (2). The proposed method provides anatomically aligned masks across all rows, capturing diffuse blastic spread (1) and the major extent of a large lytic lesion (2). Remaining limitations include small holes in blastic regions (3, 4) and missed lytic components due to size filtering (4). Qualitative evaluation was conducted with radiologist guidance.

\begin{figure}[t]
  \centering
  \includegraphics[width=1\linewidth]{figures/panel_rows.pdf}
  \caption{Qualitative comparison on four vertebral CT slices (rows). Columns show the input image, ground truth, evaluated baselines and our method. Blastic lesions are shown in blue, lytic in red. CAM-based results are thresholded heatmaps. The examples: (1) a diffuse blastic lesion with a bright focus, (2) a large lytic lesion with cortical breakthrough, (3) a blastic lesion in a grainy scan with imaging artifacts, and (4) a mixed case with a small lytic focus. Arrows indicate features referenced in the text.
  }
  \label{fig:qualitative}
\end{figure}

\paragraph{Phenotype-specific performance.}
We stratify inferred masks by lesion type (appendix \ref{appendix:delta}). Lesion-wise precision–recall curves versus the $\Delta$-score show a clear disparity: blastic lesions follow a clean precision–recall profile, whereas lytic lesions exhibit markedly noisier behavior. Lesion sizes follow the same pattern: blastic true positives are typically large (mean $225.9$ pixels), while lytic true positives are substantially smaller (mean $52.4$), and most lytic false positives are very small (mean $14.3$).

\paragraph{Ablation studies.}
We perform a series of ablation experiments to assess the contribution and reliability of the individual components of the proposed pipeline.
(1) To assess the role of Hide-and-Seek Attribution, we first evaluated a variant that uses the residuals directly, without candidate isolation or $\Delta$-score rating, which resulted in substantially degraded performance (appendix~\ref{appendix:delta_vs_prob}). We further analyze the behavior of the $\Delta$-score across decision thresholds and observe smooth variation in the corresponding ROC curves, indicating that the choice of $\tau$ does not induce abrupt changes. We also compared the $\Delta$-score with the classifier’s raw malignancy probability: the $\Delta$-score achieved higher ROC-AUC for both blastic (0.93 vs.\ 0.90) and lytic lesions (0.62 vs.\ 0.50).
(2) The method evaluates candidate lesions independently, assuming that malignant evidence is localized. This raises the question of whether lesions could exist that carry little signal individually but become malignant only when combined with others\footnote{This concern is practically relevant in the test set, where 46 out of 87 vertebrae contain multiple lesions.}. To assess such joint effects, we analyzed pairwise combinations of predicted lesions in multi-lesion vertebrae (appendix~\ref{appendix:interactions}), and found that these effects are rare and small.
(3) Initial lesion candidates are derived via per-image mean binarization of the difference maps. To assess whether false negatives are attributable to this design choice, we measured lesion coverage, defined as whether any initial candidate overlaps a ground-truth lesion, and observed similar average coverage compared to alternative standard heuristics (mean 0.90, median 0.88, Otsu 0.89; appendix~\ref{appendix:initial}).
(4) We verified that the DAE reconstruction and healthy-edit process preserves vertebral anatomy (appendix~\ref{appendix:recon}): reconstruction similarity was high (LPIPS 0.07, SSIM 0.97), and healthy edits remained structurally consistent with the originals (LPIPS 0.12, SSIM 0.83). Finally, appendix~\ref{appendix:projection} illustrates that projecting occluded images through the DAE removes masking artifacts, ensuring that downstream classifier responses are driven by anatomical content rather than by occlusion patterns.

\section{Discussion}

This work addresses 2D vertebral metastasis segmentation using only vertebra-level labels, a setting in which existing weakly supervised methods often miss lesion extent or confuse malignant and benign patterns. The proposed combination of DAE-based healthy reconstruction and Hide-and-Seek attribution evaluates each candidate region independently, enabling lesion-level segmentation of lytic and blastic components without voxel-level supervision. The method identifies multiple lesions per vertebra and outperforms representative weakly supervised baselines across major metrics.

\paragraph{Comparison to baselines.}
The results highlight characteristic failure modes of intensity, CAM, foundation-model, and anomaly-detection baselines (Table~\ref{tab:main_metrics} and Fig.~\ref{fig:qualitative}). \textbf{Intensity-based thresholding} (Otsu) depends solely on voxel contrast and consequently oversegments bright or noisy tissue. \textbf{CAM-based approaches} are constrained by
classifier discriminative capacity, highlighting only the most class-informative pixels rather than the full lesion extent. This matches their low Instance Dice and high ASSD variance, reflecting unstable and partial localization. LayerCAM can produce multiple disconnected
masks, but secondary activations frequently reflect classifier biases toward benign structures. \textbf{MedSAM}, lacking malignancy semantics, tends to segment dark marrow or bright bone nonspecifically. \textbf{AD} relies on deviations from a healthy manifold and is therefore sensitive to reconstruction noise and prone to suppressing lytic lesions that resemble benign marrow patterns. Because a healthy-only generative model cannot reintroduce lesion features, AD cannot provide region-wise contribution testing.

\paragraph{Phenotype-specific considerations.}
The weaker performance on lytic lesions is consistent with their clinical ambiguity. Inter-rater agreement is substantially lower for lytic disease, reflecting its heterogeneous appearance and similarity to benign processes \cite{oh2022diff}. The classifier also receives weaker global evidence: lytic vertebrae show markedly lower malignant probabilities compared with blastic ones (appendix~\ref{appendix:delta}). Combined with the predominance of very small candidate components, these factors explain the noisier $\Delta$-score behavior and the reduced separability observed for lytic lesions. Mixed lesions show weaker and less consistent performance across methods, which may be related to the heterogeneous nature of these disease patterns.

\paragraph{Lesion interactions.}
An implicit assumption of the proposed approach is that malignant evidence can be attributed to individual lesion candidates. A potential concern is that this formulation might fail in cases where malignancy is expressed only through the joint presence of multiple lesions. Our analysis (appendix~\ref{appendix:interactions}) suggests that this scenario is uncommon: joint effects beyond the strongest individual candidate are typically small. This supports the use of independent candidate evaluation for lesion selection and indicates that strong inter-lesion interactions are unlikely to be a dominant failure mode.

\paragraph{Failures and limitations.}
Several limitations remain. (1) The editing trajectory and malignancy scoring depend on a classifier-derived semantic direction. When malignant evidence is weak, as often observed in lytic disease, both edits and $\Delta$-scores become less reliable, consistent with lower classification probabilities and reduced detection performance. Conditional diffusion settings that directly generate healthy and malignant variants under explicit conditioning may reduce this dependency \citep{zhang2023controlnet}. (2) The method further relies on contrast-polarity separation (D$^{+}$/D$^{-}$) and latent edits. Editing can introduce artifacts, particularly in very bright blastic regions, occasionally producing holed masks; post-processing may mitigate this \citep{jiang2025oct}. When benign artifacts are spatially contiguous with true malignancy and share intensity characteristics, connected-component aggregation may merge them. Mixed lesions remain difficult because lytic and sclerotic components are often interwoven, making D$^{+}$/D$^{-}$ an imperfect decomposition. Addressing mixed disease likely requires phenotype-aware editing rather than intensity-based separation. (3) The method assumes a patient-level phenotype label (lytic, blastic, mixed) derived from the primary cancer. As these labels determine which polarity is retained, mis-specified phenotypes can suppress true lesions or amplify false positives. A vertebra-level phenotype classifier could eliminate this dependency. 

\paragraph{Clinical relevance and impact.}
Reliable delineation of vertebral metastases can support assessment of lesion burden, structural stability (e.g., SINS parameters \cite{Fourney2011SINS}), canal compromise, and longitudinal progression. These tasks are difficult when multiple lesions coexist within a vertebra and qualitative comparison is inconsistent and time-consuming. Producing lesion-level masks from weak supervision provides a basis to standardize such assessments and enable quantitative measurements in settings where RECIST diameter criteria do not apply \cite{Eisenhauer2009RECIST}. The framework also yields lesion-wise interpretability: each suspect is evaluated by its independent contribution to the malignancy score, producing calibrated attributions rather than diffuse heatmaps and extending counterfactual-style reasoning from classification to segmentation \cite{atad2024counterfactual}. 
Many of these applications require 3D context, and the current formulation should therefore be viewed as an enabling component rather than a standalone diagnostic tool.

\section{Conclusion and outlook}

We presented a weakly supervised method for segmenting lytic and blastic vertebral metastases using only vertebra-level labels, combining diffusion-based healthy editing with a Hide-and-Seek strategy to isolate lesion-specific contributions. The approach outperforms representative weakly supervised baselines and provides anatomically grounded, lesion-level attribution, demonstrating that generative editing can yield accurate segmentation without voxel-level supervision. The proposed approach is not intended to replace manual voxel-level annotation, but to enable lesion-level analysis and downstream modeling in settings where such supervision is unavailable.

Several extensions could further strengthen the framework. 
While the current method operates on central sagittal slices, extending the approach to 3D volumes is an important direction for future work and would improve coverage beyond the central plane. Incorporating pedicles or posterior elements may further improve anatomical coverage and reliability in multilevel disease. Conditioning the generative model on malignancy phenotype could reduce dependence on a separately trained classifier, and a phenotype classifier would address the coarse assumption that each patient presents with a single dominant lesion type, as clinical cases often show mixed patterns across vertebrae. Finally, external validation on multi-center CT data will be essential to assess robustness.

\clearpage  
\midlacknowledgments{This work received funding from the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) – Grant number 283653538 and the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation program (101045128-iBack-epic—ERC2021-COG). J.S.K. is cofounder and shareholder of Bonescreen GmbH. The other co-authors are not employees, cofounders, or shareholders of Bonescreen GmbH.}

\bibliography{midl26_78}

\appendix

\newpage

\section{Appendix}
\subsection{Implementation details}
  
  \subsubsection{Hide-and-Seek attribution algorithm}\label{appendix:alg}

Alg.~\ref{alg:hide} illustrates the procedure for identifying lytic lesions. Blastic lesions are derived in the same manner, except that the negative polarity of the difference map is used. For mixed lesions, the process is run in parallel and the resulting maps are merged.
  
  \begin{algorithm2e}
\caption{Hide-and-Seek attribution (lytic lesions example)}

\KwIn{Image $I$, semantic latent $\vec{z}$, classifier $c(\cdot)$, threshold $\tau$}
\KwOut{Lytic segmentation mask $M_{\text{lytic}}$}

\BlankLine
$\vec{z}_{\text{healthy}} \leftarrow \text{Eq.~\eqref{eq:healthy_recon}}; \quad I_{\text{healthy}} \leftarrow \mathrm{DAE}(\vec{z}_{\text{healthy}})$\tcp*{Healthy edit}

\BlankLine
$D \leftarrow I - I_{\text{healthy}}; \quad D^{+} \leftarrow \max(D, 0)$\tcp*{Difference map}

\BlankLine
$\{M^k\} \leftarrow \mathrm{ConnectedComponents}\bigl(\mathrm{Threshold}(D^{+})\bigr)$\tcp*{Lytic candidates}

\BlankLine
Initialize $C_{\text{lytic}} \leftarrow \emptyset$\;

\For{each mask $M^k$}{
  $I_{\text{hide}} \leftarrow \text{Eq.~\eqref{eq:hide}}$\;
  $\vec{z}_{\text{hide}}(M^k) \leftarrow \mathrm{DAE}_{\text{enc}}(I_{\text{hide}})$\tcp*{DAE reconstruction + encoding}
  $\Delta(M^k) \leftarrow \text{as in Eq.~\eqref{eq:delta_prob}}$\;
  \If{$\Delta(M^k) \ge \tau$}{
    Append $M^k$ to $C_{\text{lytic}}$\;
  }
}

\BlankLine
$M_{\text{lytic}} \leftarrow \bigvee_{M^k \in C_{\text{lytic}}} M^k$\tcp*{Final lytic mask}

\Return{$M_{\text{lytic}}$}\;
\label{alg:hide}
\end{algorithm2e}

\newpage

\subsubsection{DAE training and hyperparameters}\label{appendix:hyperparam}
\tableref{tab:hparams} provides hyperparameters used per step in the pipeline.

\begin{table}[htbp]
\centering
\caption{Preprocessing, augmentation, DAE training and postprocessing hyperparameters}
  {\begin{tabular}{l l}
  \bfseries Parameter & \bfseries Value \\ \hline
  Preprocessing and augmentations & \\ \hline
  Target voxel spacing & $0.8 \times 0.8 \times 0.8~\mathrm{mm}$ \\
  Crop size & $64 \times 64 \times 64$ voxels \\
  Orientation & PSR \\
  Crop & Vertebra body mask + 5 voxel margin \\
  Slice & Center crop to 5 sagittal slices, then random 1 slice \\
  Augmentation probability & 0.5 \\
  Rotation range & $\pm 33^\circ$ in-plane \\
  Flip & Horizontal flip\\
  Zoom range & $0.85$–$1.15$ in-plane \\
  Gaussian noise & $\mu = 0.0$, $\sigma = 0.015$\\ \hline
  DAE training and inference & \\ \hline
  Batch size & 8 \\
  Base channels & 128 \\
  UNet channel multipliers & $(1, 2, 4, 8, 8)$ \\
  Encoder channel multipliers & $(1, 2, 4, 8, 8, 8)$ \\
  Learning rate & $1 \times 10^{-4}$ \\
  Beta scheduler & linear \\
  Optimizer & Adam\\
  Semantic latent size (d) & 512 \\
  $T_{\text{Semantic}}$ & 250 \\
  $T_{\text{Stochastic}}$ & 100 \\ \hline
  Hide-and-seek & \\ \hline
  $p_{\text{healthy}}$ (Eq.~\eqref{eq:healthy_recon}) &  $10^{-4}$ \\
  Binarization threshold & mean \\
  $\tau$ cutoff & 0.5\\ \hline
   Lesion mask postprocessing & \\ \hline
   Crop & Vertebra body mask - 2 voxel erosion \\
   Filter & Masks $<$ 5 pixels
  \end{tabular}}
  \label{tab:hparams}
\end{table}

\newpage

\subsubsection{Baseline implementation}\label{appendix:baselines}

\paragraph{Otsu thresholding.} Lesion masks are obtained by applying thresholding using single- or multi-level thresholds depending on whether lytic, blastic, or mixed patterns are clinically expected. 
\paragraph{GradCAM baselines.} A ResNet-18 classifier is trained on the classifier subset dataset (AUC 0.94, F1 0.87). GradCAM, GradCAM++, LayerCAM, and EigenCAM \cite{selvaraju2017gradcam,chattopadhay2018gradcamplus,jiang2021layercam,BanyMuhammad2021EigenCAM} are generated with the PyTorch Grad-CAM library\footnote{\url{https://github.com/jacobgil/pytorch-grad-cam}} from the one-before-last convolutional layer, and heatmaps are binarized via Otsu thresholding. 
\paragraph{MedSAM.} Using the official \emph{vit\_b} checkpoint \cite{ma2024medsam}, we create a loose bounding-box prompt by dilating the vertebral mask, run MedSAM once per vertebra, and restrict predictions to the eroded corpus. 
\paragraph{Anomaly detection.} The DAE is trained only on healthy vertebrae in the full dataset. At test time, malignant slices are reconstructed as pseudo-healthy images, residual maps are computed, and thresholded differences are post-processed into pseudo-lesion masks.

\newpage

\subsection{Dataset details}

The dataset comprises 5,644 vertebrae from 440 patients (mean age 67.5\,±\,12.9 years; 211 female, 229 male). It spans vertebral levels T1 through L5 and is highly imbalanced, containing predominantly healthy vertebrae (4,972 healthy; 672 malignant). During preprocessing, SpineR provides for each vertebra both an anatomical level label (T1–L5) and a vertebral corpus mask used for downstream analysis. Fig.~\ref{fig:dataset_dist} summarizes the distribution of healthy and malignant vertebrae across thoracic and lumbar levels. Malignancies are more frequent in the lumbar spine. Fig.~\ref{fig:lesion_size_hist} shows the distribution of manual lesion sizes for lytic and blastic phenotypes. These reference annotations span a broad range of sizes, whereas the 5-pixel threshold indicated applies only to \emph{predicted components} during evaluation. Vertebrae contained on average 2.06 metastatic components, with up to 7 lesions in rare cases.

\begin{figure}[htbp]
\floatconts
  {fig:dataset_dist}
  {\caption{Distribution of healthy and malignant vertebrae across thoracic and lumbar levels.}}
  {\includegraphics[width=0.6\linewidth]{figures/vertebrae_distribution.png}}
\end{figure}

\begin{figure}[htbp]
\floatconts
  {fig:lesion_size_hist}%
  {\caption{Log-scale histogram of manual lesion sizes (in pixels) for lytic (red) and blastic (blue) lesions. The dashed vertical line marks the 5-pixel threshold used as the minimum \emph{predicted} lesion size in evaluation.}}%
  {\centering
   \includegraphics[width=0.6\linewidth]{figures/lesion_size_distribution.png}%
  }
\end{figure}

\newpage


\subsection{Additional results}\label{appendix:results}
  \subsubsection{Inter-rater agreement}\label{appendix:interrater}

Table~\ref{tab:interrater_metrics} summarizes detection and segmentation metrics for radiologist B relative to radiologist A. Agreement is high for blastic lesions, whereas lytic lesions show markedly lower consistency across all metrics. Table~\ref{tab:interrater_counts} provides the underlying lesion counts, indicating that B recovers nearly all blastic findings from A, but misses a substantial portion of A’s lytic annotations (25\% FN) and adds many additional lytic findings (45\% FP), reflecting the inherent difficulty and subjectivity of identifying lytic metastasis.


  \begin{table}[hbp]
\floatconts
  {tab:interrater_metrics}%
  {\caption{Quantitative inter-rater metrics (mean $\pm$ SD) for radiologist B, evaluated against radiologist A as reference. Higher is better for all metrics except ASSD and RVD.}}%
  {%
  %\resizebox{0.6\textwidth}{!}{%
  \begin{tabular}{lcc}
    \bfseries Metric 
      & \bfseries Blastic 
      & \bfseries Lytic \\ \hline

    Detection Precision ($\uparrow$) 
      & 0.86 $\pm$ 0.25 
      & 0.58 $\pm$ 0.42 \\

    Detection Recall ($\uparrow$) 
      & 0.90 $\pm$ 0.20 
      & 0.73 $\pm$ 0.33 \\

    Detection F1 (RQ, $\uparrow$) 
      & 0.84 $\pm$ 0.23 
      & 0.51 $\pm$ 0.40 \\

    Instance Dice (SQ, $\uparrow$) 
      & 0.81 $\pm$ 0.18 
      & 0.51 $\pm$ 0.38 \\

    Panoptic Dice (PQ$_D$, $\uparrow$) 
      & 0.70 $\pm$ 0.23 
      & 0.39 $\pm$ 0.34 \\

    Global Dice ($\uparrow$) 
      & 0.82 $\pm$ 0.16 
      & 0.49 $\pm$ 0.37 \\

    ASSD ($\downarrow$) 
      & 1.19 $\pm$ 0.91 
      & 1.00 $\pm$ 0.49 \\

    RVD ($\approx 0$) 
      & 0.22 $\pm$ 0.40 
      & 0.07 $\pm$ 0.33 \\
  \end{tabular}%}
  }
\end{table}

\begin{table}[htbp]
\floatconts
  {tab:interrater_counts}%
  {\caption{Lesion-level agreement counts between radiologists A (reference) and B. ``Matched'' indicates lesions independently annotated by both raters. FN and FP rates reflect B’s misses and additional findings, respectively.}}%
  {%
  %\resizebox{0.55\textwidth}{!}{%
  \begin{tabular}{lcc}
    \bfseries Metric & \bfseries Blastic & \bfseries Lytic \\ \hline
    Annotated lesions (A)              & 142 & 61 \\
    Annotated lesions (B)       & 156 & 83 \\
    Matched (A) lesions          & 138 & 46 \\
    Missed (A) lesions (FN)      & 4   & 15 \\
    FN rate (\%)                & 2.82\% & 24.59\% \\
    Extra (B) lesions (FP)& 18  & 37 \\
    FP rate (per B)     & 11.54\% & 44.58\% \\
  \end{tabular}%}
  }
\end{table}

\pagebreak

\subsubsection{Statistical significance analysis}\label{appendix:stat_significane}

To assess whether observed performance differences are robust beyond overlapping mean $\pm$ SD values, we performed paired nonparametric significance testing. For each metric and lesion phenotype, the proposed method was compared against the strongest baseline using a paired Wilcoxon signed-rank test. Effect sizes are reported as the median difference (Ours$-$Baseline) with 95\% confidence intervals estimated via nonparametric bootstrap resampling.

\begin{table}[htbp]
\floatconts
  {tab:stat_significance}%
  {\caption{Paired statistical comparison between the proposed method and the strongest baseline for each metric. Median performance differences (Ours--Baseline) are reported with 95\% bootstrap confidence intervals and paired Wilcoxon signed-rank test p-values. Negative (Ours--Baseline) indicates lower ASSD (better).}}%
  {
  \resizebox{\linewidth}{!}{%
  \begin{tabular}{lcccc}
    \bfseries Metric
      & \bfseries Best baseline
      & \bfseries Median (Ours--Baseline)
      & \bfseries 95\% CI
      & \bfseries $p$-value \\ \hline

    % ============================================================
    \multicolumn{5}{l}{\textit{Blastic lesions}} \\[2pt]

    Detection F1 (RQ)
      & GradCAM & 0.333 & [0.000, 0.333] & $7.9{\times}10^{-8}$ \\

    Instance Dice (SQ)
      & Otsu & 0.257 & [0.231, 0.292] & $<10^{-12}$ \\

    Panoptic Dice (PQ$_D$)
      & Otsu & 0.521 & [0.482, 0.577] & $<10^{-12}$ \\

    Global Dice
      & Otsu & 0.260 & [0.238, 0.289] & $<10^{-12}$ \\

    ASSD
      & Otsu & $-1.18$ & [$-1.37$, $-1.10$] & $<10^{-12}$ \\

    \hline
    % ============================================================
    \multicolumn{5}{l}{\textit{Lytic lesions}} \\[2pt]

    Detection F1 (RQ)
      & MedSAM & 0.333 & [0.225, 0.333] & $5.7{\times}10^{-4}$ \\

    Instance Dice (SQ)
      & Otsu & 0.479 & [0.304, 0.598] & $<10^{-5}$ \\

    Panoptic Dice (PQ$_D$)
      & Otsu & 0.712 & [0.507, 0.768] & $<10^{-5}$ \\

    Global Dice
      & Otsu & 0.360 & [0.315, 0.554] & $<10^{-5}$ \\

    ASSD
      & AD & $-2.05$ & [$-2.24$, $-1.22$] & $7.8{\times}10^{-3}$ \\

    \hline
    % ============================================================
    \multicolumn{5}{l}{\textit{Mixed lesions}} \\[2pt]

    Detection F1 (RQ)
      & GradCAM & 0.417 & [0.000, 0.500] & 0.25 \\

    Instance Dice (SQ)
      & GradCAM & 0.626 & [0.497, 0.865] & 0.13 \\

    Panoptic Dice (PQ$_D$)
      & GradCAM & 0.760 & [0.543, 0.932] & 0.13 \\

    Global Dice
      & GradCAM & 0.642 & [0.540, 0.780] & 0.13 \\

    ASSD
      & MedSAM & $-3.63$ & [$-5.75$, $-1.58$] & 0.13 \\

  \end{tabular}}}
\end{table}

\newpage


\subsubsection{Additional metrics and qualitative comparisons}\label{appendix:additional}

Table~\ref{tab:more_metrics} reports complementary detection and volume-based metrics for all baselines and our method. Figure~\ref{fig:qualitative_more} provides additional qualitative examples.

  \begin{table}[htbp]
\floatconts
  {tab:more_metrics}%
  {\caption{Detection precision, detection recall, and relative volume difference (RVD; close to zero is better) for all baselines and our method (mean ± SD). Best scores per metric are in bold.}}%
  {
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{l cccccccc}
    \bfseries Metric 
      & \bfseries Otsu 
      & \bfseries GradCAM 
      & \bfseries GradCAM++ 
      & \bfseries LayerCAM 
      & \bfseries EigenCAM
      & \bfseries MedSAM
      & \bfseries AD
      & \bfseries \textbf{Ours} \\ \hline

    % ============================================================
    \multicolumn{9}{l}{\textit{Blastic lesions}} \\[2pt]

   Detection Precision $\uparrow$           
      & 0.59 $\pm$ 0.29 
      & 0.94 $\pm$ 0.16 
      & \textbf{0.95 $\pm$ 0.21}
      & 0.84 $\pm$ 0.26 
      & 0.87 $\pm$ 0.33
      & 0.73 $\pm$ 0.34
      & 0.74 $\pm$ 0.30
      & 0.92 $\pm$ 0.19 \\

    Detection Recall $\uparrow$
      & \textbf{0.94 $\pm$ 0.14} 
      & 0.76 $\pm$ 0.30 
      & 0.72 $\pm$ 0.34 
      & 0.78 $\pm$ 0.31 
      & 0.62 $\pm$ 0.38
      & 0.72 $\pm$ 0.32
      & 0.85 $\pm$ 0.27
      & 0.93 $\pm$ 0.17 \\

    RVD $\approx 0$                
      & $-$0.18 $\pm$ 0.21 
      & 0.37 $\pm$ 0.96 
      & 0.80 $\pm$ 1.89 
      & 0.19 $\pm$ 2.48 
      & 0.59 $\pm$ 1.50
      & 0.84 $\pm$ 1.80
      & 0.12 $\pm$ 0.84
      & \textbf{0.05 $\pm$ 0.41} \\

    \hline
    % ============================================================
    \multicolumn{9}{l}{\textit{Lytic lesions}} \\[2pt]

    Detection Precision $\uparrow$           
      & 0.67 $\pm$ 0.28 
      & 0.75 $\pm$ 0.36 
      & \textbf{0.91 $\pm$ 0.26} 
      & 0.72 $\pm$ 0.30 
      & 0.81 $\pm$ 0.39
      & 0.71 $\pm$ 0.26
      & 0.60 $\pm$ 0.39
      & 0.85 $\pm$ 0.25 \\

    Detection Recall $\uparrow$
      & 0.73 $\pm$ 0.29 
      & 0.54 $\pm$ 0.38 
      & 0.44 $\pm$ 0.35 
      & 0.52 $\pm$ 0.34 
      & 0.34 $\pm$ 0.34
      & 0.75 $\pm$ 0.30
      & 0.37 $\pm$ 0.43
      & \textbf{0.89 $\pm$ 0.19} \\

 

    RVD $\approx 0$                
      & 1.63 $\pm$ 2.28 
      & 2.27 $\pm$ 4.42 
      & 4.68 $\pm$ 4.51 
      & 0.89 $\pm$ 1.01 
      & 2.68 $\pm$ 1.85
      & 1.34 $\pm$ 2.49
      & $-$0.43 $\pm$ 0.20
      & \textbf{$-$0.22 $\pm$ 0.28} \\

    \hline
    % ============================================================
    \multicolumn{9}{l}{\textit{Mixed lesions}} \\[2pt]

    Detection Precision $\uparrow$    %       
      & 0.41 $\pm$ 0.05 
      & \textbf{1.00 $\pm$ 0.00} 
      & 0.82 $\pm$ 0.38 
      & 0.89 $\pm$ 0.31 
      & \textbf{1.00 $\pm$ 0.00}
     & 0.69 $\pm$ 0.33
      & 0.56 $\pm$ 0.06
      & 0.68 $\pm$ 0.14 \\

    Detection Recall $\uparrow$
      & \textbf{0.81 $\pm$ 0.06} 
      & 0.63 $\pm$ 0.28 
      & 0.49 $\pm$ 0.34 
      & 0.59 $\pm$ 0.34 
      & 0.63 $\pm$ 0.28
      & 0.72 $\pm$ 0.30
      & 0.78 $\pm$ 0.22
      & 0.75 $\pm$ 0.12 \\

    RVD $\approx 0$                
      & 3.33 $\pm$ 3.85 
      & 1.90 $\pm$ 3.03 
      & 1.11 $\pm$ 2.13 
  & 0.91 $\pm$ 1.92 
      & 2.03 $\pm$ 3.14
      & 0.59 $\pm$ 1.54
      & \textbf{0.33 $\pm$ 0.21}
      & 0.51 $\pm$ 0.66 \\

  \end{tabular}}}
\end{table}


\begin{figure}[h]
  \centering
  \includegraphics[width=0.95\linewidth]{figures/panel_8rows_all_methods_appendix.png}
  \caption{Qualitative comparison on eight vertebral CT slices (rows). Columns show the input image, manual ground truth, Otsu, CAM-based baselines, MedSAM, anomaly detection (AD), and our method. 
  }
  \label{fig:qualitative_more}
\end{figure}

\newpage

\subsubsection{Lesion-wise Delta-score Analysis}\label{appendix:delta}

Figure~\ref{fig:delta_vs_size} (left) shows precision–recall curves of the $\Delta$-score stratified by phenotype. Blastic lesions exhibit strong separability, whereas lytic lesions produce a much noisier curve, consistent with the phenotype-specific results in the main text.

The scatter plots in Figure~\ref{fig:delta_vs_size} (middle/right) relate lesion size to $\Delta$ on a per-lesion basis. Blastic true positives form a clear cluster of large lesions with high $\Delta$, while blastic false positives are mostly small with lower scores. Lytic lesions show greater overlap, but true positives still tend to achieve higher $\Delta$ than false positives.

Table~\ref{tab:size_analysis} further summarizes lesion sizes and the original vertebra-level malignancy probability computed before any edits. Blastic vertebrae show substantially higher malignant probabilities than lytic vertebrae, indicating stronger global evidence available to the classifier. Lytic false positives are also extremely small on average, mirroring the scatter plots and the noisier precision–recall behavior. Together, these findings illustrate that $\Delta$ provides a meaningful lesion-level signal for both phenotypes, with stronger separation in blastic disease.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.3\linewidth]{figures/lesion_pr_curve_ours.png}
\includegraphics[width=0.3\linewidth]{figures/blastic_delta_vs_size.png}
\includegraphics[width=0.3\linewidth]{figures/lytic_delta_vs_size.png}
\caption{Left: precision–recall curves of the $\Delta$-score for blastic and lytic lesions. Middle/right: lesion size versus $\Delta$ for blastic and lytic lesions (TPs blue points, FPs orange x's). Blastic lesions show clearer separation between true and false positives, while lytic lesions exhibit greater overlap.}
\label{fig:delta_vs_size}
\end{figure}


\begin{table}[htbp]
\floatconts
  {tab:size_analysis}
  {\caption{Lesion sizes (mean ± SD) and original vertebra-level malignancy probabilities, computed before any Hide-and-Seek editing. Blastic vertebrae show higher malignant probabilities, indicating stronger global classifier evidence compared to lytic vertebrae.}}
  {%
  %\resizebox{0.6\textwidth}{!}{%
  \begin{tabular}{lcccc}
    {\bfseries Metric} 
      & {\bfseries Blastic TP} 
      & {\bfseries Blastic FP} 
      & {\bfseries Lytic TP} 
      & {\bfseries Lytic FP} \\ \hline

    $p_{\text{orig}}$ 
      & 0.932 $\pm$ 0.112 
      & 0.853 $\pm$ 0.158 
      & 0.777 $\pm$ 0.214 
      & 0.728 $\pm$ 0.252 \\

    Size (pixels) 
      & 225.9 $\pm$ 181.8 
      & 23.7 $\pm$ 29.6 
      & 52.4 $\pm$ 48.2 
      & 14.3 $\pm$ 16.0 \\

  \end{tabular}
  }%}
\end{table}

\newpage

\subsection{Ablation studies} \label{appendix:ablations}
\subsubsection{Reconstruction and Edit Quality}
\label{appendix:recon}

To assess whether the DAE preserves vertebral anatomy and produces plausible healthy edits, we report reconstruction and edit similarity metrics in Table~\ref{tab:dae_recon_quality} and provide qualitative examples in Fig.~\ref{fig:recon}. The DAE achieves high fidelity when reconstructing the original, and healthy edits remain structurally consistent with the input while removing malignancy.

\begin{table}[htbp]
\floatconts
  {tab:dae_recon_quality}%
  {\caption{Reconstruction and healthy–edit similarity.  
  Lower LPIPS and higher SSIM indicate greater structural fidelity.}}%
  {\begin{tabular}{l cc}
    \bfseries Setting & \bfseries LPIPS $\downarrow$ & \bfseries SSIM $\uparrow$ \\ \hline
    Reconstruction (orig $\leftrightarrow$ recon) 
      & 0.0675 $\pm$ 0.0319 
      & 0.9731 $\pm$ 0.0123 \\
    Healthy edit (orig $\leftrightarrow$ edit) 
      & 0.1197 $\pm$ 0.0432 
      & 0.8336 $\pm$ 0.0641 \\
  \end{tabular}}
\end{table}

\begin{figure}[htbp]
\floatconts
  {fig:recon}
  {\caption{Qualitative examples of DAE behavior.  
  From left to right: original slice, DAE reconstruction, and healthy edit produced by removing the malignant lesion.}}%
  {\includegraphics[width=0.8\linewidth]{figures/reconstruction_quality.png}}
\end{figure}

\subsubsection{Initial lesion candidate generation}
\label{appendix:initial}

Per-image mean binarization of the difference maps is used to derive initial lesion candidates. This choice is motivated by the substantial variability in difference-map intensities across samples (mean 0.064 ± 0.047; range: 0.001–0.265). To verify that this design choice is not a source of systematic false negatives, we evaluated lesion coverage, defined as whether any initial candidate overlaps a ground-truth lesion, and observed similar coverage when using alternative heuristics such as median- and Otsu-based binarization (mean: 0.90, median: 0.88, Otsu: 0.89), indicating that missed lesions are not attributable to the specific binarization criterion.

\newpage

\subsubsection{Delta-score analysis}\label{appendix:delta_vs_prob}

To assess whether Hide-and-Seek Attribution is necessary, we evaluated a variant that uses the initial difference-map-based masks directly, without candidate isolation or $\Delta$-score filtering. As shown in Table~\ref{tab:ablation_no_hide_seek}, this variant performs substantially worse than the full method, with clear drops in instance-level Dice, panoptic Dice, and global Dice, and higher surface distances, particularly for lytic lesions. This demonstrates that validating candidates in isolation is essential.

Because the $\Delta$-score is used as a lesion decision criterion, we further analyze its behavior across decision thresholds. As reflected by the ROC curves (Fig.~\ref{fig:delta_vs_prob_roc}, blue and orange), performance varies smoothly, without evidence of a narrow or unstable threshold region.

Finally, we compared the $\Delta$-score with the classifier's raw malignancy probability for distinguishing true from false positives. As shown in Fig.~\ref{fig:delta_vs_prob_roc}, performance is similar for blastic lesions (ROC--AUC 0.93 vs.\ 0.90) but clearly improved for lytic lesions (0.62 vs.\ 0.50), with higher true-positive rates at comparable false-positive levels. This demonstrates that normalizing each lesion's predicted probability by the malignancy of the original unedited image yields a more informative lesion-level signal than using the classifier probability alone.

\begin{figure}[htbp]
\floatconts
  {fig:delta_vs_prob_roc}
  {\caption{ROC curves comparing the $\Delta$-score and classifier probability for separating true- and false-positive regions. Results are shown for blastic and lytic lesions vs.\ chance.}}%
  {\includegraphics[width=0.4\linewidth]{figures/roc_curves_delta_vs_prob.png}}
\end{figure}

\begin{table}[htbp]
\floatconts
  {tab:ablation_no_hide_seek}%
  {\caption{Ablation without Hide-and-Seek Attribution.  
  Performance when using the initial predicted masks directly, without candidate isolation and $\Delta$-score filtering. Results are reported as mean $\pm$ std.}}%
  {\resizebox{\textwidth}{!}{%
  \begin{tabular}{l ccccc}
    \bfseries Lesion type 
    & \bfseries RQ (F1) $\uparrow$ 
    & \bfseries SQ (Dice) $\uparrow$ 
    & \bfseries PQ$_{\text{D}}$ $\uparrow$ 
    & \bfseries ASSD $\downarrow$ 
    & \bfseries Global Dice $\uparrow$ \\ \hline
    Blastic lesions
      & 0.82 $\pm$ 0.20
      & 0.73 $\pm$ 0.13
      & 0.60 $\pm$ 0.19
      & 1.77 $\pm$ 1.09
      & 0.76 $\pm$ 0.10 \\
    Lytic lesions
      & 0.69 $\pm$ 0.20
      & 0.56 $\pm$ 0.15
      & 0.38 $\pm$ 0.14
      & 1.95 $\pm$ 0.97
      & 0.50 $\pm$ 0.14 \\
  \end{tabular}}}
\end{table}

\pagebreak

\subsubsection{Lesion interactions}\label{appendix:interactions}

\begin{figure}[htbp]
\floatconts
  {fig:interactions}
  {\caption{Distribution of the pairwise joint-effect statistic $S_{ij}$ over candidate pairs in multi-lesion vertebrae.}}%
  {\includegraphics[width=0.6\linewidth]{figures/Sij_hist_multilesion.png}}
\end{figure}

To assess potential joint lesion effects, we analyzed pairwise combinations of predicted lesions in multi-lesion vertebrae. Let $\Delta_i$ and $\Delta_j$ denote the individual malignancy scores of candidates $M_i$ and $M_j$, and $\Delta_{ij}$ the score obtained when evaluating their union. We quantify the additional contribution obtained when evaluating the two jointly, beyond the stronger candidate alone, as
\begin{align}
S_{ij} = \max\!\bigl(0,\ \Delta_{ij} - \max(\Delta_i,\Delta_j)\bigr).
\end{align}
Large values of $S_{ij}$ would indicate lesions that become malignant only through joint presence and could therefore be missed by one-at-a-time evaluation.

For computational tractability, we restricted the analysis to the top $K=4$ candidates per vertebra ranked by $\Delta$-score (90\% of vertebrae had at most four annotated lesions). Fig.~\ref{fig:interactions} shows the distribution of $S_{ij}$ over lesion pairs in multi-lesion vertebrae. The values are concentrated near zero (mean $0.025$), with the 90th and 95th percentiles at $0.065$ and $0.090$, respectively, indicating that, for the purpose of candidate selection, malignancy is largely captured by individual lesions rather than by strong joint interactions.

\newpage

\subsubsection{Projection of occluded images}\label{appendix:projection}

Fig.~\ref{fig:harmonization_examples} shows examples of the original slice, the occluded image, and the corresponding projected image. Direct occlusion creates sharp masking artifacts that are visually unnatural. The projection step restores an anatomically plausible appearance, producing edits that resemble real CT data.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.9\linewidth]{figures/harmonization_1.png}\\[-0.5em]
\includegraphics[width=0.9\linewidth]{figures/harmonization_2.png}
\caption{Examples illustrating the effect of projecting occluded images back onto the data manifold. Projection removes masking artifacts and restores plausible structure.}
\label{fig:harmonization_examples}
\end{figure}

\end{document}

