% Requires: llncs.cls and splncs04.bst from Springer (https://www.springer.com/gp/computer-science/lncs/conference-proceedings-guidelines).
\documentclass[runningheads]{llncs}

\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{amsmath,amssymb}
\usepackage{booktabs}
\usepackage{xspace}
\usepackage[hidelinks]{hyperref}
\usepackage{color}
\usepackage{marvosym}

% small macros
\newcommand{\Ea}{E_{a}}
\newcommand{\Es}{E_{s}}
\newcommand{\Vc}{\mathbf{c}}
\newcommand{\Vsv}{\mathbf{v}_{s}}
\newcommand{\swap}{\mathrm{swap}}

\title{Disentangled Retinal Fundus Synthesis with Non-Circular Vessel-Topology Evaluation}
\titlerunning{Non-Circular Disentanglement Evaluation for Fundus Synthesis}

\author{Syed Abdullah Basit\textsuperscript{(\Letter)} \and Tanvir Alam}
\authorrunning{S. A. Basit and T. Alam}
\institute{College of Science and Engineering, Hamad Bin Khalifa University, Doha, Qatar\\
\email{syba62904@hbku.edu.qa}}

\begin{document}
\maketitle

\begin{abstract}
Self-supervised disentanglement of vessel anatomy from appearance is commonly validated by an encoder-projection cosine between source and swap. \textbf{This validation is circular}: it measures invariance of the very encoder used to generate the swap, not preservation of anatomy. A totally vessel-destroyed image (Gaussian blur, $\sigma{=}8$) still reads as $0.92$ ``anatomy retention'' while a non-circular vessel-topology metric --- clDice via an independently-trained FIVES segmenter --- collapses to $0.003$, below an unrelated-pair floor of $0.11$; ablations move the cosine by $+0.08$ and $-0.15$ while clDice stays flat. We propose segmenter-based clDice, read against an unrelated-pair floor, as the evaluation protocol for vessel-bearing medical image synthesis. Applying it to a latent-diffusion testbed, a vessel-mask ControlNet, and a MUNIT-style GAN shows that conclusions drawn under it depend on controls that are easy to omit: a clDice margin inverts once resolutions are matched, since resolution dominates the metric, and the mask baseline exhibits a \emph{second} circularity mode --- it is conditioned on the evaluating segmenter's own output. Non-circularity is therefore necessary but not sufficient: a topology metric must be paired with matched resolution and compute, an audit of conditioning--evaluator independence, and complementary evidence beyond a single scalar.

\keywords{Evaluation methodology \and Disentangled representation learning \and Retinal vessel topology \and Latent diffusion \and Metric circularity.}
\end{abstract}

%==============================================================================
\section{Introduction}
\label{sec:intro}
%==============================================================================

Retinal fundus images underpin non-invasive screening for diabetic retinopathy, glaucoma, and hypertensive disease, where automated assessment hinges on faithful preservation of the \emph{vessel tree}~\cite{cite-vessel-clinical}. Synthesising fundus images that preserve a patient's vessel topology while varying illumination, colour, and capture artefacts serves data augmentation, privacy-preserving sharing, and controlled studies of acquisition shift~\cite{cite-fundus-synth-survey}. This requires \emph{disentanglement}: separately exposing vessel anatomy and appearance so that one transfers without disturbing the other. Throughout, ``anatomy'' denotes vessel topology specifically, not retinal anatomy in general; §\ref{sec:disc} quantifies where that scope falls short (optic disc).

A natural self-supervised approach learns an anatomy encoder $\Ea$ and an appearance encoder $\Es$ jointly, with an invariance objective on $\Ea$ (e.g., NT-Xent contrastive~\cite{cite-simclr}) and a variance objective on $\Es$ (e.g., VICReg~\cite{cite-vicreg}); the obvious sanity check for anatomy preservation under the swap is then the cosine $\cos(\Ea(A), \Ea(\swap))$ between the same encoder's own embeddings, which we term ``anatomy retention''. \textbf{We show that this check is circular.} The swap is generated conditional on $\Ea(A)$, and $\Ea$ is trained to be invariant to view augmentation, so the cosine measures the encoder's invariance --- a property it was optimised to satisfy --- not whether the swap's vessel tree matches $A$'s. Gaussian-blurring at $\sigma{=}8$ destroys the vessel tree (clDice $0.003$, below the $0.11$ floor) yet the cosine still reports $0.924$, monotonically blind to vessel destruction.

We therefore replace it with a \textbf{non-circular} protocol: clDice~\cite{cite-cldice}, the centerline-Dice topology metric, computed on vessel masks produced by an \emph{independently-trained} U-Net segmenter (FIVES~\cite{cite-fives}), and always reported against an unrelated-pair floor so preservation reads as a margin over chance. The segmenter is never exposed to the generative model, so the protocol measures vessel preservation without invoking the encoder it was designed to evaluate. To exercise it we build a self-supervised latent-diffusion testbed (§\ref{sec:framework}) and compare it against a vessel-mask ControlNet and a MUNIT-style GAN.

\paragraph{Contributions.} \textbf{(i)} We diagnose encoder-cosine circularity along two orthogonal axes: the blur sweep above, and component ablations in which the cosine \emph{rises} ($+0.08$) when the vessel-fidelity loss is removed and \emph{falls} ($-0.15$) when contrastive invariance is removed, while clDice stays flat. \textbf{(ii)} We propose the non-circular protocol above and validate its stability under a second, architecturally-independent segmenter. \textbf{(iii)} We show non-circularity is necessary but not sufficient: a clDice margin inverts once resolutions are matched, and the mask baseline exhibits \emph{conditioning--evaluator coupling}, a second circularity mode a non-circular metric does not remove. We give a checklist --- match resolution and compute, audit conditioning--evaluator independence, report complementary non-topology evidence.

%==============================================================================
\section{Related Work}
\label{sec:relwork}
%==============================================================================

\paragraph{Disentangled image-to-image translation.} A line of work decomposes an image into content and style codes with separate encoders, recombining them for translation; MUNIT~\cite{cite-munit}, DRIT++~\cite{cite-dritpp}, and StarGAN-v2~\cite{cite-stargan2} are prototypical. These papers evaluate primarily with distributional and perceptual scores (FID, LPIPS diversity) rather than a source-to-swap encoder cosine. Our target is the cosine check itself --- the content-preservation diagnostic any self-supervised content/style encoder pair invites, since source and swap already live in the embedding space used to condition the swap --- which to our knowledge has not been stress-tested against ground-truth structure. Biases of disentanglement metrics more broadly have been studied~\cite{cite-csbias}; our critique targets this self-referential cosine wherever it validates an encoder-conditioned swap, medical or otherwise.

\paragraph{Medical synthesis, spatial conditioning, and topology metrics.} Retinal generation preserving clinical content has used vessel-conditioned GANs~\cite{cite-vessel-cond-gan}, cycle-consistent translation~\cite{cite-cyclegan-medical}, and diffusion models~\cite{cite-medical-diffusion}, mostly evaluated with pixel-wise or perceptual losses plus mask-based comparisons. None systematically addresses the circularity that arises when the disentanglement encoder is itself the evaluator --- nor, as we show in §\ref{sec:baseline}, the related failure when a model's conditioning signal comes from the evaluator. ControlNet~\cite{cite-controlnet} injects spatial conditioning into a frozen UNet via zero-initialised residuals; our multi-scale anatomy injector follows this principle, and a vessel-mask ControlNet variant serves as a comparison system. clDice~\cite{cite-cldice} is a centerline-Dice variant for tubular structures; earlier work pairs it with topology-aware losses~\cite{cite-topo-loss}, whereas we use it strictly for evaluation, via a segmenter independent of the generative model.

%==============================================================================
\section{Method}
%==============================================================================

We first summarise the synthesis testbed used to exercise the protocol (§\ref{sec:framework}), then state the protocol itself (§\ref{sec:protocol}).

\subsection{Synthesis Testbed: Anatomy-Style Disentangled Latent Diffusion}
\label{sec:framework}

The testbed exercises the protocol rather than being a contribution in itself; its full configuration is in the supplementary material.

\paragraph{Setup and encoders.} Inputs are RGB fundus images $x$ normalised to $[-1,1]$, $H{=}W{=}512$. A frozen Stable Diffusion VAE~\cite{cite-ldm} maps $x \mapsto z = \mathrm{VAE_{enc}}(x)\cdot s$, $s{=}0.18215$, giving the diffusion target $z \in \mathbb{R}^{4 \times H/8 \times W/8}$. Two trainable encoders consume $x$: an anatomy encoder $\Ea$ producing a spatial map $\mathbf{m}_a \in \mathbb{R}^{32 \times h \times w}$ and a projection $\mathbf{p}_a$ (used only for contrastive training), and an appearance encoder $\Es$ producing $\Vsv \in \mathbb{R}^{768}$ plus its projection. A $1{\times}1$ convolution bottleneck with InstanceNorm, $\mathrm{Bot}: \mathbb{R}^{32}\!\to\!\mathbb{R}^{8}$, gives a vessel-rich low-channel code $\Vc = \mathrm{Bot}(\mathbf{m}_a)$ at latent resolution; its narrowness matters for the comparison in §\ref{sec:munit}.

\begin{figure}[t]
\centering
\includegraphics[width=1\linewidth]{figs/schematic_v5_short.png}
\caption{Testbed conditioning pipeline. Encoders $\Ea,\Es$ and a frozen VAE consume $x$; anatomy ($\Vc$, paths (i)--(ii)) and appearance ($\Vsv$, paths (iii)--(iv)) reach the UNet through four routes (§\ref{sec:framework}).}
\label{fig:schematic}
\end{figure}

\paragraph{UNet and conditioning.} A UNet $\epsilon_\theta$ predicts the noise added to $z_t$. Conditioning enters through four paths (Figure~\ref{fig:schematic}): anatomy by \emph{input-concat} of $\Vc$ with $z_t$ and by zero-initialised \emph{multi-scale ControlNet residuals} at every UNet scale; style by \emph{cross-attention} on $\Vsv$ expanded to $N{=}4$ tokens and by \emph{AdaGN} in every ResNet block. Anatomy and style conditioning are dropped independently during training, so three UNet passes combine at inference into compositional classifier-free guidance on anatomy ($\alpha_a$) and style ($\alpha_s$) separately (supplementary).

\paragraph{Training objectives.} The loss combines (a) diffusion MSE; (b) anatomy contrastive NT-Xent~\cite{cite-simclr} on $\mathbf{p}_a$ across two augmented views; (c) appearance VICReg~\cite{cite-vicreg} on $\Vsv$; (d) style reconstruction --- a low-timestep $\hat z_0$ prediction, VAE-decoded and re-encoded by $\Es$, with cosine loss against a swapped reference style, forcing $\epsilon_\theta$ to use $\Vsv$ rather than the anatomy path; (e) vesselness fidelity --- a small decoder predicts a vesselness mask from $\Vc$, trained against the thresholded multiscale-Hessian vesselness~\cite{cite-frangi} of $x$'s green channel. Term (e) is the only loss explicitly requiring the anatomy stream to encode vessels; ablating it (§\ref{sec:ablations}) drives the encoder cosine \emph{up} while clDice stays flat. Loss weights and optimisation are in the supplementary material.

\subsection{Non-Circular Vessel-Topology Protocol}
\label{sec:protocol}

\paragraph{The circular metric.} Let $\swap(A,B)$ denote $\epsilon_\theta$'s sampled output conditioned on $\Vc_A = \mathrm{Bot}(\Ea(A))$ and $\mathbf{v}_{s,B} = \Es(B)$. The encoder-cosine ``anatomy retention'' reads $R_a(A,B) = \cos(\Ea(A), \Ea(\swap(A,B)))$. Here $\Ea$ is the same encoder that produced the conditioning, and it is trained for invariance to view augmentation; the cosine therefore measures how well that invariance generalises across the generative model's image manifold, not whether the swap's vessel tree matches $A$'s.

\paragraph{The non-circular replacement.} Let $\mathrm{Seg}: x \mapsto m \in \{0,1\}^{1\times H \times W}$ be a binary vessel segmenter --- a U-Net with a ResNet-34 encoder --- trained on the FIVES dataset~\cite{cite-fives} \emph{prior to and independently of} the generative model, then frozen. We report vessel topology preservation as
\begin{equation}
\mathrm{clDice}\big(\mathrm{Seg}(\swap(A,B)),\,\mathrm{Seg}(A)\big),
\end{equation}
where clDice is the harmonic mean of skeleton-versus-mask precision and sensitivity~\cite{cite-cldice}. The segmenter--metric pair has the two properties the protocol needs: \emph{independence} --- the segmenter was never optimised jointly with $\epsilon_\theta$, $\Ea$, or $\Es$; and \emph{topology sensitivity} --- clDice penalises connectivity errors more strongly than dense Dice. We additionally report the unrelated-pair floor $\mathrm{clDice}(\mathrm{Seg}(B),\mathrm{Seg}(A))$, so preservation reads as a margin over chance.

\paragraph{Two circularity modes.} Write $C$ for a model's conditioning signal, $M$ for the evaluator. \emph{(i) Evaluator--generator coupling}: $M = \Ea$, the encoder that produced $C$ --- the encoder cosine, removed by choosing an independent segmenter. \emph{(ii) Conditioning--evaluator coupling}: $C = M(A)$ --- the model is conditioned on the evaluator's own output, so $M$ grades a model built from its own predictions, right or not. A non-circular $M$ does \emph{nothing} about (ii): it relocates the circularity to the conditioning path. Any mask-conditioned model scored by its own segmenter exhibits it, including the ControlNet of §\ref{sec:baseline}, so auditing $C \perp M$ is part of the protocol.

%==============================================================================
\section{Experiments}
%==============================================================================

\subsection{Setup}
\label{sec:setup}

\paragraph{Data and segmenters.} We use $41{,}390$ fundus images from two public DR-screening corpora, Kaggle EyePACS~\cite{cite-eyepacs} and APTOS 2019~\cite{cite-aptos}, quality-filtered by a Laplacian-variance gate and resized to $512^2$. The evaluating segmenter is a U-Net with a ResNet-34 encoder (segmentation-models-pytorch~\cite{cite-smp}) trained on FIVES's $800$ ground-truth pairs~\cite{cite-fives} to validation Dice $>0.7$, then frozen; \emph{Seg2} is a ResNet-18 counterpart (test Dice $0.889$; supplementary).

\paragraph{Training and sampling.} AdamW at $10^{-4}$ peak LR, bf16, on one RTX 3090 ($24$\,GB); a $512^2$ run takes $\sim$30\,h and each $256^2$ ablation $\sim$6\,h. The $512^2$ mask baseline (§\ref{sec:baseline}) is warm-started from its $256^2$ checkpoint and fine-tuned at $3\times10^{-5}$, matching our model's resolution and sampler. Loss weights, parameter groups, and schedule are in the supplementary material. Sampling uses DDIM $20$ steps with $\alpha_s{=}1.0$, $\alpha_a{=}3.0$ unless stated otherwise.

\paragraph{Reported quantities.} Each cell in Table~\ref{tab:ablation} is a mean over $N{=}300$ paired swaps (seed $42$, identical pair indices across rows). PSNR/SSIM are A$\to$A self-reconstruction; ``Style'' is $\cos(\Es(B),\Es(\swap))$ and ``EncRet'' the circular $\cos(\Ea(A),\Ea(\swap))$ we critique, both on the encoders' contrastive projection heads --- the spaces trained for invariance, which is what makes EncRet circular.

\subsection{The Encoder Cosine Is Blind to Vessel Destruction}
\label{sec:blur}

To show the encoder cosine does not measure what its name implies, we construct a controlled degradation \emph{outside the generative model}: Gaussian-blur the input with $\sigma \in [0,16]$ and measure both segmenter clDice and encoder cosine between original and blurred. The blur eliminates fine vessels while preserving the coarse layout the encoder is invariant to, so any valid ``anatomy retention'' metric must decline with clDice.

At $\sigma{=}8$ the segmenter-measured topology has collapsed to clDice $0.003$, below the $0.11$ unrelated-pair floor, yet the encoder cosine still reports $0.924$; even at $\sigma{=}16$ (a smooth gradient) it reports $0.78$. Across the sweep clDice falls from $0.999$ to $0.0003$ while the cosine never drops below $0.78$ ($N{=}40$; plotted in the supplementary material). The two decouple monotonically: the encoder cosine cannot serve as a vessel-preservation metric because, by design, it does not see vessels at the scale where vessels live.

\subsection{Disentanglement Quality and Ablations}
\label{sec:ablations}

Table~\ref{tab:ablation} shows our main result ($512^2$, top row) with a from-scratch $256^2$ reference and three component ablations. The vessel-mask ControlNet baseline appears at both resolutions and is analysed in §\ref{sec:baseline}, where the resolution-matched comparison reverses the ranking we reported at $256^2$.

\begin{table}[t]
\centering
\caption{Evaluation over $N{=}300$ paired swaps (seed $42$, identical pairs across rows). \emph{CI}: bootstrap 95\% interval on mean clDice. \emph{Seg2}: a second, architecturally-independent segmenter (ResNet-18), tracking clDice on every row. \emph{fl}: unrelated-pair floor. \emph{CDist}: non-encoder colour-histogram distance to $B$ (``ColorDist'' in the text; lower better). \emph{OD}: classical-CV optic-disc overlap (floor $.094$/$.105$). \emph{Style}/\emph{EncRet}: the circular encoder cosines we critique (EncRet is undefined for the mask baselines, which have no learned anatomy encoder). Bold = best per column; the testbed leads only on PSNR and OD.}
\label{tab:ablation}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lcccccccccc}
\toprule
Variant & clDice $\uparrow$ & 95\% CI & Seg2 $\uparrow$ & Dice $\uparrow$ & fl & SSIM $\uparrow$ & PSNR $\uparrow$ & CDist $\downarrow$ & OD $\uparrow$ & Style/EncRet \\
\midrule
\textbf{Ours, full ($512^2$)}    & 0.559 & [.548,.569] & 0.554 & 0.591 & .109 & 0.615 & \textbf{20.43} & 0.067 & \textbf{0.476} & 0.877 / 0.830 \\
Ours, full ($256^2$)             & 0.314 & [.307,.321] & 0.308 & 0.396 & .112 & 0.616 & 22.00 & \textbf{0.060} & 0.664 & 0.783 / 0.818 \\
$-$\,multi-scale residuals        & 0.293 & [.285,.300] & 0.292 & 0.368 & .112 & 0.603 & 21.43 & 0.061 & 0.661 & 0.821 / 0.824 \\
$-$\,vesselness fidelity loss     & 0.342 & [.335,.349] & 0.338 & 0.417 & .112 & \textbf{0.622} & 22.43 & 0.065 & 0.706 & 0.649 / \textbf{0.899} \\
$-$\,contrastive                  & 0.338 & [.331,.345] & 0.323 & 0.418 & .112 & 0.594 & 21.02 & 0.065 & 0.633 & 0.739 / \textbf{0.664} \\
Mask ControlNet baseline ($256^2$) & 0.239 & [.235,.243] & 0.241 & 0.289 & .112 & 0.536 & 16.04 & 0.092 & 0.236 & 0.663 / --- \\
Mask ControlNet baseline ($512^2$) & \textbf{0.582} & [.577,.588] & \textbf{0.579} & \textbf{0.617} & .109 & 0.608 & 16.99 & 0.066 & 0.193 & \textbf{0.905} / --- \\
\bottomrule
\end{tabular}}
\end{table}

Resolution dominates vessel topology: clDice rises from $0.314$ at $256^2$ to $0.559$ at $512^2$ (§\ref{sec:resconfound} isolates resolution itself from rendering quality). The remaining components shift clDice by at most $\pm0.03$ --- yet the encoder cosine moves substantially and \emph{in opposite directions}. Removing the vesselness fidelity loss ($\lambda_\mathrm{struct}{=}0$) leaves clDice flat ($0.314\!\to\!0.342$) while the cosine \emph{rises} to $0.899$: with no structural pull on the bottleneck, $\Ea$ becomes more invariance-dominated, exactly what the cosine rewards. Removing contrastive invariance ($\lambda_\mathrm{contrastive}{=}0$) again leaves clDice flat ($0.314\!\to\!0.338$) while the cosine \emph{falls} to $0.664$: $\Ea$ is no longer optimised for invariance, so its self-cosine drifts toward the inter-image baseline.

The cosine thus swings $+0.08$ and $-0.15$ while clDice barely moves, with the ordering unchanged under \emph{Seg2} --- a controlled demonstration of circularity independent of §\ref{sec:blur}'s degraded-input one. Secondarily, multi-scale residual injection adds $+0.02$ clDice, and style transfer drops when either the vesselness ($0.78\!\to\!0.65$) or contrastive ($0.78\!\to\!0.74$) loss is removed: both aid disentanglement \emph{quality} even where neither is needed for topology.

\subsection{Resolution Confound and Distributional Quality}
\label{sec:resconfound}

Since resolution dominates the clDice gain (§\ref{sec:ablations}), it is unclear whether $256^2{\to}512^2$ improves vessel \emph{rendering} or merely the segmenter's sensitivity to sharper input. We separate the two on real images alone, with no generative model involved: for $N{=}200$ real images, $\mathrm{clDice}(\mathrm{Seg}(x), \mathrm{Seg}(\downarrow_{256}\!\uparrow_{512}(x)))$ is the score a \emph{perfectly preserved} vessel tree would receive after a $256$px bottleneck. This ceiling is $0.909$, far above both generative numbers ($0.314$, $0.559$): the segmenter tolerates $256$px inputs, so the gap reflects rendering quality, not segmenter unreliability, and matching resolution before comparing systems is mandatory rather than cosmetic. FID between $200$ real and $200$ swaps at $512^2$ is $148.5$, a sanity check only at this $N$.

\subsection{Vessel-Mask ControlNet Baseline}
\label{sec:baseline}

The most directly competitive baseline replaces the learned anatomy encoder with a hard-coded vessel mask from the same independent FIVES segmenter, projected to the same bottleneck channel slot, with everything else identical; with no learned anatomy projection to be invariant on, the contrastive loss is disabled for this row and EncRet is undefined. Compared at $256^2$, the baseline underperforms the testbed on every metric ($+0.07$ clDice, $+6$\,dB PSNR, better ColorDist and OD). \textbf{This comparison is confounded by resolution}: the testbed's headline configuration runs at $512^2$, and §\ref{sec:resconfound} shows resolution dominates clDice. We therefore warm-start the baseline to $512^2$ (the nets are fully convolutional), matching the testbed's resolution, optimiser, and schedule.

\textbf{At matched resolution the ranking inverts and the baseline wins on vessel topology}: clDice $0.582$ $[0.577,0.588]$ against $0.559$ $[0.548,0.569]$, non-overlapping intervals, with the gap holding under the second segmenter ($0.579$ vs.\ $0.554$), on dense Dice, and on style cosine. The testbed leads on reconstruction fidelity ($+3.4$\,dB PSNR) and, decisively, on optic-disc preservation (OD $0.476$ vs.\ $0.193$, floor $0.094$). A clDice margin measured across a resolution mismatch is therefore uninterpretable, however non-circular the metric itself.

\textbf{This row also exhibits circularity mode (ii)} (§\ref{sec:protocol}): the baseline is \emph{conditioned on} $\mathrm{Seg}(A)$ and \emph{scored by} $\mathrm{clDice}(\mathrm{Seg}(\swap),\mathrm{Seg}(A))$, i.e.\ $C = M(A)$, so its clDice rewards reproducing the evaluator's own predictions --- what a mask-conditioned model is trained to do. The score is not invalid, but neither is it clean anatomical fidelity. \emph{Seg2} does not settle it: trained on the same FIVES ground truth, its masks correlate with the first's, so it tests segmenter architecture rather than $C \perp M$; a conclusive test needs a conditioning segmenter trained on disjoint data, beyond our budget. The resolution confound and the OD/PSNR gaps hold independently of the coupling.

\subsection{A Classical GAN, and Downstream Utility}
\label{sec:munit}

\paragraph{A classical GAN.} To situate both models against the literature our critique targets, we add a single-domain MUNIT-style~\cite{cite-munit} content/style autoencoder-GAN at $256^2$, trained for $15$k iterations. It reaches clDice $0.706$ ($[0.698,0.714]$, floor $0.112$), above both the testbed ($0.559$) and the resolution-matched mask baseline ($0.582$) despite lower resolution and budget; a ColorDist diagnostic excludes the degenerate explanation that its decoder copies $A$, and we attribute the margin to its far wider content path. Mode (ii) does \emph{not} apply --- it never sees the evaluating segmenter --- so under clDice alone the testbed ranks last of the three, a ranking the encoder cosine would not reveal.

\paragraph{Downstream utility.}\label{sec:downstream} Topology preservation is not utility. Augmenting a class-balanced DR-grading train set ($500$/class) with one testbed swap per real image gives a \emph{mixed} result on an identical held-out real set: accuracy $0.520\!\to\!0.512$, quadratic-weighted $\kappa$ $0.732\!\to\!0.759$. DR grade depends on non-vessel pathology a vessel/appearance decomposition cannot carry across a swap --- the gap §\ref{sec:disc} finds at the optic disc --- so a model can preserve topology well and still be unhelpful downstream. Both protocols are in the supplementary material.

%==============================================================================
\section{Discussion and Limitations}
\label{sec:disc}
%==============================================================================

\paragraph{Optic-disc fidelity.} The optic disc renders as an OD-sized bright blob at roughly the right location but without internal structure (vessel-emergence boundary, cup-to-disc cues). A classical-CV OD detector, independent of both the segmenter and the generative model, gives OD-Dice $0.476$ at $512^2$ against a floor of $0.094$ --- a real margin, but far below our clDice on the vessel tree. The resolution-matched mask baseline scores $0.193$, barely above the same floor: whatever the learned anatomy stream costs in clDice, it is what carries non-vessel macro-anatomy through the swap at all. The bottleneck, trained by NT-Xent invariance and a vesselness loss that suppresses blob-like structures, under-represents this anatomy, and §\ref{sec:downstream}'s mixed result follows.

\paragraph{Seed variance and segmenter dependence.} Over $n{=}4$ independent $256^2$ trainings, seed std is small (clDice $\pm0.016$, EncRet $\pm0.024$): the EncRet ablation swings ($+0.08$, $-0.15$) are $3$--$7\sigma$ while clDice barely moves, placing the circularity well outside seed noise. The $512^2$ runs are single ($\sim$30\,h each), bounded by the $256^2$ seed std. The protocol also depends on its segmenter; a different one shifts absolute numbers, which we mitigate by always reporting the floor. Recomputing every row under \emph{Seg2} (Table~\ref{tab:ablation}) leaves the ablation ordering and the baseline comparison unchanged --- establishing robustness to segmenter \emph{architecture}, not conditioning--evaluator independence (§\ref{sec:baseline}); an ensemble over segmenters trained on disjoint corpora would settle both, and remains future work.

\paragraph{Style transfer is also an encoder cosine.} Our ``Style'' column, $\cos(\Es(B),\Es(\swap))$, is circular in the same way, which is why we pair it with the non-encoder ColorDist. The two diverge: the $512^2$ mask baseline posts the highest Style cosine of any row ($0.905$) on an unremarkable ColorDist ($0.066$). Style's ordering among close variants should not be over-interpreted; we report it only for continuity with prior work.

%==============================================================================
\section{Conclusion}
%==============================================================================

The encoder-cosine ``anatomy retention'' metric is \emph{circular}: it measures invariance of the encoder that generated the swap, not preservation of anatomy. Our non-circular replacement --- clDice via an independently-trained segmenter, read against an unrelated-pair floor --- is stable under a second segmenter, but applying it across three architectures shows non-circularity is necessary but not sufficient. We recommend that work evaluating conditional medical-image synthesis (i) prefer segmenter-based topology metrics over encoder-projection cosines, (ii) match resolution and compute before reading any margin, (iii) audit whether a model's conditioning signal and its evaluator share a source, and (iv) report complementary evidence so no single scalar carries the claim. Conditions (ii) and (iii) are not hypothetical: each changes a conclusion above.

\section*{Acknowledgement}
This study was supported by Qatar Research Development and Innovation (QRDI) Council fund under Path Towards Precision Medicine PPM-07-0418-240048.

\bibliographystyle{splncs04}
\bibliography{refs}

\end{document}
