% Supplementary material for:
%   "Disentangled Retinal Fundus Synthesis with Non-Circular Vessel-Topology Evaluation"
\documentclass[runningheads]{llncs}

\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{amsmath,amssymb}
\usepackage{booktabs}
\usepackage[hidelinks]{hyperref}
\usepackage{marvosym} 

\newcommand{\Ea}{E_{a}}
\newcommand{\Es}{E_{s}}
\newcommand{\Vc}{\mathbf{c}}
\newcommand{\Vsv}{\mathbf{v}_{s}}
\newcommand{\swap}{\mathrm{swap}}

\title{Supplementary Material\\Disentangled Retinal Fundus Synthesis with\\Non-Circular Vessel-Topology Evaluation}
\titlerunning{Supplementary: Non-Circular Disentanglement Evaluation}
\author{Syed Abdullah Basit\textsuperscript{(\Letter)} \and Tanvir Alam}
\authorrunning{S. A. Basit and T. Alam}
\institute{College of Science and Engineering, Hamad Bin Khalifa University, Doha, Qatar\\
\email{syba62904@hbku.edu.qa}}

\begin{document}
\maketitle

%==============================================================================
\section{Blur Sweep: Full Curve}
\label{sup:blur}
%==============================================================================

Figure~\ref{fig:sup-blur} plots the full Gaussian-blur sweep summarised in §4.2 of the main paper. The two curves decouple monotonically over the whole range of $\sigma$: independent-segmenter clDice between the original and blurred images collapses from $0.999$ to $0.0003$, crossing the unrelated-pair floor of $0.11$ at approximately $\sigma{=}6$, while the encoder cosine never falls below $0.78$. The degradation is applied to the input image directly, outside the generative model, so no property of $\epsilon_\theta$ is involved: the blur preserves the coarse layout that keeps the encoder cosine high while destroying the fine vessel structure the protocol is meant to measure, which is precisely the regime in which a circular metric and a non-circular one must disagree.

\begin{figure}[htbp]
\centering
\includegraphics[width=0.52\linewidth]{figs/fig2_blur_sweep.png}
\caption{The encoder cosine is blind to vessel destruction. As Gaussian-blur $\sigma$ grows, independent-segmenter clDice between original and blurred image collapses from $0.999$ to $0.0003$ --- far below the $0.11$ unrelated-pair floor --- while the encoder cosine stays above $0.78$. Mean$\pm$std over $N{=}40$ images.}
\label{fig:sup-blur}
\end{figure}

%==============================================================================
\section{Testbed: Guidance and Optimisation}
\label{sup:training}
%==============================================================================

This section details the testbed configuration summarised in §3.1 and §4.1 of the main paper.

\paragraph{Classifier-free guidance.} During training, with probability $0.1$ we zero both $\Vc$ and the multi-scale residuals; independently and with the same probability we zero $\Vsv$. At inference we run three UNet passes --- null, anatomy-only, full --- and combine
\begin{equation}
\epsilon = \epsilon_u + \alpha_a(\epsilon_a - \epsilon_u) + \alpha_s(\epsilon_{as} - \epsilon_a),
\end{equation}
giving compositional guidance on anatomy ($\alpha_a$) and style ($\alpha_s$) separately. Independent dropout of the two conditioning streams is what makes the two guidance scales separable at inference; reported results use $(\alpha_s,\alpha_a){=}(1.0,3.0)$ from a coarse sweep at the main checkpoint.

\paragraph{Loss weights.} $\lambda_{\mathrm{diff}}{=}1.0$, $\lambda_{\mathrm{contrastive}}{=}0.2$, $\lambda_{\mathrm{style\_var}}{=}1.0$, $\lambda_{\mathrm{style\_recon}}{=}2.0$, $\lambda_{\mathrm{struct}}{=}0.5$. These were set once by coarse manual balancing so that no single term dominated the diffusion loss early in training, and were not tuned per dataset or per ablation; every row of Table~1 in the main paper uses the same values, so the ablation contrasts isolate the presence of a term rather than its weight.

\paragraph{Optimisation.} AdamW at $10^{-4}$ peak learning rate, 10-epoch warmup, cosine decay to $10^{-5}$. Three parameter groups with per-group learning-rate multipliers (UNet $1\times$, $\Es$ $5\times$, $\Ea$ $0.1\times$): the appearance encoder is trained faster because VICReg's variance term otherwise collapses early, and the anatomy encoder slower because the contrastive objective is the only thing holding the bottleneck stable. Per-group gradient clipping at $1.0$, bf16 mixed precision, UNet gradient checkpointing, one NVIDIA RTX 3090 ($24$\,GB). A $512^2$ run takes $\sim$30\,h; each $256^2$ ablation $\sim$6\,h.

%==============================================================================
\section{Qualitative Testbed Swaps}
\label{sup:qual}
%==============================================================================

\begin{figure}[htbp]
\centering
\includegraphics[width=0.62\linewidth]{figs/fig3_qualitative_2row.png}
\caption{Qualitative swaps from the latent-diffusion testbed at $512^2$. Two randomly drawn pairs (rows). Columns: source $A$ (vessel anatomy), source $B$ (appearance), and $\swap(A,B)$, at $(\alpha_s,\alpha_a){=}(1.0,3.0)$ with $50$ DDIM steps. Vessel topology follows $A$; colour, exposure, and texture follow $B$. The optic disc is rendered as a bright blob at approximately the correct location but without internal structure --- the limitation quantified as OD-Dice in the main paper (§5) and analysed there.}
\label{fig:sup-qual}
\end{figure}

%==============================================================================
\section{Classical GAN Baseline: Architecture and Training}
\label{sup:munit}
%==============================================================================

This section details the MUNIT-style baseline summarised in §4.6 of the main paper.

\paragraph{Architecture.} A single-domain content/style autoencoder-GAN following MUNIT's decomposition:
\begin{itemize}
  \item \textbf{Content encoder.} A stem convolution and two stride-2 downsampling convolutions ($3\!\to\!64\!\to\!128\!\to\!256$ channels) followed by $4$ residual blocks, all with InstanceNorm --- which removes per-channel first- and second-order statistics and so suppresses style information. Output: a $256$-channel map at $1/4$ input resolution.
  \item \textbf{Style encoder.} Four stride-2 convolutions with no normalisation (normalisation would discard exactly the statistics style must retain), then global average pooling and a $1{\times}1$ convolution to an $8$-dimensional style code.
  \item \textbf{Decoder.} Four AdaIN residual blocks whose per-layer affine parameters are produced by a 3-layer MLP from the style code, followed by two nearest-neighbour upsampling stages and a $\tanh$ output.
  \item \textbf{Discriminator.} A 2-scale PatchGAN (full and half resolution) with LSGAN loss.
\end{itemize}

\paragraph{Objective.} $\mathcal{L} = 10\,\mathcal{L}_{\mathrm{recon}}^{x} + \mathcal{L}_{\mathrm{recon}}^{c} + \mathcal{L}_{\mathrm{recon}}^{s} + \mathcal{L}_{\mathrm{adv}}$, with within-domain image reconstruction, latent content- and style-cycle reconstruction (encode the cross-synthesised image, recover $c_1$ and $s_2$), and the LSGAN adversarial term. Cross-synthesis pairs are formed by a random within-batch permutation.

\paragraph{Training.} Adam, learning rate $10^{-4}$, $(\beta_1,\beta_2)=(0.5, 0.999)$, weight decay $10^{-4}$, batch size $8$, $256^2$ resolution, $15$k iterations. This is roughly an order of magnitude below MUNIT's typical $100$k+ schedule, so the run constitutes a reduced-budget baseline rather than a converged MUNIT, and its clDice is a lower bound on what the architecture can reach.

\paragraph{Evaluation.} Identical to the main protocol: the same $N{=}300$ fixed pairs (seed $42$), the same frozen FIVES ResNet-34 segmenter, the same bootstrap procedure. Results appear in §4.6 of the main paper. This baseline is never exposed to the evaluating segmenter, so unlike the vessel-mask ControlNet it does not exhibit conditioning--evaluator coupling (circularity mode (ii), §3.2).

\paragraph{Style-transfer diagnostic.} A high clDice would be trivial if the decoder simply ignored the style code and copied $A$, so we check that appearance genuinely transfers. ColorDist$(\swap,B){=}0.064$ falls below the unrelated-pair baseline ColorDist$(A,B){=}0.104$, and $\swap$ is colour-\emph{closer} to $B$ ($0.064$) than to $A$ ($0.074$). The degenerate copy explanation is therefore excluded: MUNIT-lite's clDice advantage is not bought by ignoring the style code. We attribute it instead to content-channel capacity --- a dense $256$-channel content map at $1/4$ resolution against the testbed's $8$-channel bottleneck at $1/8$, further constrained by the contrastive and vesselness objectives. A wider, less-regularised content path preserves more vessel detail while guaranteeing nothing about content/style separation, which is precisely what the narrow bottleneck buys and what clDice alone cannot see.

%==============================================================================
\section{Downstream Utility: Full Protocol}
\label{sup:downstream}
%==============================================================================

This section details the DR-grading experiment summarised in §4.6 of the main paper.

\paragraph{Split.} Grades are taken from the source corpora's own labels (EyePACS + APTOS union, grades 0--4). We draw a class-balanced split of $500$ training and $100$ validation images per grade ($2{,}500$ and $500$ images in total), disjoint, fixed seed $42$. Validation images are always \emph{real} and are never augmented, so the two conditions are scored on identical data.

\paragraph{Conditions.}
\begin{itemize}
  \item \emph{real-only}: $2{,}500$ real training images.
  \item \emph{real+synth}: the same $2{,}500$ real images plus $2{,}500$ synthetic images, one per real image. Each synthetic image is $\swap(A, B)$ where $A$ is the corresponding real training image (contributing vessel anatomy and, by assumption, its grade label) and $B$ is a randomly drawn other training image (contributing appearance only). Generated with the $256^2$ checkpoint, $20$ DDIM steps, $(\alpha_s, \alpha_a) = (1.0, 3.0)$.
\end{itemize}

\paragraph{Classifier.} ResNet-18, ImageNet-initialised, $224^2$ inputs, AdamW at $10^{-4}$ with cosine decay, weight decay $10^{-4}$, batch size $32$, $15$ epochs, random horizontal flip. Both conditions use identical hyperparameters and seed; only the training set differs.

\paragraph{Metrics.} Accuracy, quadratic-weighted Cohen's $\kappa$ (the standard DR-grading metric, which penalises distant misgradings more heavily), and per-class recall.

\begin{table}[htbp]
\centering
\caption{DR-grading downstream utility. Identical real held-out validation set for both rows; bold marks the better value in each column (ties unmarked).}
\label{tab:sup-downstream}
\begin{tabular}{lccccccc}
\toprule
Condition & Acc. & QWK & \multicolumn{5}{c}{Per-class recall} \\
\cmidrule(lr){4-8}
 & & & G0 & G1 & G2 & G3 & G4 \\
\midrule
real-only  & \textbf{0.520} & 0.732 & 0.53 & \textbf{0.50} & \textbf{0.37} & 0.56 & 0.64 \\
real+synth & 0.512 & \textbf{0.759} & 0.53 & 0.48 & 0.30 & 0.56 & \textbf{0.69} \\
\bottomrule
\end{tabular}
\end{table}

\paragraph{Reading the result.} Accuracy is flat-to-slightly-down while quadratic $\kappa$ improves, as expected when errors shift from distant to adjacent grades without becoming less frequent. Per-class recall is mixed. The result is therefore \emph{mixed rather than positive}, and does not establish augmentation utility for DR grading.

DR grade is driven substantially by non-vessel pathology --- haemorrhages, exudates, cotton-wool spots, neovascularisation --- which a vessel-anatomy/appearance decomposition has no explicit mechanism to preserve when appearance is swapped. A swap that faithfully carries $A$'s vessel tree may nonetheless discard the lesions that determine $A$'s label, in which case the synthetic image is mislabelled and acts as training noise. This is consistent with the optic-disc finding (§5), and motivates reporting downstream evidence alongside topology metrics rather than treating the latter as a proxy for utility.

\paragraph{Strengthening the experiment.} The experiment could be strengthened with a lesion-preservation metric analogous to OD-Dice, or with a downstream task that depends directly on vessel topology (e.g.\ vessel-density-based hypertensive-retinopathy screening) rather than on lesion detection. Both are future work.

%==============================================================================
\section{Second Segmenter}
\label{sup:seg2}
%==============================================================================

The \emph{Seg2} column of Table~1 in the main paper uses a U-Net with a ResNet-18 encoder (vs.\ ResNet-34 for the primary segmenter), independently initialised (seed $1234$) and trained on the same FIVES train split with the same recipe: AdamW at $3\times 10^{-4}$, weight decay $10^{-4}$, BCE $+$ Dice loss, batch size $4$, $40$ epochs, $512^2$ inputs. Final held-out test Dice: $0.889$ (primary segmenter: $>0.7$ by the same measure, reported in §4.1).

Seg2 tests whether conclusions depend on the \emph{architecture} of the evaluating segmenter. It does not test conditioning--evaluator independence (circularity mode (ii)), because it is trained on the same FIVES ground-truth masks as the primary segmenter and its predictions are therefore strongly correlated with those of the primary segmenter. Disentangling those two questions requires a segmenter trained on a disjoint vessel-annotation corpus, which we identify as the principal open item in §4.5.

\end{document}
