\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution
\usepackage{array}

%  THIS IS FOR TRACK CHANGES...
\usepackage{cbar}
\cbtrue

\usepackage{}
\newcommand{\PreserveBackslash}[1]{\let\temp=\\#1\let\\=\temp}
\newcolumntype{C}[1]{>{\PreserveBackslash\centering}p{#1}}
\usepackage{mwe} % to get dummy images
\jmlrvolume{-- 234}
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\editors{Accepted for publication at MIDL 2025}

% \title[Context based kernel harmonization]{Context based harmonization of low-dose lung cancer computed tomography reconstruction kernels using multipath cycleGAN}
\title[Anatomically guided kernel harmonization]{Anatomy-Guided Multi-Path CycleGAN for Lung CT Kernel Harmonization}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicate cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Aravind R. Krishnan\nametag{$^{1}$}}  \Email{aravind.r.krishnan@vanderbilt.edu}\\
\Name{Thomas Z. Li}\nametag{$^{1}$} \Email{thomas.z.li@vanderbilt.edu}\\
\Name{Lucas W. Remedios}\nametag{$^{1}$} \Email{lucas.w.remedios@vanderbilt.edu}\\
\Name{Kaiwen Xu\nametag{$^{2}$}} \Email{kaiwen.xu@insitro.com}\\
\Name{Lianrui Zuo\nametag{$^{1}$}} \Email{lianrui.zuo@vanderbilt.edu}\\
\Name{Kim L. Sandler\nametag{$^{3}$}} \Email{kim.sandler@vumc.org}\\
\Name{Fabien Maldonado}\nametag{$^{3}$} \Email{fabien.maldonado@vumc.org}\\
\Name{Bennett A. Landman}\nametag{$^{1,3}$} \Email{bennett.landman@vanderbilt.edu}\\
\addr $^{1}$ Vanderbilt University, Nashville, TN, USA \\
\addr $^{2}$ Insitro, South San Francisco, CA, USA \\
\addr $^{3}$ Vanderbilt University Medical Center, Nashville, TN, USA \\
}

\begin{document}

\maketitle

\begin{abstract}
Accurate quantitative measurement in lung computed tomography~(CT) imaging often relies on consistent kernel reconstruction across scanners and manufacturers. 
Harmonization can reduce measurement variability caused by heterogeneous reconstruction kernels; however, harmonization across different manufacturers and scanners remains challenging due to significant differences in reconstruction protocol and positional alignment of subjects, often resulting in anatomical hallucinations.
%\Added{\textbf{ADDED CONTENT BASED ON REVIEWER 1}}{R1}{ADDED}
%\Deleted{\textbf{deleted content}}{R1}{DELETED}
% Harmonizationof images reconstructed with kernels from the same manufacturer for a given subject is feasible due to the presence of a one-to-one pixel correspondence between the images. 
% However, cross-manufacturer harmonization presents challenges due to differences in reconstruction protocol and positional alignment of subjects, resulting in hallucinations of specific anatomical regions. 
To address this, we propose a multi-path cycleGAN framework that incorporates multi-region anatomical labels and a tissue statistic loss as anatomical regularization to preserve structural integrity during harmonization.
We trained our model on $100$ scans each of four representative reconstruction kernels from the National Lung Screening Trial (NLST) dataset and evaluated it on $240$ withheld scans. 
Experimental results demonstrate superior performance of our method in both within-manufacturer harmonization and cross-manufacturer harmonization:
Harmonizing hard- to soft-kernel images within a single manufacturer significantly reduces emphysema measurement discrepancies ($p<0.05$). 
Across manufacturers, harmonizing all kernels to a reference soft kernel yields consistent emphysema quantification ($p>0.05$) and preserves anatomical structures, as demonstrated by improved Dice similarity coefficient in skeletal muscle and subcutaneous adipose tissue between harmonized and unharmonized images.
These findings demonstrate that segmentation-driven anatomical regularization effectively addresses cross-manufacturer discrepancies, ensuring robust quantitative imaging. We release our code and model at \href{https://github.com/MASILab/AnatomyconstrainedMultipathGAN}{https://github.com/MASILab/AnatomyconstrainedMultipathGAN}.
% incorporate anatomical regularization using multilabel segmentation masks and train a multipath cycleGAN model, capable of harmonizing across reconstruction kernels obtained from different manufacturers using domain specific encoders, decoders and discriminator models. Specifically, we train our model on 100 scans each of four representative reconstruction kernels obtained from the National Lung Screening Trial (NLST) dataset by incorporating a tissue statistic loss that computes the mean between the synthetic image and the real image for a given path to prevent anatomical hallucinations.
% Using 240 withheld scans, we harmonize a hard kernel to a corresponding soft kernel within a given manufacturer and quantify percent emphysema. Across manufacturers, we harmonize all kernels to a reference soft kernel, quantifying percent emphysema and anatomical consistency. The proposed approach is capable of minimizing differences in measurements for percent emphysema on reconstruction kernels obtained from a given manufacturer (p$<$0.05). Across manufacturers, harmonization of all kernels to a reference soft kernel minimizes differences in emphysema measurements (p$>$0.05). Furthermore, anatomical consistency is preserved in unpaired kernels, as observed from high Dice scores for skeletal muscle and subcutaneous adipose tissue. The segmentation labels obtained from TotalSegmentator provide context for kernel harmonization that alleviates anatomical hallucinations in cross-manufacturer harmonization.
\end{abstract}

\begin{keywords}
cycleGAN, harmonization, CT, emphysema, synthesis
\end{keywords}

\section{Introduction}
The reconstruction kernel in computed tomography (CT) is a scanner parameter that impacts the spatial resolution and signal-to-noise ratio~(SNR) of the image. 
The choice of kernel introduces a trade-off between spatial resolution and noise, where images reconstructed with a ``hard'' kernel have high spatial resolution but poor SNR.
On the other hand, images reconstructed with a ``soft'' kernel have decreased spatial resolution but improved SNR~\cite{schaller2003spatial}. Hard kernels highlight structures such as bone and lung, while soft kernels are useful for soft tissues~\cite{lasek2020ct}. 
However, the acquisition protocol varies across manufacturers, causing undesired variability in ``sharpness'' and ``softness'' of different kernels.
This flexibility of reconstruction kernels often introduces inconsistencies in quantitative imaging measurements, including percent emphysema~\cite{boedeker2004emphysema}, body composition assessment~\cite{troschel2020computed}, radiomic feature assessment~\cite{meyer2019reproducibility}, and coronary artery calcification~\cite{lessmann2017automatic}.
This issue is further compounded in multi-center and longitudinal studies, where acquiring CT images with consistent reconstruction kernels over time is challenging. 
To mitigate this issue, CT harmonization techniques have been proposed in recent years~\cite{krishnan2024inter,krishnan2024lung}.\\
\indent Cross-vendor harmonization is beneficial in multi-center studies and longitudinal studies where paired data might be unavailable.
However, cross-vendor harmonization is challenging due to variability in reconstruction protocols and positional alignment of subjects. In recent times, the cycleGAN~\cite{zhu2017unpaired} has been adapted for cross-vendor harmonization~\cite{gravina2022leveraging, yang2021continuous, selim2022cross}. Our group~\cite{krishnan2024inter} introduced a multipath cycleGAN model with a shared latent space that could minimize differences in emphysema measurements across paired and unpaired kernels. However, the difference in the semantic field of view (FOV) across the images resulted in artifacts outside the FOV. To address these challenges, we modified the shared latent space, standardized preprocessing by enforcing circular FOVs, and developed a two-stage multipath cycleGAN model~\cite{krishnan2025multipathcycleganharmonizationpaired}, hypothesizing that a shared latent space enforces consistency in emphysema measurements and preserves anatomy post harmonization. Additionally, we incorporated an identity loss to preserve radio-opacity post-harmonization. Stage one focused on harmonizing different kernel combinations from Siemens and GE scanners, while in Stage two, the pre-trained kernels were frozen and kernels from Philips and GE were harmonized across all combinations of kernels from both stages.\\
\indent The modified multipath cycleGAN achieved consistent emphysema measurements across paired and unpaired kernels. However, when the GE BONE kernels were harmonized to the reference Siemens B30f kernel, certain regions of the skeletal muscle and subcutaneous adipose tissue (SAT) intensities were inverted, resulting in anatomical ``hallucinations'' (\autoref{fig:problem-figure}).
This issue stems from the distribution matching losses in the objective function that introduce/remove features from the image~\cite{cohen2018distribution,zuo2023haca3}. 
In the absence of additional anatomical constraints, quantitative imaging beyond the lung region, focused on body composition assessment and radiomic feature assessment, becomes difficult.\\
\indent In this work, we hypothesize that skeletal muscle and SAT hallucinations can be mitigated by leveraging pre-computed   TotalSegmentator~\cite{wasserthal2023totalsegmentator} labels in the multipath cycleGAN model. Specifically, we replace the identity loss with a tissue based loss function that penalizes intensity shifts in anatomical structures
to preserve anatomy. 
We compare our approach against Stage one of the multipath cycleGAN model without anatomical guidance, the standard cycleGAN~\cite{zhu2017unpaired} and switchable cycleGAN~\cite{yang2021continuous} on the following tasks: a) emphysema on paired kernels b) emphysema on unpaired kernels and c) quantifying hallucinations in skeletal muscle and SAT post harmonization using Dice coefficients and effect sizes.

\begin{figure}[!tb]
\floatconts
  {fig:problem}
  {\caption{Distribution matching losses in cycleGAN and its variants result in anatomical hallucinations. Harmonization of the GE BONE image to the reference Siemens B30f kernel enforces consistent texture across the lung. However, we observe that in the highlighted regions and beyond, skeletal muscle and SAT tissue intensities are inverted, leading to inconsistent anatomy.}}
  {\includegraphics[width=0.7\linewidth, height=0.3\textwidth, keepaspectratio]{Problem-1.png}}
  \label{fig:problem-figure}
\end{figure}

\section{Methodology}
A standard cycleGAN model uses two generators and discriminators for unpaired image-to-image translation. In the multipath cycleGAN model, we split a ResNet generator~\cite{he2016deep} into an encoder, shared residual block and decoder such that each domain has an encoder and decoder pair that harmonizes through the shared residual block, behaving as the latent space. Additionally, each domain has a PatchGAN discriminator~\cite{isola2017image} that promotes adversarial training by classifying $70 \times 70$ patches of images as real or synthetic. 
In Stage one of the multipath cycleGAN model, we use images reconstructed with the Siemens B50f (hard), Siemens B30f (soft), GE BONE (hard) and GE STANDARD (soft) kernels from the National Lung Screening Trial (NLST)~\cite{aberle2011reduced} dataset, harmonizing across six different combinations of reconstruction kernels. For each path, the cycleGAN operates as follows: in the forward path, the source encoder maps the real input to the shared latent space, and the target decoder produces a synthetic image in the target domain style. In the backward path, the target encoder maps the real image to the latent space, and the source decoder generates the synthetic image in the source domain style. The PatchGAN discriminator distinguishes between real and synthetic images to promote adversarial learning.\\
\indent Previously, we introduced an identity loss in addition to the adversarial and cycle-consistency losses to preserve the radio-opacity in the harmonized images. This identity loss, computed as the mean squared error between the downsampled versions of the real and synthetic images, smooths out kernel effects and encourages the model to preserve intensity across the entire image. However, the identity loss primarily focuses on preserving global structural consistency, which may overlook subtle intensity variations in localized structures. To address this, we replace the identity loss with a tissue statistic loss that penalizes the mean intensity difference for each anatomical structure between the real and synthetic images. The tissue statistic loss uses anatomical labels derived from TotalSegmentator {(version 2.1.0)}. The mean intensity is chosen as the statistic with the assumption that mean intensities between hard and soft kernels are similar.
As shown in Fig.~\ref{fig:method-figure}, our proposed approach follows the multipath cycleGAN architecture with the tissue statistic loss replacing the identity loss to preserve anatomy.

\begin{figure}[!tb]
\floatconts
  {fig:method}
  {\caption{For any given pair of reconstruction kernels, the generator is a ResNet, formed by a source encoder and target decoder in the forward path and a target encoder and source decoder in the backward path. Each generator produces a synthetic image with the style of either the source or target kernel. A PatchGAN is used as a discriminator for the corresponding domain to distinguish between real and synthetic images. The mean intensities of all unique labels are computed using the multilabel masks for the real and synthetic images, and their differences are penalized such that the anatomy remains preserved in the harmonized image.}}
  {\includegraphics[width=0.9\linewidth, height=0.5\textwidth, keepaspectratio]{Method_v3-1.png}}
  \label{fig:method-figure}
\end{figure}
 
We train our proposed multipath cycleGAN model with anatomical guidance using 100 volumes per kernel, reconstructed with the Siemens B50f (hard), Siemens B30f (soft), GE BONE (hard) and GE STANDARD (soft) kernels.
We train on axial image slices and multilabel masks of size $512 \times 512$ pixels, clipping intensities to $[-1024, 3072]$ Hounsfield units (HU) and normalizing to $[-1, 1]$. The model is trained for 200 epochs in parallel on two NVIDIA RTX A6000 GPUs with a batch size of four, the Adam~\cite{kingma2014adam} optimizer and a learning rate of $2 \times 10^{-3}$, which remains constant for 100 epochs before linearly decaying. The generator and discriminator are governed by an adversarial loss using the LSGAN~\cite{mao2017least} loss function. We implement the cycle-consistency loss following the standard cycleGAN model, weighted by a factor of 10. The tissue statistic loss, which is computed as the mean squared error (MSE) between the mean intensities of anatomical labels in the real and synthetic images, is weighted by a factor of 0.01.

\section{Experiments}
\subsection{Baseline models}
We compare our proposed approach to three different baseline models: standard cycleGAN~\cite{zhu2017unpaired}, switchable cycleGAN~\cite{yang2021continuous} and the multipath cycleGAN without anatomical guidance~\cite{krishnan2025multipathcycleganharmonizationpaired}. We implement 4 individual standard cycleGAN models and switchable cycleGAN models for all paired kernels and unpaired kernels that harmonize to a reference soft kernel. We trained each standard cycleGAN for 200 epochs using a batch size of six, Adam optimizer for the generator and discriminator and learning rate of $2 \times 10^{-3}$. Each switchable cycleGAN was trained for 200 epochs using a batch size of 16, patch size of $128 \times 128$ pixels, Adam optimizer and a learning rate of $1 \times 10^{-5}$. 
The multipath cycleGAN model without anatomical guidance is trained with the same hardware and optimization settings as the anatomy guided multipath cycleGAN, with a batch size of two.

\subsection{Evaluation using percent emphysema}
Generative networks vary in performance during every epoch as a result of adversarial learning. We use 100 volumes from each representative kernel as a validation set to choose the checkpoint used during inference. We select the optimal model checkpoint for inference using percent emphysema as our validation metric. Using a publicly available algorithm~\cite{hofmanninger2020automatic}, 
we identify lung regions and compute percent emphysema by quantifying the percentage of voxels that have radio-opacity less than $-950$ HU. We consider the following paths for checkpoint selection: Siemens B50f to Siemens B30f, GE BONE to Siemens B30f and GE STANDARD to Siemens B30f. MSE is computed for emphysema scores on paired kernels, while Kullback-Leibler (KL) divergence is used for unpaired kernels. We rank all the scores, performing a weighted sum of ranks to obtain the overall rank for the given epoch using the equation: 
\begin{align}
\text{Epoch} = 
0.5 \cdot \text{MSE}_{\text{B50f}\to\text{B30f}} + 
0.25 \cdot \text{KL}_{\text{BONE}\to\text{B30f}} + 
0.25 \cdot \text{KL}_{\text{STANDARD}\to\text{B30f}}
\end{align}
The epoch with the smallest rank is chosen as the optimal checkpoint for inference.
\subsection{Quantifying tissue hallucinations using TotalSegmentator}
We investigate the consistency in skeletal muscle and SAT post harmonization for the GE kernels. Using Dice coefficients, we quantify the magnitude of hallucinations on the unpaired kernels for every available model.
Cohen's $d$~\cite{cohen2013statistical} is computed on the Dice scores obtained for muscle and SAT to quantify the effect size between our proposed approach and the baseline models. Cohen's $d$ quantifies the magnitude of differences between two groups by considering the means and pooled standard deviation. Effect sizes to quantify hallucinations are categorized as small ($d \leq 0.2$), medium ($0.2 \leq d \leq 0.5$), and large ($d \geq 0.8$). A positive $d$ indicates a higher mean in group one compared to group two, while a negative $d$ indicates the opposite.
\section{Results}
We compare performance of the proposed approach with the baseline models on emphysema quantification and anatomical consistency using 240 withheld volumes.

\subsection{Emphysema quantification on paired and unpaired kernels}
We investigate the impact of kernel harmonization on paired reconstruction kernels obtained from Siemens and GE manufacturers. We use bootstrapping with 1000 resamples to compute the median root mean squared error (RMSE) and $95\%$ confidence intervals. All differences between measurements before and after harmonization are significant ($p<0.05$, Wilcoxon signed-rank test). All measurements are presented in Table~\ref{tab:emphysema-paired}. Before harmonization, there is variability in the emphysema measurements for the Siemens (12.12$\%$ [11.76$\%$, 12.42$\%$]) and GE (9.91$\%$ [9.5$\%$, 10.33$\%$]) kernels. Harmonization minimizes differences in emphysema measurements across all models with the anatomy-guided multipath cycleGAN achieving the lowest RMSE for Siemens (1$\%$ [0.9$\%$, 1.11$\%$]) and the counterpart model without anatomical guidance achieving the lowest RMSE for GE (0.84$\%$ [0.75$\%$, 0.95$\%$]).\\
\indent Across reconstruction kernels from all manufacturers, before harmonization, the B50f and BONE kernels have median emphysema scores of 20.02$\%$ and 12.75$\%$ while the B30f (reference) and STANDARD kernels have scores of 6.60$\%$ and 2.34$\%$. We present the minimum, maximum, and median emphysema scores in Table~\ref{tab:emphysema-unpaired_reference_kernel}. Compared to the counterpart model without guidance, anatomical guidance improved consistency in emphysema measurements with a small improvement in the B50f kernel, while median emphysema scores for BONE and STANDARD were slightly lower. Across other baselines, the proposed model with anatomical guidance showed consistent emphysema measurements.

\begin{table}[tb!]
    \centering
    \caption{Percent emphysema measurements before and after harmonization across all models for paired kernels from the Siemens and GE manufacturers. Measurements are represented as RMSE and 95$\%$ confidence intervals. Differences before and after harmonization are significant ($p<0.05$, paired Wilcoxon signed-rank test). $^*p <0.05$ compared to other methods.}
    \label{tab:emphysema-paired}
    \resizebox{0.7\textwidth}{!}{%
    \begin{tabular}{l c c}
    \toprule
    ~ & \textbf{Siemens} & \textbf{GE} \\ \hline
    \textbf{Before harmonization} & 12.12$\%$ [11.76$\%$, 12.42$\%$] & 9.91$\%$ [9.5$\%$, 10.33$\%$] \\ \hline
    \textbf{CycleGAN} & 1.16$\%$ [1.05$\%$, 1.29$\%$] & 0.91$\%$ [0.81$\%$, 1.05$\%$] \\ \hline
    \textbf{Switchable CycleGAN} & 1.57$\%$ [1.41$\%$, 1.71$\%$] & 1.01$\%$ [0.89$\%$, 1.21$\%$] \\ 
    \hline \hline
    \textbf{Proposed (w/o guidance)} & 1.35$\%$ [1.22$\%$, 1.52$\%$] & \textbf{0.84$\%$ [0.75$\%$, 0.95$\%$]$^*$} \\ \hline
    \textbf{Proposed (w/ guidance)} & \textbf{1$\%$ [0.9$\%$, 1.11$\%$]$^*$} & 1.05$\%$ [0.94$\%$, 1.25$\%$] \\
    \bottomrule
    \end{tabular}%
}
\end{table}


\begin{table}[tb!]
    \centering
    \caption{Percent emphysema measurements for all kernels harmonized to the reference Siemens B30f kernel, which has a median emphysema measurement of 6.60$\%$ (0.22$\%$, 40.54$\%$), across all models. All measures are expressed as median (minimum, maximum). The Mann-Whitney U test is used to assess statistical significance before and after harmonization.}
    \label{tab:emphysema-unpaired_reference_kernel}
    \resizebox{0.9\textwidth}{!}{%
    \begin{tabular}{l c c c}
    \toprule
    ~ & \textbf{Siemens B50f} & \textbf{GE BONE} & \textbf{GE STANDARD} \\ \hline
    \textbf{Before harmonization} & 20.02$\%$ (2.32$\%$, 43.08$\%$) & 12.75$\%$ (0.75$\%$, 43.59$\%$) &  2.34$\%$ (0.02$\%$, 42.38$\%$)\\ \hline
    \textbf{CycleGAN} & 7.19$\%$ (0.23$\%$, 40.78$\%$) & \textbf{6.00$\%$ (0.38$\%$, 50.07$\%$)} & 5.56$\%$ (0.30$\%$, 59.43$\%$)\\ \hline
    \textbf{Switchable CycleGAN} & 7.30$\%$ (0.21$\%$, 42.36$\%$) & 10.01$\%$ (1.28$\%$, 55.54$\%$) & \textbf{6.53$\%$ (0.15$\%$, 56.37$\%$)}\\ 
    \hline \hline
    \textbf{Proposed (w/o guidance)} & 6.34$\%$ (0.27$\%$, 38.92$\%$) & 5.70$\%$ (0.11$\%$, 50.24$\%$) & 6.01$\%$ (0.22$\%$, 53.13$\%$)\\ \hline
    \textbf{Proposed (w/ guidance)} & \textbf{6.73$\%$ (0.24$\%$, 40.00$\%$)} & 5.11$\%$ (0.14$\%$, 46.03$\%$) & 5.25$\%$ (0.16$\%$, 45.77$\%$)  \\
    \bottomrule
    \end{tabular}%
}
\end{table}


\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth, height=0.3\textheight, keepaspectratio]{Qualitative_figure-1.png}
    \caption{Unpaired kernel harmonization using distribution matching losses enforces consistency in the lung. However, in the cycleGAN and multipath cycleGAN models, skeletal muscle and SAT tissue intensities are inverted, resulting in anatomical hallucinations. Using the anatomical guidance as a regularization constraint, we observe that hallucinations in the regions of interest disappear.}
    \label{fig:qualitative-figure}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=1.0\linewidth,keepaspectratio]{dice_scores_unpaired.png}
    \caption{For the GE kernels, Dice scores computed on skeletal muscle and SAT highlight the degree of hallucinations. Anatomical guidance improves Dice scores on muscle and SAT compared to our counterpart model and the cycleGAN model. While Dice scores of the proposed approach are better on skeletal muscle, the switchable cycleGAN shows slightly superior performance on SAT.}
    \label{fig:dice-effect-size}
\end{figure}

 



\begin{table}[!tb]
    \caption{Cohen's d to quantify anatomical hallucinations in skeletal muscle and SAT. Effect sizes to quantify hallucinations are categorized as small ($d \leq 0.2$), medium ($0.2 \leq d \leq 0.5$), and large ($d \geq 0.8$). Numbers are calculated between proposed with anatomy guide ``\texttt{Proposed w/ guide}'' and other comparison methods.}
    \label{tab:effect_sizes}
    \centering
    \resizebox{0.9\textwidth}{!}{
    \begin{tabular}{p{4cm} C{2cm} C{2cm} C{2cm} C{2cm} C{2cm} C{2cm}}
    \toprule
     \textbf{Proposed w/ guide } & \multicolumn{2}{c}{\bf CycleGAN}  
     & \multicolumn{2}{c}{\bf Proposed w/o guide}  
     & \multicolumn{2}{c}{\bf Switch. CycleGAN} \\ 
     \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7}
     & {Muscle} & {SAT} & {Muscle} & {SAT} & {Muscle} & {SAT} \\
     \cmidrule(lr){2-7}
     \textbf{BONE} $\rightarrow$ \textbf{B30f} & $2.05$ & $1.77$ & $0.47$ & $1.01$ & $0.34$ & $-0.07$ \\
     \textbf{STD.} $\rightarrow$ \textbf{B30f} & $6.19$ & $2.45$ & $1.69$ & $1.39$ & $0.27$ & $-0.29$ \\
     \bottomrule
    \end{tabular}}
% \floatconts
%   {tab:effect_sizes}% Label for the table
%   {\caption{Cohen's d to quantify anatomical hallucinations in skeletal muscle and SAT. Effect sizes to quantify hallucinations are categorized as small (d $\leq$ 0.2), medium (0.2 $\leq$d$\leq$0.5), and large (d $\geq$ 0.8)}}% Caption
%   {\begin{tabular}{|p{3cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|}
%   \hline
%   \textbf{Kernel Pair} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs cycleGAN}} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs multipathGAN}} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs switchable cycleGAN}} \\ \hline
%    & \textbf{Muscle} & \textbf{SAT} & \textbf{Muscle} & \textbf{SAT} & \textbf{Muscle} & \textbf{SAT} \\ \hline
%   \textbf{BONE to B30f} & 2.05 & 1.77 & 0.47 & 1.01 & 0.34 & -0.07 \\ \hline
%   \textbf{STANDARD to B30f} & 6.19 & 2.45 & 1.69 & 1.39 & 0.27 & -0.29 \\ \hline
%   \end{tabular}}
\end{table}
% \begin{table}[!tb]
% \floatconts
%   {tab:effect_sizes}% Label for the table
%   {\caption{Cohen's d to quantify anatomical hallucinations in skeletal muscle and SAT. Effect sizes to quantify hallucinations are categorized as small (d $\leq$ 0.2), medium (0.2 $\leq$d$\leq$0.5), and large (d $\geq$ 0.8)}}% Caption
%   {\begin{tabular}{|p{3cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|}
%   \hline
%   \textbf{Kernel Pair} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs cycleGAN}} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs multipathGAN}} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs switchable cycleGAN}} \\ \hline
%    & \textbf{Muscle} & \textbf{SAT} & \textbf{Muscle} & \textbf{SAT} & \textbf{Muscle} & \textbf{SAT} \\ \hline
%   \textbf{BONE to B30f} & 2.05 & 1.77 & 0.47 & 1.01 & 0.34 & -0.07 \\ \hline
%   \textbf{STANDARD to B30f} & 6.19 & 2.45 & 1.69 & 1.39 & 0.27 & -0.29 \\ \hline
%   \end{tabular}}
% \end{table}

\subsection{Quantifying tissue hallucinations in unpaired kernel harmonization}
The baseline models show consistent lung texture for the GE kernels when they are harmonized to the reference B30f kernel. However, hallucinations are introduced in specific regions of the skeletal muscle and SAT tissues where the intensities are labeled as lung (\autoref{fig:qualitative-figure}). For the images shown in Fig.~\ref{fig:qualitative-figure}, the proposed anatomical loss maintains the tissue intensity, effectively preserving the underlying anatomy. The anatomical guidance improved the median Dice scores to 0.90 for GE BONE and 0.95 for GE STD on skeletal muscle. Similarly, on SAT, median Dice improved to 0.90 for GE BONE and 0.94 for GE STD (\autoref{fig:dice-effect-size}). However, performance on Dice was variable when compared to the switchable cycleGAN. We present all Cohen's $d$ statistic values that determine effect size in Table~\ref{tab:effect_sizes}. For the BONE kernel, effect sizes were large for skeletal muscle and SAT on cycleGAN, and medium for muscle and large for SAT on the multipath cycleGAN. For the STANDARD kernel, effect sizes were large for both models. Effect sizes ranged from small to medium for the switchable cycleGAN model across both kernels.

\subsection{Performance of proposed approach on external dataset}
We evaluate our proposed model on the NoduleVU dataset, a newly developed multimodal dataset from Vanderbilt University Medical Center. This dataset includes imaging and electronic health records acquired under Vanderbilt IRB $\#$140274. For our study, we select ten scans reconstructed from the Siemens B50f, Siemens B30f, GE BONE, and GE STANDARD kernels after quality assurance. These scans are completely unpaired, meaning no ground truth exists between or across vendors. We assess our model’s performance in emphysema quantification and muscle/SAT assessment after harmonizing all kernels to match the style of the Siemens B30f kernel. The B30f kernel images had an emphysema distribution of 0.39$\%$ (0.01$\%$, 10.11$\%$). Before harmonization, the emphysema distributions for B50f, BONE and STANDARD images were 3.84$\%$ (1.00$\%$, 22.44$\%$), 3.37$\%$ (0.26$\%$, 22.83$\%$) and 1.86$\%$ (0.01$\%$, 8.66$\%$). Post harmonization, the emphysema distributions for B50f, BONE and STANDARD were 1.12$\%$ (0.60$\%$, 5.33$\%$), 2.10$\%$ (0.23$\%$, 18.32$\%$) and 3.56$\%$ (0.25$\%$, 14.47$\%$). The B50f and BONE kernel images showed minimization in emphysema scores while the STANDARD kernel images showed a larger range of scores after harmonization. For the assessment of skeletal muscle and SAT, the BONE kernel and STANDARD kernel showed median Dice scores of 0.93, 0.95 for skeletal muscle and 0.91, 0.95 for SAT.


\section{Discussion}
In this work, we propose a tissue statistic loss in the multipath cycleGAN model that serves as anatomical guidance. The tissue statistic loss is implemented by penalizing mean intensity differences in local anatomical structures, guiding the model to preserve anatomical structures. While our proposed approach obtained consistent measurements for emphysema on the unpaired GE BONE and STANDARD kernels, the scores compared to the counterpart model were slightly lower. We believe that since the proposed loss function penalizes deviations from the mean, it may smooth out extreme intensity values beyond the threshold for emphysema quantification, reducing the number of detected emphysematous voxels. Future work could address this by incorporating robust higher-order statistical loss functions. On the external dataset, our approach minimized emphysema scores in the images reconstructed with the Siemens B50f and GE BONE kernels while the STANDARD kernel images showed slightly larger emphysema scores. The large median Dice scores on the external dataset highlight the preservation of anatomical structures post harmonization. Although our model leverages all possible anatomical labels from TotalSegmentator, the generalizability to other clinical tasks and organs requires further exploration. By leveraging precomputed anatomical labels from TotalSegmentator, our proposed method achieves higher Dice scores and shows large effect sizes on muscle and SAT compared to our counterpart model, demonstrating that the tissue statistic loss improved anatomical consistency post harmonization. 



% \begin{table}[tb!]
%     \centering
%     \caption{Effect sizes for anatomical consistency across models for muscle and SAT across different kernel pairs}
%     \label{tab:effect_sizes}
%     \resizebox{0.7\textwidth}{!}{%
%     \begin{tabular}{|p{3cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|p{1.5cm}|}
%     \toprule
%     \textbf{Kernel Pair} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs cycleGAN}} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs multipathGAN}} & \multicolumn{2}{p{3cm}|}{\textbf{multipathGAN with guidance vs switchable cycleGAN}} \\ \hline
%     & \textbf{Muscle} & \textbf{SAT} & \textbf{Muscle} & \textbf{SAT} & \textbf{Muscle} & \textbf{SAT} \\ \hline
%     \textbf{BONE to B30f} & 2.04 & 1.77 & 0.47 & 1.01 & 0.34 & -0.07 \\ \hline
%     \textbf{STANDARD to B30f} & 6.18 & 2.44 & 1.69 & 1.39 & 0.27 & -0.29 \\ \bottomrule
%     \end{tabular}%
%     }
% \end{table}





\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
\midlacknowledgments{This research was funded by the National Cancer Institute (NCI) grants R01 CA253923-04 and R01 CA253923-04S1. This work was also supported in part by the Integrated Training in Engineering and Diabetes grant number T32 DK101003. This research is also supported by the following awards: National Science Foundation CAREER 1452485; NCI U01 CA196405; UL1 RR024975-01 of the National Center for Research Resources and UL1 TR000445-06 of the National Center for Advancing Translational Sciences; Martineau Innovation Fund grant through the Vanderbilt-Ingram Cancer Center Thoracic Working Group; NCI Early Detection Research Network grant 2U01CA152662. The Vanderbilt Institute for Clinical and Translational Research (VICTR) is funded by the National Center for Advancing Translational Sciences (NCATS) Clinical Translational Science Award (CTSA) Program, Award Number 5UL1TR002243-03. The content is solely the responsibility of the authors and does not necessarily represent the official views of the NIH. We use generative AI to create code segments based on task descriptions, as well as debug, edit, and autocomplete code. Additionally, generative AI technologies have been employed to assist in structuring sentences and performing grammatical checks. It is imperative to highlight that the conceptualization, ideation, and all prompts provided to the AI originate entirely from the authors’ creative and intellectual efforts. We take accountability for the review of all content generated by AI in this work.}


\bibliography{midl25_234}

\clearpage
\appendix
\section{Multipath cycleGAN architecture}
\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth,height=\textheight,keepaspectratio]{Figure_for_reviewer_novelty.png}
    \caption{Multi-domain kernel harmonization between paired and unpaired reconstruction kernels can be accomplished through a multipath cycleGAN model housing a shared latent space, domain specific encoders, decoders and discriminators. For a given path, the source encoder maps the input to the latent space that is translated to an image in the style of the target domain by the target decoder.}
    \label{fig:idea-figure}
\end{figure}

\clearpage
\section{Inspection of visual quality on harmonized images}
\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth,height=0.25\textheight,keepaspectratio]{Paired_visual_check.png}
    \caption{In paired reconstruction kernels, there exists a one-to-one pixel correspondence between the hard and soft kernel for a given vendor with differences in pixel noise. The hard kernel sharpens the lung parenchyma while the soft kernel smoothens it. Harmonization to the corresponding soft kernel image enforces consistent texture in the lung parenchyma across the baseline models and the proposed multipath cycleGAN model with anatomical guidance.}
    \label{fig:paired-visual-check}
\end{figure}

\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth,height=0.25\textheight,keepaspectratio]{Unpaired_visual_check.png}
    \caption{Unpaired reconstruction kernels exhibit differences in the positional alignment of subjects in addition to variations in reconstruction protocol. Harmonization to a reference Siemens B30f soft kernel enforces consistent texture in the lung parenchyma that benefits emphysema quantification. While some of the baseline models show tissue hallucinations in the muscle and fat regions, the proposed multipath cycleGAN model with anatomical guidance and the switchable cycleGAN model preserve the structure of the muscle and fat regions.}
    \label{fig:unpaired_visual}
\end{figure}

\section{Quality of emphysema maps on harmonized images}
\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth,height=\textheight,keepaspectratio]{Emphysema_overlay_reviewer.png}
    \caption{Hard kernels overestimate emphysema as compared to soft kernels. The PSNR and SSIM between the hard and soft kernel images were 35.72 dB and 0.89. Harmonization improves the PSNR and SSIM to 48.16 dB and 0.99, indicating that image quality improved post harmonization. Harmonization minimizes differences in emphysema measurements, enforcing similar emphysema patterns between the soft kernel and harmonized image.}
    \label{fig:emphysema-maps}
\end{figure}

\section{Analyzing failure modes}
\begin{figure}[h!]
    \centering
    \includegraphics[width=\linewidth,height=0.4\textheight,keepaspectratio]{Good_bad_ugly_GE_images.png}
    \caption{Reconstruction kernels show variability in emphysema scores. Harmonization of unpaired kernels to the style of the reference enforces consistency in emphysema score. However, consistency in emphysema is variable across different subjects.}
    \label{fig:good-bad-ugly}
\end{figure}

\section{Computational overhead of proposed approach}
We train our multipath cycleGAN model with anatomical constraints using axial slices and the corresponding multilabel mask of size $512 \times 512$ pixels. We train our model for 200 epochs using a batch size of four, the Adam optimizer, a learning rate of $2 \times 10^{-3}$ and a linear learning rate scheduler where the learning rate remains constant for the first 100 epochs and decays linearly for the remaining 100 epochs. We train this model on two NVIDIA RTX A6000 GPUs (48 GB memory each). Our model consists of 56.156 million trainable parameters compared to 28.258 million for a standard cycleGAN and 40.132 million for a switchable cycleGAN. Given the number of trainable parameters and complex model architecture, training on large datasets consisting of 16343 images for the Siemens kernel and 14614 images for the GE kernels would require two weeks. We speed up training by randomly sampling 20$\%$ of the entire training dataset in each epoch such that our model covers all data samples throughout training. This approach reduced the training time to approximately 92 hours. We stabilize model training by leveraging PyTorch mixed precision to minimize memory consumption. During inference, our model harmonizes a given image within seconds on an NVIDIA RTX A6000 GPU using the corresponding encoder-decoder models. While our approach has a large computational overhead, our model enables training across diverse datasets through a shared latent space, facilitating multi-domain kernel harmonization.

\end{document}
