\documentclass[runningheads]{llncs}

\usepackage[T1]{fontenc}
\usepackage{lmodern}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{array}
\usepackage{tikz}
\usepackage{url}
\usetikzlibrary{arrows.meta,positioning,fit}
\setlength{\textfloatsep}{8pt plus 2pt minus 2pt}
\setlength{\intextsep}{8pt plus 2pt minus 2pt}
\usepackage{xurl}
\usepackage{hyperref}

\begin{document}

\title{Large-Volume Conditioned 3D Latent Diffusion Models for CT Metal Artifact Suppression}
\titlerunning{3D Image-Domain LDMs for CT Metal Artifact Suppression}

\author{Xabier Moreno Casado\inst{1,2}\orcidID{0009-0009-2532-3504} \and
Jef Vandemeulebroucke\inst{1,2,3}\orcidID{0000-0001-5714-3254} \and
Jakub Ceranka\inst{1,2}\orcidID{0000-0002-0241-7737}}
\authorrunning{X. Moreno Casado et al.}
\institute{Department of Electronics and Informatics (ETRO.RDI), Vrije Universiteit Brussel (VUB), Pleinlaan 2, 1050 Brussels, Belgium  \and
imec, Kapeldreef 75, 3001 Leuven, Belgium \and 
Department of Radiology, Universitair Ziekenhuis Brussel (UZ Brussel), Vrije Universiteit Brussel (VUB), Laarbeeklaan 101, 1090 Brussels, Belgium \\}
\maketitle
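% Unnumbered corresponding-author footnote; the group keeps \thefootnote intact for later numbered footnotes.
\begingroup\let\thefootnote\relax\footnotetext{Corresponding author: \email{xabier.moreno.casado@vub.be}}\endgroup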

% MICCAI/LNCS paragraph style: no extra space between paragraphs
\setlength{\parskip}{0pt}
\setlength{\parindent}{1em}

\begin{abstract}
Metal artifacts in computed tomography (CT) significantly degrade anatomical visibility and can compromise downstream image analysis tasks, including segmentation, registration, implant migration assessment, and image-guided surgical planning. Many effective metal artifact reduction methods use projection-domain or dual-domain information, but raw sinograms are often unavailable in retrospective or public clinical CT datasets. In this work, we present the first 3D image-domain framework for large-volume CT metal artifact suppression using conditional latent diffusion models (LDMs). First, a VQ-VAE-GAN compresses CT volumes to latent representations. Second, a 3D conditioned diffusion U-Net operating in the latent space on crops of \(448\times 448\times 256\) suppresses CT metal artifacts by synthesizing non-artifacted versions of the input. We compare two conditional models: an anatomy conditioned LDM and an anatomy-metadata conditioned LDM with extra information on anatomical region, laterality, and implant material. Training is performed on paired synthetic data generated by an anatomy-aware metal artifact generation pipeline that inserts implant masks into clean CT volumes, simulates polychromatic projection artifacts, and reconstructs artifact-corrupted volumes with paired non-artifacted images. On 50 test volumes, the anatomy-metadata conditioned model achieved the strongest structural and perceptual metrics (RMSE 0.021, PSNR 39.430 dB, SSIM 0.992, LPIPS 0.028), while anatomy-only conditioning showed slightly stronger visual artifact suppression. On postoperative real CTs with metal artifacts, the method successfully reduced visible artifacts in qualitative examples. The results support large-volume 3D image-domain LDMs as a promising direction for CT metal artifact suppression.

\keywords{Metal Artifact Suppression \and Computed Tomography \and Latent Diffusion Models \and Large-Volume 3D Synthesis  \and Image-Domain}
\end{abstract}

\section{Introduction}

%ONE SENTENCE ON PREVALENCE OF METAL ARTIFACTS + REF.
The clinical prevalence of computed tomography (CT) images with metal artifacts is increasing year over year~\cite{wangPrevalenceMetalImplants2025}. Metallic implants in CT, such as hip prostheses, spinal screws, plates, clips, and dental metal objects, produce streaking, shading, cupping, and apparent implant enlargement through beam hardening, photon starvation, scatter, and reconstruction nonlinearities \cite{boasCTArtifactsCauses2012}. These artifacts obscure implant-adjacent anatomy and reduce the reliability of downstream image computing and computer-assisted intervention workflows, affecting both human interpretation and automated processing \cite{gjestebyMetalArtifactReduction2016a}.

Metal artifact reduction (MAR) methods are commonly grouped into image-domain, projection-domain, and dual-domain approaches \cite{gjestebyMetalArtifactReduction2016a}. Image-domain methods restore already reconstructed artifact-corrupted CT images; projection-domain methods correct metal traces before reconstruction; and dual-domain methods combine projection and image-domain information. 

A classic projection-domain example is NMAR~\cite{meyerNormalizedMetalArtifact2010}, which replaces corrupted metal traces using interpolation after normalization with a prior image. More recent projection-domain methods use denoising diffusion probabilistic models (DDPMs) to inpaint missing sinogram data \cite{wuPRAISENetDeepProjectionDomain2025}. These methods can improve reconstruction quality, but they require accurate metal-trace identification, as errors in the trace or sinogram completion can propagate to secondary artifacts.

Dual-domain methods combine sinogram and image information. CNN-MAR \cite{zhangConvolutionalNeuralNetwork2018a} uses synthetic metal insertion and CNN-based priors for MAR, while InDuDoNet+ \cite{wangInDuDoNetDeepUnfolding2023} optimizes sinogram and image-domain corrections through physics-informed iterative updates. Recent diffusion methods such as DCDiff~\cite{shenDCDiffDualDomainConditional2024} and Dual Domain Diffusion Guidance~\cite{choiDualDomainDiffusion2024} further exploited learned generative priors while maintaining projection-domain constraints. A practical limitation of dual-domain methods is that they require sinograms or scanner-geometry information, which is often unavailable in retrospective clinical datasets. Training with both sinogram and image data also increases computational cost \cite{choiDualDomainDiffusion2024}.

Image-domain diffusion methods avoid sinogram dependence and can be applied retrospectively to reconstructed volumes. DiffMAR \cite{caiDiffMARGeneralizedDiffusion2024} learns an image-domain diffusion restoration process for CT MAR; however, pixel-space DDPMs remain computationally expensive, slow at inference, and may leave fine residual streak artifacts.

Latent diffusion models (LDMs) reduce diffusion cost by denoising compressed image representations rather than full-resolution voxels \cite{rombachHighResolutionImageSynthesis2022}. MAISI~\cite{guo2025maisi} and MedLoRD~\cite{seyfarthMedLoRDMedicalLowResource2025} showed that high-dimensional 3D CT synthesis up to \(512 \times 512 \times 768\) voxels can be performed with feasible resources by combining image compression and 3D latent diffusion. Since diffusion models have shown image synthesis quality competitive with or superior to GANs \cite{dhariwal2021diffusion}, applying this generative prior to MAR may help achieve stronger artifact suppression and texture preservation compared with earlier CNN/GAN-based approaches \cite{shenDCDiffDualDomainConditional2024}.

LDMs have recently been used for MAR: Choi \textit{et al.}~\cite{choiMetalArtifactReduction2025} proposed a conditional LDM for dental CBCT MAR. However, this approach is limited to 2D slice-wise CBCT images and does not exploit 3D volumetric context.

In this work, we propose the first large-volume 3D image-domain latent diffusion framework for CT metal artifact suppression using a low-resource latent backbone. We adapt the principle of conditioned CT synthesis to paired image-domain MAR, targeting large 3D volumes around \(448 \times 448 \times 256\) voxels. The method was trained on paired data from a synthetic artifact-generation pipeline with anatomy-aware implant insertion, full 3D projection simulation, and paired non-corrupted ground truth images, and was evaluated quantitatively on paired synthetic data and qualitatively on real cases. The implementation, synthetic data generation framework and trained models are available at: \url{https://github.com/ETRO-MIT/3D-LDMs-for-CT-MAR}.

%%CNN-MAR \cite{zhangConvolutionalNeuralNetwork2018a} uses synthetic metal insertion and a CNN prior within a projection-domain framework. InDuDoNet+ unfolds an interpretable dual-domain optimization process \cite{wangInDuDoNetDeepUnfolding2023}. Recent diffusion MAR methods, including DCDiff (DUAL DOMAIN) [REF] and dual-domain DDPM variants [REF], exploit learned generative priors while maintaining projection-domain constraints \cite{shenDCDiffDualDomainConditional2024,xiaDualDomainDenoisingDiffusion2025}. These methods demonstrate strong MAR potential but require sinograms or information on scanner-geometry that are often unavailable in clinical research settings and retrospective or public clinical CT datasets. 
% Classical metal artifact reduction (MAR) and recent deep learning methods often correct corrupted projection data or use a combination of sinogram and image domain input \cite{zhangConvolutionalNeuralNetwork2018a,wangInDuDoNetDeepUnfolding2023,shenDCDiffDualDomainConditional2024}. These designs are powerful when acquisition geometry and raw projections are available. In many clinical research settings, however, only reconstructed CT volumes are accessible. Image-domain MAR is therefore practical, but many learning-based methods are two-dimensional, which can produce inconsistent corrections across adjacent slices and does not fully exploit the 3D anatomical context of large CT volumes.
%Image-domain only methods avoid sinogram dependence and can be applied retrospectively to reconstructed volumes. DiffMAR \cite{caiDiffMARGeneralizedDiffusion2024} learns an image-domain diffusion restoration process for CT MAR. Recently, conditional latent diffusion models (LDM) \cite{choiMetalArtifactReduction2025} have been used for dental CBCT MAR, however, the approach was limited to a 2D slice information. Existing 3D methods are often GAN-based, CBCT-specific, or require projection data \cite{choiDualDomainDiffusion2024}.
%Diffusion models have shown superior performance in image generation \cite{dhariwal2021diffusion} and restoration, but voxel-space diffusion is computationally expensive \cite{khaderDenoisingDiffusionProbabilistic2023}. LDMs \cite{rombachHighResolutionImageSynthesis2022} reduce the cost of diffusion by denoising compressed image representations instead of full-resolution images in voxel space. Implementations such as MAISI \cite{guo2025maisi} and MedLoRD \cite{seyfarthMedLoRDMedicalLowResource2025}, showed that high-dimensional 3D CT volumes up to \(512 \times 512 \times 768\) is possible with reasonable computational resources by combining VQ-VAE-GAN compression and 3D latent diffusion. 

% The contributions are:

% \begin{enumerate}
% \item A first large-volume 3D image-domain latent diffusion framework for CT metal artifact suppression using low-resource latent backbone.
% \item A synthetic paired artifact-generation pipeline with anatomy-aware implant insertion, full 3D projection simulation, and paired non-corrupted ground truth images.
% \item A focused comparison of image-conditioned and image-and-metadata-conditioned LDMs for 3D CT MAR.
% \item Extensive quantitative and qualitative evaluation on paired synthetic data and qualitative real-case testing.
% \end{enumerate}

\section{Methods}

\subsection{Data and Synthetic Artifact Generation}

The dataset contains 867 3D-CT volumes from five open-source datasets: CLINIC ($n=103$) and CLINIC-metal ($n=75$) from CTPelvic1K~\cite{liuDeepLearningSegmentPelvic2021}, KiTS19 ($n=299$)~\cite{hellerKiTS19ChallengeData2019}, and the liver ($n=200$) and colon ($n=190$) tasks of the Medical Segmentation Decathlon (MSD)~\cite{antonelliMedicalSegmentationDecathlon2022}. CLINIC, KiTS19, MSD liver, and MSD colon contain clean volumes for synthetic paired data generation; CLINIC-metal contains 75 real postoperative CTs with metal implants for qualitative validation only. Volumes were reoriented to RAS, resampled to 1~mm isotropic spacing, and stored as NIfTI.

Paired and spatially aligned clinical CT volumes acquired before and after implant placement are generally unavailable. We therefore generate supervised pairs synthetically. \texttt{TotalSegmentator}\footnote{\url{https://github.com/wasserth/totalsegmentator}} is used to obtain organ and bone masks for anatomy-aware implant placement \cite{wasserthalTotalSegmentatorRobust2023}. An implant library of 29 binary masks is built from high-intensity components extracted from real CLINIC-metal volumes using a 2500 HU threshold and from voxelized public STL meshes\footnote{\url{https://grabcad.com/library/tag/hip}}. The library contains hip implants ($n=11$), pelvic screws ($n=6$), plates ($n=3$), spine implants ($n=2$), and spine screws ($n=7$).

For each clean volume, a region-plausible implant is inserted. Titanium and stainless steel are simulated with 3000 HU and 4000 HU values, respectively. The implant-only ground truth is created by replacing voxels inside the metal mask while leaving acquisition artifacts absent. Artifacted images are synthesized using a 3D ASTRA~\cite{vanAarleFastFlexible2016} cone-beam projection and reconstruction workflow inspired by CNN-MAR \cite{zhangConvolutionalNeuralNetwork2018a}. Water, bone, and metal components are forward-projected, a polychromatic Beer-Lambert signal is simulated, Poisson noise is added, and the projection data are reconstructed. The metal-induced reconstruction difference is added to the clean CT inside the body mask. Each sample stores the artifacted CT, implant-only ground truth, binary metal mask, and JSON file with metadata. The supervised paired MAR set contains 668 training pairs, 74 validation pairs, and 50 held-out test pairs.

%\begin{figure}
%\centering
%\includegraphics[width=\textwidth]{figures/Metal_artifact_4.png}
%\caption{A schematic overview of synthetic paired MAR data generation. Clean 3D CT volumes are segmented, matched with anatomy-aware implant masks from the implant library, assigned material properties, projected and reconstructed with simulated metal effects, and saved as artifacted CT/implant-only target pairs with masks and metadata.}
%\label{fig:pipeline}
%\end{figure}

\subsection{Large-Volume Latent Diffusion}

The proposed framework follows the MedLoRD two-stage design \cite{seyfarthMedLoRDMedicalLowResource2025} (see Fig.~\ref{fig:model}). First, a VQ-VAE-GAN encodes CT volumes into a compact latent representation and decodes restored latents back to image space. Intensities are clipped to \([-1000,4000]\) HU and scaled to \([-1,1]\). The VQ-VAE-GAN is trained on \(128^3\) patches with random small-angle rotations and flips. Its encoder has two 3D convolutional downsampling levels with 128 and 256 channels, followed by residual layers. The spatial downsampling factor is 4, producing an \(8 \times 32 \times 32 \times 32\) latent for a \(128^3\) patch, and the codebook contains 16,384 embeddings of dimension 8.

Second, LDMs are trained in the VQ-VAE-GAN latent space. For a target latent \(z_0\), cosine-scheduled diffusion samples
\begin{equation}
z_t = \sqrt{\bar{\alpha}_t} z_0 + \sqrt{1-\bar{\alpha}_t}\epsilon ,
\end{equation}
where \(z_t\) is the noisy latent at diffusion step \(t\), \(\epsilon \sim \mathcal{N}(0,I)\), and \(\bar{\alpha}_t\) is the cumulative signal coefficient of the cosine noise schedule. The 3D U-Net predicts the velocity target
\begin{equation}
v_t = \sqrt{\bar{\alpha}_t}\epsilon - \sqrt{1-\bar{\alpha}_t}z_0,
\end{equation}
using a Huber loss \cite{meyerAlternativeProbabilistic2021},
\begin{equation}
\mathcal{L}_{LDM} = \mathrm{Huber}\left(v_{\theta}(z_t,c,t), v_t\right).
\end{equation}
The condition \(c\) denotes the artifacted-image latent and, for the metadata-conditioned model, the metadata cross-attention context. 

The denoiser uses feature channels 64, 128, 256, and 512 with attention in the two deepest levels. \(T=500\) steps were selected for the conditional models based on validation performance. Training and inference use crops of \(448 \times 448 \times 256\) voxels.

\begin{figure}
\centering
\includegraphics[width=\textwidth]{figures/Architecture_model.png}
\caption{Architecture of the proposed conditional large-volume 3D latent diffusion for image-domain MAR. The VQ-VAE-GAN encoder compresses CT volumes into latent space, where the conditional U-Net is trained to denoise target latents. In inference, the denoised latent is decoded back to the CT image space. Both conditioning variants use the artifacted CT image; the metadata-conditioned model additionally injects implant/anatomy information.}
\label{fig:model}
\end{figure}

\subsection{Conditioning Strategies}


The two proposed conditioning strategies (see Fig.~\ref{fig:model}) share the same VQ-VAE-GAN and 3D diffusion U-Net with a difference in the type of information provided to the denoiser.

\subsubsection{Anatomy conditioned LDM.}
The artifacted CT is encoded as \(z_{\mathrm{art}}\). During each denoising step, the U-Net receives the concatenation \([z_t,z_{\mathrm{art}}]\) and predicts \(v_t\), learning \(p(z_0 \mid z_{\mathrm{art}})\). In the implemented configuration, this gives a 16-channel LDM input and an 8-channel latent output.

\subsubsection{Anatomy-metadata conditioned LDM.}
The second model uses the same spatial condition and additionally encodes non-spatial metadata as \(e_{\mathrm{meta}}\): anatomical region \texttt{[unknown, spine, hip, knee, shoulder]}, side \texttt{[unknown, left, right, midline, bilateral]}, and metal name \texttt{[unknown, titanium, steel]}. Simulation parameters are excluded because they are constant across synthetic cases. Each categorical metadata value is mapped to a learned embedding and passed through a multi-layer perceptron to form the cross-attention context for the U-Net, learning \(p(z_0 \mid z_{\mathrm{art}}, e_{\mathrm{meta}})\).

\subsection{Evaluation and Computational Setup}

Artifact-suppressed test cases are evaluated against implant-only ground truths using MAE, RMSE, PSNR, SSIM and LPIPS \cite{wangImageQualityAssessment2004,zhangUnreasonableEffectivenessDeep2018,heuselGANsTrained2017}. MAE and RMSE are computed on normalized \([-1,1]\) volumes. PSNR uses dynamic range \(L=2\). SSIM and LPIPS are computed on central axial, coronal, and sagittal slices. 
%FID is computed over aggregate slice distributions and treated descriptively rather than as an independent per-case statistic.

%In addition to quantitative metrics, six reviewers with medical image processing expertise performed a blinded visual review. For each test case, reviewers inspected the artifacted input, the implant-only ground truth, and anonymized model outputs in axial, coronal, and sagittal views. Model order was randomized and decoded only after review. Each output was scored from 0 to 5 for three criteria: artifact suppression, anatomical preservation, and overall image quality. For artifact suppression, 0 means no visible artifact reduction and 5 means no visible residual metal artifact. For anatomical preservation, 0 means unreliable or strongly distorted anatomy and 5 means well-preserved anatomy. For overall image quality, 0 means the image is dominated by artifacts, blur, or noise and 5 means excellent image quality close to the target.

In addition to quantitative metrics, six reviewers with medical image processing expertise performed a blinded visual review. For each test case, reviewers inspected the artifacted input, the implant-only ground truth, and anonymized MAR model outputs in axial, coronal, and sagittal views. Conditioning strategies were randomized and decoded only after review. Each output was scored from 0 to 5 using the criteria in Table~\ref{tab:visual_review_scale}.

\begin{table}
\caption{Visual-review scoring scale for artifact suppression, anatomical preservation, and overall image quality.}
\label{tab:visual_review_scale}
\centering
\fontsize{8}{9}\selectfont
\begin{tabular}{@{}>{\centering\arraybackslash}p{0.07\textwidth}@{\hspace{0.8em}}p{0.25\textwidth}p{0.30\textwidth}p{0.27\textwidth}@{}}
\toprule
\textbf{Score} & \textbf{Artifact Reduction} & \textbf{Anatomy Preservation} & \textbf{Overall Quality} \\
\midrule
5 & No residual & Preserved & Target-like \\
4 & Small residual & Minor changes & Good \\
3 & Moderate residual & Noticeable changes & Acceptable \\
2 & Strong residual & Partial distortion/removal & Poor \\
1 & Very strong residual & Severe distortion & Very poor \\
0 & No reduction & Distorted/unreliable & Dominated by noise/blur \\
\bottomrule
\end{tabular}
\end{table}


%Experiments were run on the anonymized HPC cluster using GPU nodes, with Python, PyTorch, and MONAI \cite{cardosoMONAIOpenSourceFramework2022}. The VQ-VAE-GAN training lasted 90.73 h on one GPU job. The selected 500-step image-conditioned and image-and-metadata-conditioned LDMs each used one GPU and \(448 \times 448 \times 256\) crops with batch size 2. Training jobs were submitted with 64 GB of system RAM, and test inference jobs with 80 GB. Including resumed jobs after Slurm limits, recorded wall-clock training times were approximately 5 d 21 h 21 min and 5 d 4 h 4 min, respectively. Test-set inference for 50 cases lasted 3 h 1 min 46 s and 3 h 10 min 37 s, respectively.
% TODO: Replace the generic Ampere/Hopper GPU-family description with exact GPU model identifiers if cluster job logs with card names become available.

Experiments used Python, PyTorch, and MONAI \cite{cardosoMONAIOpenSourceFramework2022} on the anonymized HPC cluster. Training and inference used an NVIDIA H200 GPU (140 GB). The VQ-VAE-GAN was trained for 222 epochs with a batch size of 15, while each LDM was trained for 250 epochs with a batch size of 2. Test inference jobs averaged 3.64 min per subject for the anatomy-conditioned LDM and 3.81 min per subject for the anatomy-metadata conditioned LDM.

% All training jobs requested 64 GB of system RAM. Peak GPU memory allocation was 77.42\% for the VQ-VAE-GAN, 88.54\% for the image-conditioned LDM, and 94.09\% for the image-and-metadata-conditioned LDM, corresponding to approximately 61.94 GB, 70.83 GB, and 75.27 GB of VRAM on an 80 GB GPU, respectively. The VQ-VAE-GAN stopped at epoch 222, and both LDMs were trained for 250 epochs. Test inference jobs requested 80 GB of system RAM and averaged 3.64 min per subject for the image-conditioned LDM and 3.81 min per subject for the image-and-metadata-conditioned LDM.

\section{Results}

\subsubsection{Synthetic Paired Test Set.}
As a preliminary validation of the latent compression stage, the VQ-VAE-GAN reconstruction was evaluated on the test set across the original CT, synthetic artifacted CT, and implant-only ground truth images, achieving a low combined MAE of 0.0068 \(\pm\) 0.0020. The lowest error was obtained for original CT images (0.0060 \(\pm\) 0.0009), while the highest error was observed for artifacted images (0.0085 \(\pm\) 0.0024), supporting the use of the learned latent representation for downstream diffusion-based MAR.

Table~\ref{tab:quant} reports results on the 50 paired synthetic test cases and 300 reviewer-case ratings per model. Both conditional models improved RMSE, PSNR, SSIM and  LPIPS over the artifacted input and qualitatively suppressed metal artifacts (Fig.~\ref{fig:synthetic_results}). The anatomy-metadata conditioned model achieved the best RMSE, PSNR, SSIM and LPIPS. The anatomy conditioned model had slightly lower MAE among learned models, while the raw artifacted input had the lowest MAE overall. This is expected for a global voxel-wise metric because most voxels are unaffected by metal artifacts and remain identical to the target.

\begin{figure}
\centering
\includegraphics[width=0.92\textwidth]{figures/Figure_2.png}
%\caption{Successful paired synthetic MAR examples. Each row shows artifacted input, implant-only target, image-conditioned output, and image-and-metadata-conditioned output.}
\caption{Paired synthetic MAR examples, with columns showing artifacted input, implant-only target without artifacts, anatomy conditioned output, and anatomy-metadata conditioned output. Each row corresponds to a different simulated patient case.}
\label{fig:synthetic_results}
\end{figure}

\begin{table}
\centering
\caption{Final paired synthetic test results on 50 volumes. Quantitative metrics are median [bootstrap 95\% CI]. Visual-review scores are mean \(\pm\) standard deviation on a 0--5 scale; the raw artifacted input was shown to reviewers but not scored as a model output.}
\label{tab:quant}
\fontsize{8}{9}\selectfont
\setlength{\tabcolsep}{0.5pt}
\begin{tabular}{@{}llll@{}}
\toprule
Metric & Raw & Anatomy & Anatomy-metadata \\
\midrule
MAE \(\downarrow\) & \textbf{0.006[0.005,0.007]} & 0.008[0.008,0.009]\textsuperscript{*} & 0.009[0.008,0.009] \\
RMSE \(\downarrow\) & 0.031[0.028,0.033] & 0.023[0.021,0.024] & \textbf{0.021[0.021,0.023]}\textsuperscript{***} \\
PSNR \(\uparrow\) & 36.29[35.62,37.20] & 38.89[38.47,39.81] & \textbf{39.43[38.94,39.72]}\textsuperscript{***} \\
SSIM \(\uparrow\) & 0.981[0.974,0.985] & 0.991[0.988,0.992] & \textbf{0.992[0.991,0.993]}\textsuperscript{***} \\
LPIPS \(\downarrow\) & 0.062[0.040,0.081] & 0.037[0.029,0.043] & \textbf{0.028[0.025,0.034]}\textsuperscript{***} \\
%FID \(\downarrow\) & 87.394 & 52.733 & \textbf{45.504} \\
\midrule
Artifact suppression \(\uparrow\) & -- & \textbf{3.543 \(\pm\) 1.107}\textsuperscript{***} & 3.397 \(\pm\) 1.103 \\
Anatomical preservation \(\uparrow\) & -- & 3.477 \(\pm\) 0.966 & \textbf{3.633 \(\pm\) 0.988}\textsuperscript{***} \\
Overall quality \(\uparrow\) & -- & 3.387 \(\pm\) 1.007 & \textbf{3.433 \(\pm\) 0.994}\textsuperscript{n.s.} \\
\bottomrule
\end{tabular}
\par\vspace{2pt}
\parbox{\textwidth}{\fontsize{7}{8}\selectfont Superscripts report the direct comparison between the two LDMs: * \(p_{\mathrm{adj}}<0.05\), ** \(p_{\mathrm{adj}}<0.01\), *** \(p_{\mathrm{adj}}<0.001\), n.s. not significant after Bonferroni correction.}
\end{table}

Normality of paired differences was tested with the Shapiro--Wilk test. Paired \(t\)-tests for normal quantitative comparisons, Wilcoxon signed-rank tests for non-normal quantitative comparisons and visual-review scores, and Bonferroni correction within each metric were used. Both conditional models significantly improved over the raw artifacted input for RMSE, PSNR, SSIM, and LPIPS (\(p_{\mathrm{adj}}<0.001\)). In the direct comparison, the metadata-conditioned model was significantly better for RMSE, PSNR, SSIM, and LPIPS (\(p_{\mathrm{adj}}<0.001\)), while MAE favored the anatomy conditioned model (\(p_{\mathrm{adj}}=0.012\)). The visual review analysis showed a complementary trade-off: the anatomy conditioned model had slightly higher artifact suppression, whereas metadata conditioning improved anatomical preservation and slightly improved overall quality (see Fig.~\ref{fig:tradeoff_examples}).

\begin{figure}
\centering
\includegraphics[width=0.90\textwidth]{figures/Figure_3.png}\\[0.6em]
\caption{Additional examples of LDM MAR result images. \textbf{Top:} CT image where better anatomical preservation of anatomy-metadata conditioning is present (yellow arrows). \textbf{Bottom:} CT image with visible remaining artifacts more evident for anatomy-metadata conditioning method (green arrows: residual streak artifacts; orange arrows: residual shading/cupping artifacts).}
\label{fig:tradeoff_examples}
\end{figure}

\subsubsection{Real Metal Case Evaluation.}

The 75 CLINIC-metal volumes contain real postoperative implants but no paired artifact-free targets, so they were used for qualitative evaluation only. We applied the selected metadata conditioned model to representative real metal cases (Fig.~\ref{fig:real_results}). The model reduces visible streaking and shading around implants, with residual artifacts near high-density metal, supporting qualitative transfer from synthetic training to clinical artifacts.

\begin{figure}
\centering
\includegraphics[width=0.92\textwidth]{figures/Figure_4.png}
\caption{Qualitative inference on three different real CLINIC-metal cases (columns). Each column compares a real artifacted CT slice (top) with the metadata-conditioned LDM output (bottom).}
\label{fig:real_results}
\end{figure}

\section{Discussion}

%MedLoRD makes large-volume 3D image-domain diffusion feasible by denoising compact latents rather than voxels, enabling \(448 \times 448 \times 256\) CT crops with volumetric context. This matters for MIC and CAI workflows, where inconsistent artifact correction can affect segmentation, registration, implant assessment, and image-guided planning.

%The two conditional models expose a useful design trade-off. Image conditioning gives strong patient-specific guidance and tends to suppress artifacts more aggressively. Metadata improves structural and perceptual metrics and anatomical preservation, likely because region, side, and material provide coarse artifact information. Future MAR systems may benefit from explicit anatomical priors, for example segmentation-guided ControlNet conditioning similar to MedLoRD.

%Evaluation remains the main limitation: quantitative testing uses synthetic paired data because real paired artifact-free targets are unavailable, and real cases are visual only. The synthetic pipeline includes 3D geometry, anatomy-aware placement, polychromatic projection, Poisson noise, and reconstruction, but cannot cover all scanner protocols, implant alloys, and anatomies. Direct SOTA comparison is future work because many methods require unavailable sinograms, dual-domain inputs, CBCT geometry, or 2D settings. The contribution is feasibility and design, not SOTA metric superiority.

%This work demonstrates feasible large-volume 3D image-domain LDM-based CT MAR on paired synthetic data, with qualitative transfer to real postoperative cases. Future work should add MAR baselines, artifact-local metrics, automatic metadata inference, and broader clinical validation.

We propose the first image-domain 3D LDM framework for CT metal artifact reduction. The MAR results show that conditional latent diffusion can successfully reduce metal artifacts while preserving important surrounding anatomy (see Fig.~\ref{fig:synthetic_results}). Both conditional models improved quantitatively compared with the raw artifacted input. MAE was the exception, with the raw input obtaining the lowest value, since the diffusion model is introducing small intensity changes outside the main artifact region.

The anatomy-metadata conditioned model achieved the best quantitative performance, supporting the idea that metadata provides useful context beyond the image anatomy itself. However, the visual review showed a trade-off between the two conditioning strategies. The anatomy conditioned model suppresses artifacts more strongly, whereas adding metadata improved anatomical preservation (see Fig.~\ref{fig:tradeoff_examples}). The anatomy conditioned model removes metal artifacts more aggressively but may alter fine anatomical details, while the metadata-conditioned model leaves some residual streaking but better maintains local anatomy. The anatomy-metadata conditioned model was selected for real-case inference as it gave stronger overall quantitative results and better anatomical preservation.

Some residual artifacts persist, especially near high-density metal, and the proposed method can alter the implant geometry, which may complicate image interpretation. Future work can explore post-processing metal-mask constraints or anatomy-aware (metal segmentation mask) conditioning to distinguish between artifact patterns that should be removed and anatomical/implant structures that should be preserved. MAR-specific metrics, such as artifact-region or metal-adjacent ROI measures, should also be considered. To assess whether an LDM trained solely on synthetic artifacted data can generalize to real postoperative CT acquisitions, we evaluated the anatomy-metadata conditioned model qualitatively on CT volumes with real metal artifacts (see Fig.~\ref{fig:real_results}). The results show that the model can suppress visible artifacts in real postoperative CT images despite being trained only on synthetic paired data. 
Overall, this work shows the potential of image-domain 3D conditional latent diffusion for CT metal artifact reduction. By improving the readability of CT images affected by metallic implants, this approach may support future diagnostic assessment, radiological planning, and computer-assisted workflows. 

%Future work should also include MAR-specific metrics, such as artifact-region ROI measures. The training data are also synthetic and include a limited set of implants, materials, and scanner conditions. This may limit generalization to unseen clinical implants, so external validation on broader real datasets is needed. Nevertheless, the qualitative real-case results in Fig.~\ref{fig:real_results} show that the model can reduce visible artifacts in clinical CT volumes despite being trained on synthetic paired data

\begin{credits}
\subsubsection{\ackname} This work was funded by VLAIO project HBC.2024.0696, AI-NIMO. Computational resources were provided by the Vlaams Supercomputer Centrum (VSC).

\subsubsection{\discintname} The authors have no competing interests to declare that are relevant to the content of this article.
\end{credits}

\bibliographystyle{splncs04_ordered}
\bibliography{references}

\end{document}
