%%
%% This is file `sample-sigconf.tex',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% samples.dtx  (with options: `all,proceedings,bibtex,sigconf')
%% 
%% IMPORTANT NOTICE:
%% 
%% For the copyright see the source file.
%% 
%% Any modified versions of this file must be renamed
%% with new filenames distinct from sample-sigconf.tex.
%% 
%% For distribution of the original source see the terms
%% for copying and modification in the file samples.dtx.
%% 
%% This generated file may be distributed as long as the
%% original source files, as listed above, are part of the
%% same distribution. (The sources need not necessarily be
%% in the same archive or directory.)
%%
%%
%% Commands for TeXCount
%TC:macro \cite [option:text,text]
%TC:macro \citep [option:text,text]
%TC:macro \citet [option:text,text]
%TC:envir table 0 1
%TC:envir table* 0 1
%TC:envir tabular [ignore] word
%TC:envir displaymath 0 word
%TC:envir math 0 word
%TC:envir comment 0 0
%%
%%
%% The first command in your LaTeX source must be the \documentclass
%% command.
%%
%% For submission and review of your manuscript please change the
%% command to \documentclass[manuscript, screen, review]{acmart}.
%%
%% When submitting camera ready or to TAPS, please change the command
%% to \documentclass[sigconf]{acmart} or whichever template is required
%% for your publication.
%%
%%
\documentclass[sigconf]{acmart}
\usepackage{graphicx}
\usepackage{titlesec}


%%
%% \BibTeX command to typeset BibTeX logo in the docs
\AtBeginDocument{%
  \providecommand\BibTeX{{%
    Bib\TeX}}}

%% Rights management information.  This information is sent to you
%% when you complete the rights form.  These commands have SAMPLE
%% values in them; it is your responsibility as an author to replace
%% the commands and values with those provided to you when you
%% complete the rights form.
\setcopyright{acmlicensed}
\copyrightyear{2024}
\acmYear{2024}
\acmConference[MM '24]{Proceedings of the 32nd ACM International Conference on Multimedia}{October 28--November 1, 2024}{Melbourne, VIC, Australia}
\acmBooktitle{Proceedings of the 32nd ACM International Conference on Multimedia (MM '24), October 28--November 1, 2024, Melbourne, VIC, Australia}
\acmDOI{10.1145/3664647.3681700}
\acmISBN{979-8-4007-0686-8/24/10}

%% These commands are for a PROCEEDINGS abstract or paper.
%%\acmConference[MM'24]{Make sure to enter the correct
%  conference title from your rights confirmation email}{October 28--November 01,
%  2024}{Melbourne, VIC, Australia}
%%
%%  Uncomment \acmBooktitle if the title of the proceedings is different
%%  from ``Proceedings of ...''!
%%
%%\acmBooktitle{Woodstock '18: ACM Symposium on Neural Gaze Detection,
%%  June 03--05, 2018, Woodstock, NY}
%\acmISBN{978-1-4503-XXXX-X/18/06}


%%
%% Submission ID.
%% Use this when submitting an article to a sponsored event. You'll
%% receive a unique submission ID from the organizers
%% of the event, and this ID should be used as the parameter to this command.
%%\acmSubmissionID{123-A56-BU3}

%%
%% For managing citations, it is recommended to use bibliography
%% files in BibTeX format.
%%
%% You can then either use BibTeX with the ACM-Reference-Format style,
%% or BibLaTeX with the acmnumeric or acmauthoryear styles, that include
%% support for advanced citation of software artefact from the
%% biblatex-software package, also separately available on CTAN.
%%
%% Look at the sample-*-biblatex.tex files for templates showcasing
%% the biblatex styles.
%%

%%
%% The majority of ACM publications use numbered citations and
%% references.  The command \citestyle{authoryear} switches to the
%% "author year" style.
%%
%% If you are preparing content for an event
%% sponsored by ACM SIGGRAPH, you must use the "author year" style of
%% citations and references.
%% Uncommenting
%% the next command will enable that style.
%%\citestyle{acmauthoryear}

%%
%% end of the preamble, start of the body of the document source.
\begin{document}
%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{Interpretable Matching of Optical-SAR Image via Dynamically Conditioned Diffusion Models}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
%% Of note is the shared affiliation of the first two authors, and the
%% "authornote" and "authornotemark" commands
%% used to denote shared contribution to the research.

\author{Shuiping Gou}
\email{shpgou@mail.xidian.edu.cn}
\affiliation{%
	\institution{Xidian University}
  \city{Xi'an}
\country{China}
}


\author{Xin Wang}
\email{23171214441@stu.xidian.edu.cn}
\affiliation{%
	\institution{Xidian University}
	\city{Xi'an}
	\country{China}
}

\author{Xinlin Wang}
\authornote{Corresponding author.}
\email{wangxinlin@xidian.edu.cn}
%%\orcid{1234-5678-9012}
\affiliation{%
  \institution{Xidian University}
  \city{Xi'an}
  \country{China}
}

\author{Yunzhi Chen}
\email{2016010026@hzvtc.edu.cn}
\affiliation{%
	\institution{Hangzhou Vocational and Technical College}
	\city{Hangzhou}
	\country{China}
}



%%
%% By default, the full list of authors will be used in the page
%% headers. Often, this list is too long, and will overlap
%% other information printed in the page headers. This command allows
%% the author to define a more concise list
%% of authors' names for this purpose.
\renewcommand{\shortauthors}{Shuiping Gou, Xin Wang, Xinlin Wang, and Yunzhi Chen}

%%
%% The abstract is a short summary of the work to be presented in the
%% article.
\begin{abstract}
  Driven by the complementary information fusion of optical and synthetic aperture radar (SAR) images, the optical-SAR image matching has drawn much attention. However, the significant radiometric differences between them impose great challenges on accurate matching. Most existing approaches convert SAR and optical images into a shared feature space to perform the matching, but these methods often fail to achieve robust matching since the feature spaces are unknown and uninterpretable. Motivated by the interpretable latent space of diffusion models, this paper formulates an optical-SAR image translation and matching framework via a dynamically conditioned diffusion model (DCDM) to achieve the interpretable and robust optical-SAR cross-modal image matching. Specifically, in the denoising process, to filter out outlier matching regions, a gated dynamic sparse cross-attention module is proposed to facilitate efficient and effective long-range interactions of multi-grained features between the cross-modal data. In addition, a spatial position consistency constraint is designed to promote the cross-attention features to perceive the spatial corresponding relation in different modalities, improving the matching precision. Experimental results demonstrate that the proposed method outperforms state-of-the-art methods in terms of both the matching accuracy and the interpretability.
\end{abstract}

%%
%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
%% Please copy and paste the code instead of the example below.
%%
\begin{CCSXML}
	<ccs2012>
	<concept>
	<concept_id>10010147.10010178</concept_id>
	<concept_desc>Computing methodologies~Artificial intelligence</concept_desc>
	<concept_significance>500</concept_significance>
	</concept>
	<concept>
	<concept_id>10010147.10010178.10010224</concept_id>
	<concept_desc>Computing methodologies~Computer vision</concept_desc>
	<concept_significance>500</concept_significance>
	</concept>
	<concept>
	<concept_id>10010147.10010178.10010224.10010245</concept_id>
	<concept_desc>Computing methodologies~Computer vision problems</concept_desc>
	<concept_significance>500</concept_significance>
	</concept>
	<concept>
	<concept_id>10010147.10010178.10010224.10010245.10010255</concept_id>
	<concept_desc>Computing methodologies~Matching</concept_desc>
	<concept_significance>500</concept_significance>
	</concept>
	</ccs2012>
\end{CCSXML}

\ccsdesc[500]{Computing methodologies~Artificial intelligence}
\ccsdesc[500]{Computing methodologies~Computer vision}
\ccsdesc[500]{Computing methodologies~Computer vision problems}
\ccsdesc[500]{Computing methodologies~Matching}

%%
%% Keywords. The author(s) should pick words that accurately describe
%% the work being presented. Separate the keywords with commas.
\keywords{Image matching, Synthetic aperture radar images, Diffusion probabilistic model}
%% A "teaser" image appears between the author and affiliation
%% information and the body of the document, and typically spans the
%% page.
%\begin{teaserfigure}
%  \includegraphics[width=\textwidth]{sampleteaser}
%  \caption{Seattle Mariners at Spring Training, 2010.}
%  \Description{Enjoying the baseball game from the third-base
%  seats. Ichiro Suzuki preparing to bat.}
%  \label{fig:teaser}
%\end{teaserfigure}

%\received{20 February 2007}
%\received[revised]{12 March 2009}
%\received[accepted]{5 June 2009}

%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.
\maketitle

\section{Introduction}
With the rapid development of remote sensing technology, different image sensors are constantly emerging and provide rich image data for the earth observation.
Owing to the complementary property between different modality images, the multi-source information integration is widely explored. In particular, the synthetic aperture radar (SAR) and optical sensing images have been increasingly explored and applied to mapping~\cite{PearlRiver}, object detection~\cite{wan2019post}, etc. Therefore, effectively integrating and exploiting optical and SAR images is the focus, in which the optical-SAR image matching is the core issue. However, due to their different imaging mechanisms, there exist remarkable geometric differences and nonlinear radiometric variations between optical and SAR images. As a result, the optical and SAR image matching still remains challenging. 

To solve the task, researchers devote themselves to the challenge and propose various algorithms. In the early stages, traditional image matching methods are widely used, including region-based and feature-based methods. Region-based methods use similarity metrics, such as normalized mutual information (NMI) \cite{MI} and normalized cross-correlation (NCC) \cite{NCC}, to find the correspondence between the template and the reference images. This kind of approach only leverages the global pixel intensity information within a window to calculate the similarity of the corresponding region, which is sensitive to the image intensity differences and the noise. To this end, feature-based image matching methods are developed, which aim to extract keypoints of each image and find matching points. The representative algorithm is the scale-invariant feature transform (SIFT) \cite{sift}, which detects keypoints in different scales by utilizing its description in terms of the scale, the gradient magnitude, and direction. Based on the SIFT algorithm, some variants, such as BFSIFT \cite{wang2011bfsift}, AAGSIFT \cite{wang2014aagsift}, and RIFT \cite{rift}, have been derived. In addition, to overcome the nonlinear radiometric differences in SAR and optical images, structural similarity-based descriptors have been proposed, such as histogram of oriented phase coherence (HOPC) \cite{hopc}, and channel feature of oriented gradients (CFOG) \cite{cfog}. However, the potential of hand-designed features for improving the matching performance is very limited considering the remarkable differences between SAR and optical images.

Thanks to the considerable achievements made by the convolutional neural networks (CNNs) in computer vision tasks, and the launch of the publicly available remote sensing data \cite{spacenet,osdataset,sen12}, deep-learning-based cross-modal image matching approaches have constantly emerged~\cite{matchnet, merkle2017exploiting, acnn, SCmatch, explorebeter}.
Existing deep-learning-based optical-SAR image matching methods include two kinds: the Siamese network-based feature mapping, and the generative adversarial network (GAN)-based cross-modality translation.
The Siamese network-based methods \cite{merkle2017exploiting, acnn, SCmatch, explorebeter} exploit the effective feature extraction ability of CNN to map the multi-modal images into the common feature spaces, and compute similarity metrics in the mapped feature spaces.
However, the mapped feature spaces cannot be visualized and lack interpretability. Furthermore, these methods fail to extract useful features for the matching when there are more textureless regions in the images.
In contrast, the GAN-based approaches \cite{multiscaleGAN, nie2022dual} attempt to translate the SAR or optical image from one modality to the other by the adversarial learning between generators and discriminators.
But, GAN trains the model through the mutual game between the generator and discriminator, which easily causes the model to fall into a local minimum, generating unstable translations on the cross-modal images with large differences. Moreover, GAN-based matching methods are not end-to-end. 

Recently, the diffusion model \cite{ddpm} has received considerable attention on generative models, which has a diffusion process to gradually add Gaussian noise to the data and a denoising process to learn to remove it. 
Driven by its stable training manner and high-quality generation results, the diffusion models have been explored for optical-SAR image translation. Bai et al. \cite{condsar} utilize a conditional diffusion model to efficiently translate SAR images into optical images, and Shi et al. \cite{shi2024brain} propose self-attention and long skip connections in denoising networks to enhance feature extraction, which demonstrates the potential of diffusion models in translating SAR images. 
However, existing diffusion model-based optical-SAR translation methods only focus on generating cross-modal images for enhancing human visual perception, and have not yet explored the downstream cross-modal image matching task. In fact, using the generated image for matching requires elaborate design of matching methods, since there still exist different attributes between the generated remote sensing images and the real images, which deteriorates the matching performance and the speed.

To address the above problem, this paper formulates the optical-SAR cross-modal image matching as a dynamically conditioned diffusion model (DCDM), which aims to learn the posterior distribution of regions with dense correspondences.
Specifically, the optical template and SAR search image pairs are taken as conditions to respectively provide the content for the better generation and the texture details for the accurate matching.
Moreover, a gated dynamic sparse cross-attention (GDSC) module is designed to dynamically inject reliable conditional information into the generative network and accelerate denoising process. 
On this basis, to enhance the perception of matching positions, we introduce the spatial position consistency constraint. 
In the matching, to reduce the computational effort, the latent features of the generated SAR are directly matched with features of the search SAR, instead of decoding them into images and then matching them.
The contributions of this paper are summarized as follows:

\begin{itemize}
	\item We propose an end-to-end cross-modal image matching framework, dynamically conditioned diffusion model (DCDM). 
	It not only translates cross-modal images, but also completes the pixel-level matching in the latent space. 
	
	\item A gated dynamic sparse cross-attention module is presented to perform the controlled and efficient cross-interaction between the template and the search, aiming to filter out the outlier matching regions while improving the computational efficiency.
	
	\item A spatial position consistency constraint is designed to enhance the detail perception of the cross-attention to generate more accurate cross-modal features for matching. Experimental results on two matching datasets quantitatively and qualitatively demonstrate the effectiveness and the interpretability of the proposed approach.
\end{itemize}


\section{RELATED WORKS}
%As noted in the introduction, the ``\verb|acmart|'' document class can
%be used to prepare many different kinds of documentation --- a
%double-anonymous initial submission of a full-length technical paper, a
%two-page SIGGRAPH Emerging Technologies abstract, a ``camera-ready''
%journal article, a SIGCHI Extended Abstract, and more --- all by
%selecting the appropriate {\itshape template style} and {\itshape
	%  template parameters}.
%
%This document will explain the major features of the document
%class. For further information, the {\itshape \LaTeX\ User's Guide} is
%available from
%\url{https://www.acm.org/publications/proceedings-template}.
\titlespacing*{\subsection}{0pt}{3ex}{1ex}
\subsection{Learning-based multimodal remote sensing image matching}
Learning-based multimodal image matching models often leverage intensive interactions between two modalities to capture effective matching features. % data from both modalities.
The studies \cite{gao2024learning, tracking} apply the cross attention to perform long-range interactions of cross-modal features, thus capturing features suitable for matching. Other methods \cite{SCmatch, VSmatch} slide the template features on the reference features pixel by pixel to calculate the similarity heatmap, which is computationally intensive, especially for large images. Fang et al. \cite{fftUnet} leverage U-Net \cite{unet} to extract high-resolution features, and use the Fast Fourier Transform (FFT) to implement the NCC similarity metric in the frequency domain to accelerate matching, an approach that has been widely used.
Mu et al. \cite{SCmatch} propose a two-stage feature extraction network to achieve precise localization from coarse to fine, which designs a suppression network and a triple loss to suppress false matching positions.
Zhang et al. \cite{explorebeter} propose to fuse low-level fine-grained localization features with high-level semantic features to enhance feature discrimination.
Michele et al. \cite{MARUnet} extract features from full-size and half-size images, and then fuse these features to construct pixel-level features for matching.


Despite the advances in matching accuracy, these methods still face challenges of the uninterpretable feature spaces, the indiscriminate matching for textureless regions, and the large amounts of computation requirements.
%Moreover, they often fail to match the plain area with less texture. 
In contrast, we propose the gated dynamic sparse cross-attention under the latent diffusion paradigm to efficiently extract cross-modal features with consistent representation, achieving interpretable and robust matching.


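\begin{figure*}[t]
	\centering
	% 0.68\textwidth equals the former \scalebox{0.8} applied to width=0.85\textwidth.
	\includegraphics[width=0.68\textwidth]{./Figures/Fig_2.pdf}
	\caption{The pipeline of the proposed dynamically conditioned diffusion model, which aims to generate and match the SAR template in the latent space conditioned on the optical template and the search SAR images.}
	\Description{Pipeline diagram of the proposed dynamically conditioned diffusion model, in which the SAR template is generated and matched in the latent space, conditioned on the optical template and the search SAR image.}
	\label{fig2}
\end{figure*}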
\subsection{Denoising diffusion model}

The emergence of the denoising diffusion probabilistic model (DDPM) \cite{ddpm} has led to the widespread use of diffusion models in computer vision \cite{ddpmvison1}, natural language processing \cite{ddpmlanguage}, interdisciplinary applications \cite{interdisciolinary1}, and audio processing \cite{audio1}, and these models outperform the current GAN-based generative models in image synthesis \cite{nichol2021improved}. DDPM is a parametric Markov chain that incrementally adds noise to the data during forward diffusion until the original signal is completely corrupted, and then reconstructs the signal during reverse diffusion. The denoising diffusion implicit model (DDIM) \cite{ddim} evolved from DDPM and introduces a non-Markovian diffusion process. This innovation reduces the number of steps required in the inference process. Furthermore, to train denoising diffusion models on limited computational resources while maintaining their quality and flexibility, the latent diffusion model (LDM) \cite{ldm} applies DDPM to the latent space extracted by the powerful pre-trained autoencoders. Compared to other diffusion models, LDM significantly reduces computational requirements, and achieves efficient cross-modal generation.
Therefore, our approach adopts LDM as the base framework to develop the optical-SAR matching algorithm.  
%Our approach is developed based on the LDM, but differs in two aspects. Firstly, to achieve fast and promising matching results under a low computation complexity, the encoder is only used when matching. Secondly, a dynamic sparse gated attention is designed to effectively inject conditional information into the denoising network to further accelerate matching speed.


\subsection{Sparse Attention}
Over the past few years, transformers have become highly popular in the computer vision community~\cite{endtoend, 1616words}.
Contrary to the convolution operation that extracts local features, transformers exploit self-attention mechanisms to capture long-distance dependencies, and have global receptive fields~\cite{attention}. However, such a property comes at the cost of having a high computational complexity and a large memory footprint. To mitigate this problem, the sparse attention \cite{aparsetransformer} has been proposed, in which each query focuses on only a small number of key-value pairs instead of all key-value pairs.
Several hand-crafted sparse patterns have been proposed, such as limiting attention in localized windows \cite{swin}, expanding the windows \cite{crossformer}, etc.
Recently, a novel dynamic sparse attention method, BiFormer~\cite{biformer}, has been proposed, whose two-layer routing architecture performs the dynamic computational allocation through information awareness, effectively reducing the computational complexity. %In a nutshell, all of the above approaches focus on reducing the number of key values while neglecting to sparse the query.

%The above methods focus on reducing the number of key values while ignoring the effect of the query in the self-attention, whereas for the cross-modal remote sensing image matching issue, the cross attention is required. Thus, the sparsity of queries has a significant effect on computational complexity in the matching task.
%All of the above approaches focus on reducing the number of key values while neglecting to sparse the query.

%All of the above approaches focus on designing sparse self-attention by only reducing the number of key-value tokens. % while neglecting to sparse the query. 
%In the cross-modal remote sensing image matching task, the cross-attention is generally used.
%%The sparsity of remote sensing images makes query sparsity particularly important.
%%where the template is treated as queries, and the search image is key-value pairs.
%Hence, this paper presents a gated dynamic sparse cross-attention by dynamically selecting both effective queries and keys for efficient computation and matching.

% the sparsity of the query has a significant impact on the computational complexity in the matching task, and more importantly, \hl{not all of the queries that are in a localized region in the remote sensing task are qualified} not all queries are useful for matching.  
%Different from the self-attention that the key, value, and query are from the same input,  



All of the above approaches focus on designing sparse self-attention by reducing the number of key-value tokens.
In fact, for the cross-attention in the remote sensing image matching task, filtering queries related to the template is also important, since invalid queries introduce interference.
Hence, this paper presents a gated dynamic sparse cross-attention by dynamically selecting both effective queries and keys for efficient computation and matching.


%\section{PRELIMINARIES}
%Most diffusion models are based on the framework of DDPMs \cite{ddpm}, which consist of a forward diffusion process and a reverse generation process, both of which are modelled as Markov chains. During forward diffusion, Gaussian noise is slowly added to the original data ${x}_{0}$ according to a fixed variance schedule $\left\{\beta_{1}, \beta_{2} \ldots \beta_{T}\right\}$ in $T$ steps. Finally, a sequence of noisy samples \{${x}_{0}, ..., {x}_{t}, ..., {x}_{T}$\} is produced, where $x_T \sim N(0, \mathbf{I})$. Each diffusion step is formulated as: 
%\begin{equation}
%	q\left(x_t \mid x_{t-1}\right):=N\left(x_t ; \sqrt{1-\beta_t} x_{t-1}, \beta_t \mathbf{I}\right)
%\end{equation}
%
%By setting the $\alpha_t=1-\beta_t$ and the $\overline{\alpha_t}=\Pi_{t=1}^T \alpha_i$, $x_T$ can be obtained at any moment $t$ by the following equation:
%\begin{equation}
%	q\left(x_t \mid x_0\right):=N\left(x_t ; \sqrt{\bar{\alpha}_t} x_0,\left(1-\bar{\alpha}_t\right) \mathbf{I}\right)
%\end{equation}
%
%During the reverse generation process $p\left(x_{t-1} \mid x_t\right)$, the noise is gradually removed and the original data is reconstructed. To solve the problem, the DDPM learns the parametric Gaussian transform $p_\theta\left(x_{t-1} \mid x_t\right)$. Essentially, it predicts the mean of the Gaussian distribution $\mu_\theta\left(x_t, t\right)$. The reverse process is expressed as follows:
%\begin{equation}
%	p_\theta\left(x_{t-1} \mid x_t\right)=N\left(x_{t-1} ; \mu_\theta\left(x_t, t\right), \sigma_t^2 \mathbf{I}\right)
%\end{equation}
%
%During training, the denoising neural network $f_\theta\left(x_t, t\right)$ is trained to minimize the training objective via L2 loss, that is, predicting $x_0$ from the $x_t$ \cite{ddpm} :
%\begin{equation}
%	L=\left\|f_\theta\left(x_t, t\right)-x_0\right\|^2
%\end{equation}
%In the inference phase, data samples $x_0$ are reconstructed from the noise $x_T$ in an iterative manner using the model $f_\theta$ and updating rules \cite{ddpm}.


\section{THE PROPOSED METHOD}
%In the optical-SAR image matching task, we treat the optical images as templates, and the SAR images as search images.
To achieve the cross-modal optical-SAR image matching, we formulate a dynamically conditioned diffusion model to translate the optical templates into SAR templates, and perform the matching. The pipeline of the proposed method is illustrated in Figure \ref{fig2}.
Firstly, we continuously add Gaussian noise to the ground-truth SAR template in each diffusion step, and then generate the SAR template via training a U-Net-based denoising network. 
To generate more realistic SAR images, the corresponding optical template is adopted as a condition to provide the scene content.
Simultaneously, a task-oriented condition, the search SAR image, is introduced to provide the texture details for the accurate matching.
Afterwards, we apply the gated dynamic sparse cross-attention module and the spatial position consistency constraint to achieve the effective and efficient cross-modal feature interaction and aggregation.
Finally, FFT-based NCC \cite{fftUnet} is adopted to perform the matching between the generated SAR template and the search SAR in the latent space.


%To achieve the efficient and effective cross-interaction between the template and the search data, we design a gated dynamic sparse cross-attention (GDSC).
%Moreover, a spatial position consistency constraint is proposed to promote the cross-attention map to focus on the accurate matching position.


%%%% too long
%The pipeline of the proposed method is illustrated in Figure \ref{fig2}.
%In the optical-SAR image matching task, the optical images are treated as templates, and the SAR images are search images.
%To achieve the cross-modal optical-SAR image matching, we intend to translate the optical images into SAR images, and perform the matching using the generated SAR and the search SAR images.
%Considering that the optical images contains rich content information, and the search SAR images provides \hl{(scattering characteristics) }matching position information, this paper takes the two images as conditions to guide the generation and matching in the latent space. 
%It is composed of three stages. 
%\textbf{\textit{The diffusion process: }} The ground-truth template SAR image is encoded as feature embeddings $x_0$, and is continuously added noise in each diffusion step $t$.
%\textbf{\textit{The denoising process: }} Train the U-Net-based denoising network to predict the added noise between $x_t$ and $x_{t-1}$ given the conditional embeddings. 
%Specifically, we firstly concatenate the noising ground-truth SAR template embedding $x_t$ with the optical template embedding \hl{$x_t$} in channel dimension as the template embeddings.
%Then, input the template embeddings into U-Net. Simultaneously, we input the search SAR embeddings into U-net, and design a gated dynamic sparse cross-attention (GDSC) to achieve the efficient cross-interaction between the template embeddings and the search SAR embeddings. 
%The cross-attention is between template and search images. Hence, the cross-attention map should also learn the accurate matching position.
%To this end, a spatial position consistency constraint is proposed to promote the cross-attention map to focus on the accurate matching position.
%\textbf{\textit{The matching process: }} In the denoising process of latent space, the model has captured effective features of generated template SAR images and the SAR search images. Therefore, the features are directly used for the similarity calculation and the matching, instead of generating SAR template images. It avoids performing the matching in pixel space requiring large computation resources. In the paper, FFT-based NCC \cite{fftUnet} ia adopted to calculate the feature similarity.
%%%%%%%%%%%%%%%%%%%%%%%%%%%% end

%Moreover, to efficiently and effectively inject conditional information into the generative network, a gated dynamic sparse cross-attention (GDSC) module is designed. 
%spatial position consistency constraint structures to accelerate denoising process and extract matching features. in the latent space and use the images of the two modalities to be matched as additional conditions, where the template image is used as the generation prior and the search image is used as the matching prior. The generated latent features are used directly for alignment during inference without the use of a decoder. In order to reduce the amount of computation and to filter texture-free regions as well as non-shared content, we introduce sparse attentional attention modules with gating. To introduce a fine-grained matching prior and to combine the advantages of features with different granularities, cross attention is used in the denoising network on a granularity-by-granularity basis. Cross-granularity use of the cross-attention graph as a positional guide for similarity computation instead of direct cross-granularity fusion, as well as the use of consistent attention graph constraints to obtain accurate spatial positional correspondences.


\subsection{SAROPT-conditioned Latent Diffusion}

%Assuming that the optical images are template images, and the SAR images are search images, 
The optical-SAR cross-modal image matching task aims to find the corresponding spatial position of the optical image in the SAR image. To reduce the influence of modality differences, this paper proposes a dynamically conditioned diffusion model (DCDM) to formulate the generative process to translate optical images.
%Motivated by the LDM \cite{ldm} generative method, we propose a DCDM approach, which aims to generate SAR images by formulates the generative process in the latent space with conditions. 
However, using noise only to generate SAR images is intractable, since remote sensing images contain rich targets. 
Fortunately, optical images contain object information, which can provide the content information. 
Thus, the optical template $T_{O}$ is treated as the condition to guide the generation of the real SAR template.
%to generate SAR images given optical and SAR search images as an a prior guided diffusion model, and its basic architecture is referenced in \cite{ldm}, a U-net with a cross-attention layer for denoising. 
%Our core idea is to use multimodal remote sensing images to guide the model generation process.
%as most of the image-conditioned diffusion models,  
%In addition, SAR images are imaged based on the object scattering characteristics, which has large appearance and intensity differences with optical images. 
%The search SAR images contains scattering characteristics of SAR, which could supplement the texture and the intensity information of different objects. Simultaneously, the search SAR can be used for matching with the generated template SAR in the latent spaces, avoiding extracting features again. 
%Based on this, we further introduce the search SAR image as the condition. Notable, to speed up the perceptual compression, the search SAR images are splitted into smaller patches $S_{S}$ as input, and restored afterwards.
In addition, texture details are important for the matching. 
%SAR images are imaged based on the object scattering characteristics, which has large appearance and intensity differences with optical images. 
Considering that the generated SAR template is matched with the search SAR, we further take the search SAR image as another condition to supplement the texture details. 
%The search SAR images contains scattering characteristics of SAR, which could supplement the texture and the intensity information of different objects.}
%Based on this, we further introduce the search SAR image as another condition. 
%Simultaneously, the search SAR can be used for matching with the generated template SAR, avoiding extracting features again.
%The introduction of the search SAR simultaneously enables the cross-modal image matching to be end-to-end, rather than generating the template image and re-inputting them into another matching network. 

%However, the search SAR image has larger size, injecting its information requires high computation resources, and increases denoising time. 
%To better facilitate efficient and effective long-range interactions of multi-grained features between the cross-modal data.
%To this end, we design a gated dynamic sparse cross-attention module to accelerate the matching. Existing diffusion models use cross-attention to inject conditional information, which tends to be textual encodings with less information. Unlike those models, this paper injects downstream task-related condition encodings,  that change with the application scenario, our

%However, denoising with two conditions in pixel space is time-consuming and resource-shortcoming. 
%Inspired by the LDM \cite{ldm}, we utilize an auto-encoder \cite{vqgan} to learn a latent space of the perceptual compression of SAR images, and perform diffusion process in latent space to reduce the computational complexity.
%%It greatly reduces the computational complexity, which is compatible with the expectation that matching is performed quickly. 
%Specifically, the encoder is respectively applied to the real SAR template image $T_{S}$, the corresponding optical template image $T_{O}$, and the search SAR image patches $S_{S}$ to obtain its emdeddings $x_0$, $c_{opt}$, and $c_{sar}$.

However, denoising in pixel space is time-consuming and resource-intensive. 
Inspired by the LDM \cite{ldm}, we utilize an auto-encoder \cite{vqgan} to learn a latent space of the perceptual compression of SAR images, and perform diffusion process in latent space to reduce the computational complexity.
%It greatly reduces the computational complexity, which is compatible with the expectation that matching is performed quickly. 
Specifically, the encoder is applied to the real SAR template image $T_{S}$ and the search SAR image patches $S_{S}$ to obtain their embeddings $x_0$ and $c_{sar}$, respectively. Correspondingly, the optical template $T_{O}$ is encoded to $c_{opt}$ by two convolutional layers. % because it only provides content information.
Notably, to speed up the perceptual compression, the search SAR images are split into smaller patches $S_{S}$ before being fed into the encoder, and the encoded patches are stitched back together afterwards.
In the forward diffusion process, Gaussian noise is continuously added to $x_0$ at each diffusion step to obtain $x_t$.
In the reverse diffusion process, $x_t$ is denoised given the conditions $c_{opt}$ and $c_{sar}$, which is expressed as:
\begin{equation}
p_\theta\left(x_{t-1} \mid x_t, c_{opt}, c_{sar}\right)=\mathcal{N}\left(x_{t-1} ; \mu_\theta\left(x_t, t, c_{opt}, c_{sar}\right), \sigma_t^2 \mathbf{I} \right)
\end{equation}

The SAROPT-conditioned latent diffusion model not only increases the denoising speed, but also makes the matching end-to-end, instead of constructing the SAR template and then extracting features for the matching.  %the matching  in the latent feature space, and although it is possible to 
%On the one hand,  
%It is not use the decoder of the auto-encoder to convert the generated image into a real SAR image and then perform template matching, this would slow down the speed of the alignment, and we directly using latent space features for FFT-based NCC methods \cite{fftUnet} matching, which is not against our original intention of cross-modal accurate translation because the latent space is as valid as the pixel space. 
%On the other hand, 
%Moreover, experiments illustrate that the compressed latent space enable to maintain local and global spatial distributions.



\begin{figure}[t]
\centering
\scalebox{0.65}{\includegraphics[width=\linewidth]{./Figures/Fig_3.pdf}}
\caption{The gated dynamic sparse cross-attention module.}
\label{fig3}
\end{figure}

\subsection{Gated Dynamic Sparse Cross-Attention}
Despite denoising in the latent space, the interaction of conditional features still requires a large amount of computation, particularly because the search image is relatively large. 
In addition, conditional remote sensing images contain a large number of textureless regions, resulting in tedious similarity calculation, and even a negative effect. 
Therefore, we present a gated dynamic sparse cross-attention module to dynamically select effective regions to perform the cross-modal conditional interaction, as shown in Figure \ref{fig3}. It first exploits the coarse-grained routing block (CRB) to efficiently calculate the sparse cross-attention map, then utilizes the attention calibration block (ACB) to smooth the cross-attention map, and finally leverages an attention gate (AG) to control the preference between the conditional information interaction and the denoising generation. The details are elaborated in the following.
% the three blocks. Coarse-Grained Routing Block to discard these image blocks with low information content while reducing the computational overhead.

\textbf{\textit{Coarse-Grained Routing Block. }} Recently, Biformer \cite{biformer} presents the sparse self-attention to save both computation and memory. Benefiting from the idea, we downsample the feature map in the U-Net encoder to obtain cross attention on coarse-grained feature map, and then filter out redundant similar feature regions. Afterwards, the fine-grained cross attention is performed on the remaining feature regions, thus reducing the computational overhead. 

As shown in Figure \ref{CGRB}, the template feature maps $X$, and the search image  feature maps $Y$ are respectively divided into $s$ and $S$ feature patches as $X \in \mathbb{R}^{s \times P \times C}$ and $Y \in \mathbb{R}^{S \times P \times C}$. Afterwards, the query $Q$, key $K$, and value $V$, are calculated through the linear projection $W_q$, $W_k$, $W_v$:
\begin{equation}
Q=X W_q,  \quad 	K=Y W_k,  \quad	V=Y W_v
\end{equation}

%Then, information aggregation features were calculated for each patch by averaging the under-worth information aggregation features for $Q$ and $K$, respectively  $Q_c\in \mathbb{R}^{s \times C}$ and $K_c \in \mathbb{R}^{S \times C}$ , and then the coarse-grained cross-attention map $A$ is calculated:

Then, we find the patches in $K$ that should be attended to for each given patch in $Q$.
%find the important feature patches for the template features $X$ in the search image features' projection $K$.
Specifically, we obtain the coarse-grained features of $Q$ and $K$ in the encoder, denoted as $Q_c\in \mathbb{R}^{s \times C}$ and $K_c \in \mathbb{R}^{S \times C}$. Herein, the average of each feature patch is adopted to integrate its information. 
% Select the effective feature patches of Y by computing a score
Thus, the importance of feature patches of projected features $K$ of $Y$ can be represented by the coarse-grained cross-attention map $Map_{Y}$, formulated as:
\begin{equation}
Map_{Y}=\frac{\operatorname{bmm}\left(Q_c, {K_c}^{T}\right)}{\sqrt{C}}
\end{equation}
%One difference with their method is that our method does not compute the coarse-grained cross-attention maps at the decoder stage of the U-net, but instead passes the cross-attention maps before up-sampling of the denoised U-net to the next layer, since the computed similarity between the more semantically informative features before the up-sampling is much more reliable, making the computation of the moment more focused on capturing the finer-grained information. 
%% Constraint on the computation of A
Notably, in the decoder stage, we directly input the cross-attention map of the previous layer into the next layers, instead of recalculating the coarse-grained cross-attention map of next layer high-resolution features. It not only avoids duplicate calculations, but also exploits the cross-attention map obtained by rich semantics.

\begin{figure}[t]
\centering
\scalebox{0.55}{\includegraphics[width=0.9\linewidth]{./Figures/CRB.pdf}}
\caption{The coarse-grained routing block.}
\label{CGRB}
\end{figure}
Besides, we calculate the importance of each feature patch in the template features $X$. % to avoid incorrect perceptions. 
%The approach \cite{selfKeypointDetection} defines an additional feature extractor and zero-averaging of the local spatial domain to determine the matching keypoints thus filtering outliers, in our method we define a combination of learned and unlearned methods for peak computation of the local spatial domain based on patches.
Different from the calculation of $Y$, we define a score to measure the important feature patches of $X$.
%   the variance of each patch is calculated as the main score.
It is known that patches with rich matching information tend to have higher variance values. Hence, the variance of each patch is adopted as a metric of information importance.
But, some outlier feature patches inevitably obtain high variances, which causes negative effects. 
To avoid selecting patches with high variances but side effects, we further exploit two learnable weights, $M_c$ and $M_s$, to dynamically interact with each feature patch of $X$ in the spatial and channel dimensions, respectively. Thus, a learnable importance score is obtained. 
Finally, the learnable score is used to refine the variance score. The process is expressed as follows:
\begin{equation} 
\begin{gathered}
{ score}_v=\frac{1}{C \cdot P} \sum_{j=1}^C \sum_{i=1}^P\left(X_{i, j}-\frac{1}{P} \sum_{k=1}^P X_{k, j}\right)^2 \\
{score}_l=\operatorname{Sigmoid}\left(\operatorname{Mean}\left(X M_c\right)+\operatorname{Mean}\left(X^T M_s\right)\right) \\
{score}_X={score}_v \times {score}_l
\end{gathered}
\end{equation}
where ${score}_v$ is the variance measured importance, ${score}_l$ is the learned importance, and $score_X \in \mathbb{R}^{1\times s}$ is the refined importance metric. $X \in \mathbb{R}^{s \times P \times C}$ represents template features. $M_c \in \mathbb{R}^{C \times M}$ and $M_s \in \mathbb{R}^{P \times M}$ denote learnable weights of linear layers (1D-Conv), and $M$ is the specified dimension. In experiments, we set $M=C$.

%According to the score, invalid tokens from $X$ are filtered out.
%To filter out a certain number of tokens from $X$ and $Y$, 

Based on the feature patch importance $Map_Y\in \mathbb{R}^{s \times S}$  and $score_X\in \mathbb{R}^{1\times s}$ of $Y$ and $X$, the feature patches are selected. In experiments, we set a hyperparameter $\gamma$ (0$\sim$1) to control the number of selected tokens. Assuming that the number of feature patches of a feature map is $N$, the top $N \times \gamma$ tokens, i.e., $N_{\gamma}$ tokens are selected. Moreover, $\gamma$ is in inverse proportion to the feature resolution to ensure smaller computation and progressive refinement in the decoder.
The token selection process is formulated as:
\begin{equation}
\begin{gathered}
I_x=\operatorname{topkIndex}\left({score}_X\right) \\
Map_y=\operatorname{gather}\left(Map_{Y}, I_x\right) \\
I_y=\operatorname{topkIndex}\left(Map_y\right)   %\operatorname
\end{gathered}
\end{equation}
where `$\text{topkIndex}$' is used to obtain the indices of the top-$k$ values in a feature map, and `$\text{gather}$' is used to select feature patches based on the indices. $I_x \in \mathbb{R}^{s_{\gamma}}$ contains the indices of the top $s_{\gamma}=s \times \gamma$ feature patches in $X$. $Map_y$ represents the selected row tokens of $Map_Y$. 
%$I_y \in \mathbb{R}^{s_{\gamma} \times S_{\gamma}}$ is $S_{\gamma}$ indexes in $Y$ chosen for the $s_{\gamma}$ feature patches in $X$. 
$I_y \in \mathbb{R}^{s_{\gamma} \times S_{\gamma}}$ contains the indices of the $S_{\gamma}$ patches in $Y$ that should be attended to for each patch in the sparsified $X$. 
Based on the coarse-grained index matrices $I_x$ and $I_y$, the fine-grained projection features of the query, key, and value are obtained:
\begin{equation}
\begin{gathered}
Q_f=\operatorname{gather}\left(Q, I_x\right) \\ 
K_f=\operatorname{gather}\left(K, I_y\right) \\ 
V_f=\operatorname{gather}\left(V, I_y\right)
\end{gathered}
\end{equation}
Then, the sparse fine-grained cross-attention map $Map_{f} \in \mathbb{R}^{s_{\gamma}P \times S_{\gamma}P}$ $ (S_{\gamma}P = S_{\gamma} \times P )$ is calculated using the $Q_f \in \mathbb{R}^{s_{\gamma} \times P \times C}$, $K_f \in \mathbb{R}^{s_{\gamma} \times S_{\gamma}P \times C}$, and $ V_f \in \mathbb{R}^{s_{\gamma} \times S_{\gamma}P \times C}$:
\begin{equation}
Map_{f}=\frac{\operatorname{bmm}\left(Q_f, {K_f}^{T}\right)}{\sqrt{C}}
\end{equation}
where $\operatorname{bmm}$ denotes the batch matrix multiplication, and $T$ denotes the transpose operation. 

\begin{figure}[t]
\centering
\scalebox{0.75}{\includegraphics[width=0.9\linewidth]{./Figures/ACBAGB.pdf}}
\caption{The attention calibration block and the attention gate.}
\label{ACBAGB}
\end{figure}
\textbf{\textit{Attention Calibration Block. }}
All the selected coarse-grained feature patches have similar semantics, but the corresponding multiple fine-grained features contain different information, which causes the fine-grained features in the same coarse-grained feature patch to focus on inconsistent positions after cross-attention. 
To overcome the problem, we use a calibration module to locally smooth the cross-attention map, as shown in Figure \ref{ACBAGB}. 

%the distribution of highlighted values in the cross-attention map is sparse and discrete,
Since the cross-attention map represents the matching position of template features $Q_f$ to the search features $K_f$, the distribution of highlighted values in the cross-attention map is diagonal, sparse, and discrete, as shown in Figure \ref{fig_4}. 
Therefore, each row of the cross-attention map ${Map}_{f}$ is first circularly shifted according to $I_x$ and $I_y$ to obtain the spatially aligned attention map ${Map}_{fa}$. 
Then, three convolution operations $K_1 \in \mathbb{R}^{1 \times C}$, $K_2 \in \mathbb{R}^{1 \times C}$, and $K_3 \in \mathbb{R}^{C \times 1}$ are performed on the spatially aligned attention map for smoothing, expressed as:
\begin{equation}
\begin{gathered}
{Map}_{fa} = \operatorname{Align}\left( {Map}_{f} \right) \\
{A}_w=\operatorname{Sigmoid}\left(\operatorname {Up}(\operatorname{AvgPool}({Map}_{fa}) * K_1)\right) \\
{Map}_{fa\_sm}=\left(\left({{Map}_{fa}} * K_2\right) \cdot {A}_w\right) * K_3
\end{gathered}
\end{equation}
where $\operatorname{AvgPool}(\cdot)$ is average pooling with a pooling size of 2$\times$2 and a stride of 2, and $\operatorname{Up}(\cdot)$ is bilinear interpolation. 

After obtaining the smoothed attention map, the cross-attention map is restored to its original spatial position by cyclic shifting to calculate the attention features, represented as:
\begin{equation}
\begin{gathered}
{Map}_{f\_sm} = \operatorname{InvAlign}\left( {Map}_{fa\_sm} \right) \\
X_{a}=\operatorname{bmm}\left(\operatorname{Softmax}({Map_{f\_sm}}), V_f\right) \\
X_{a0}=\operatorname{Padding}\left(X_{a}, 0\right)
\end{gathered}
\end{equation}
where $X_{a0}$ denotes the attention features in which the unselected feature regions are padded with 0. It has the same size as the original feature $X$.  
Note that the calibration module is lightweight, since it is performed on the 2-D cross-attention map.


\textbf{\textit{Attention Gate. }} 
Remote sensing images often contain plain areas, which are textureless. Hence, using such images as conditions is ineffective and may even have a negative effect on the denoising generation. 
In this situation, the denoising network should focus on using the input itself to generate autonomously, instead of relying on the information in the conditions.
Considering that, we design an attention gate to control the inflow of conditional information, as shown in Figure \ref{ACBAGB}.

In detail, the smaller the variance of the spatially aligned cross-attention map, the better the attention. Hence, the variance is calculated to measure the attention effect. 
Moreover, since conditional features at different steps have different perceptual abilities, the time step $t$ is also introduced to control the usage of conditions in the $t$-th step.
Given the spatially aligned and smoothed cross-attention map ${Map_{fa\_sm}} \in \mathbb{R}^{I \times J}$ ($I=s_{\gamma}P$, $J=S_{\gamma}P$), the step time embedding $t_{emb} \in \mathbb{R}^{C/2}$, the linear encoding layers $L_1 \in \mathbb{R}^{1 \times C/2}$ and $L_2 \in \mathbb{R}^{C \times C}$, the attention gate is expressed as:

\begin{equation}
\begin{gathered}
{var}=\frac{1}{I\times J} \sum_{j=1}^J \sum_{i=1}^I\left({Map_{fa\_sm}}_{i, j}-\frac{1}{I} \sum_{k=1}^I {Map_{fa\_sm}}_{k, j}\right)^2 \\
O=X_{a0} \cdot \operatorname{Sigmoid}\left(L_2\left(\operatorname{concat}\left[t_{\text {emb}}, L_1\left({var}\right)\right]\right)\right)
\end{gathered}
\end{equation}


\subsection{Spatial Position Consistency Constraint}
The large differences in scattering characteristics between cross-modal images, the random matching positions, and the presence of multiple similar matches lead to incorrect attention. 
Therefore, we further design a spatial position consistency constraint to constrain the correspondence relationships in the cross-attention map. 
We use the cross-entropy loss to constrain the predicted heatmap $\hat{p}$ with the ground-truth attention map $p$, expressed as:
\begin{equation}
\begin{gathered}
L_{spcc}= -\sum_i \left[ p_i \cdot \log \left(\hat{p}_i\right)+\left(1-p_i\right) \cdot \log \left(1-\hat{p}_i\right) \right]
\end{gathered}
\label{eqloss1}
\end{equation}


\begin{figure}[t]
\centering
\scalebox{0.8}{\includegraphics[width=0.87\linewidth]{./Figures/Fig_4.pdf}}
\caption{The generation of ground-truth cross-attention maps. }
\label{fig_4}
\end{figure}



The ground-truth attention map $p$ is generated as shown in Figure \ref{fig_4}. 
Assuming that the template image (the red box) (128$\times$128) is matched at the position ($x=40$, $y=40$) in the search image (256$\times$256), the ground-truth matching position in the 32$\times$ downsampled feature map is fractional. In general, we regard the attention value of the matching region as 1, and that of the non-matching region as 0. 
Hence, we divide 1 into different weights according to the ratio of each pixel of the template (4$\times$4) to each pixel of the search (8$\times$8), resulting in 16$\times$64 ground-truth matching values to constrain the attention map.


\subsection{Loss Functions}
The optimization of the proposed dynamically conditioned diffusion model involves two parts: the denoising network, and the spatial position consistency constraint.
To optimize the denoising network, the $L_2$ loss and the SSIM~\cite{ssim} loss are adopted, expressed as:  
\begin{equation}
\begin{gathered}
L_{den}=\left\|f_\theta\left(x_t, t\right)-x_0\right\|^2+\operatorname{SSIM}\left(f_\theta\left(x_t, t\right), x_0\right)
\end{gathered}
\end{equation}

In summary, the total loss $L$ is the sum of the denoising loss $L_{den}$ and the spatial position consistency constraint loss $L_{spcc}$ (see Equation~\eqref{eqloss1}):

\begin{equation}
L=L_{den} + L_{spcc}
\end{equation}


\section{EXPERIMENTS AND ANALYSES}
\subsection{Datasets}

\textbf{Sentinel-1 and Sentinel-2 (SEN1-2) dataset: } The SEN1-2 dataset \cite{sen12} contains a total of 282,384 aligned SAR and optical image pairs with the size of 256$\times$256. The SAR images are acquired from the dual-polarized SAR data of Sentinel-1. The optical images are acquired from the multispectral images of Sentinel-2. The fourth, third and second bands are used to generate RGB images. The dataset is collected from four seasons. The images have a spatial resolution of 10\,m. In experiments, 4088 image pairs are randomly selected from the spring season data for evaluation. The training, testing and validation sets are split by a ratio of 7:2:1. The SAR images with the size of 256$\times$256 are treated as the search images, and the optical images cropped to 128$\times$128 are treated as the templates.

\noindent
\textbf{OSdataset: } The OSdataset \cite{osdataset} consists of 2673 image pairs with the size of 512$\times$512, and 10692 image pairs with the size of 256$\times$256. This dataset collects images of scenes from cities around the world. The SAR images are captured by the sensor of the Gaofen-3 (GF-3) multi-polarized C-band SAR satellite, and the optical images are obtained from the Google Earth platform with a spatial resolution of 1\,m. In experiments, 2673 image pairs are used for evaluation. The ratio of the training, validation and testing sets is 7:2:1. The SAR images with the size of 512$\times$512 are chosen as the search images, and the optical images cropped to 256$\times$256 are treated as the templates.

\subsection{Experimental Settings}
\textbf{\textit{Implementation details: }}
The method is implemented based on the PyTorch framework, and runs on an NVIDIA GeForce RTX 4090 GPU and a Core i7-12700KF CPU.
The feature stride of the autoencoder is 4. 
%In addition, this autoencoder can be used on the SEN1-2 dataset without training a new autoencoder. 
The feature strides of each feature layer in the denoising U-Net are \{1, 2, 4, 8\}.
The gated dynamic sparse cross-attention module is used at the end of the feature layers with strides of \{2, 4, 8\}. The sparsity parameter $\gamma$ is fixed to 1 when the feature stride is 8, since the features of this layer have rich semantics and few tokens. 
To train the denoising U-Net, we set the minimum time step $t$ to 0.001. The model is trained using the AdamW optimizer for 30k iterations. The decay rate ranges from 5e-5 to 5e-6.  

\noindent
\textbf{\textit{Evaluation metrics: }}
%In the application \cite{evalMetrics}, 
In experiments, the Root Mean Square Error (RMSE) and Correct Matching Rate (CMR) are used as evaluation metrics.
The RMSE measures the average Euclidean distance between the prediction and the ground-truth.
The CMR denotes the rate of correct matches, i.e., those whose RMSE is less than a given threshold $T$, denoted as CMR(T). In the heterogeneous remote sensing image matching task, a matching error less than or equal to 5 pixels is regarded as a successful match. Therefore, we choose CMR(T=5) and RMSE(T=5) to evaluate the proposed approach. 
To further evaluate the overall matching performance, the average root mean square error over all samples, RMSE(All), is also adopted.

\begin{table}[t]
\setlength{\tabcolsep}{1.8mm}
\renewcommand{\arraystretch}{0.85} 
\centering
\caption{Comparison with state-of-the-art methods on the SEN1-2 dataset.}
\scalebox{1.0}{
\begin{tabular}{ccccc}
\toprule[1pt]
Methods & CMR(T=5) & RMSE(T=5) & RMSE(All) & Time(s) \\
\midrule
NCC   & 0.4068 & 2.2905 & 38.3845 & 60.1576 \\
NMI   & 0.5739 & 1.3213 & 26.8528 & 86.2218 \\
CFOG & 0.6667 & 1.5395 & 19.1622 & 0.1672 \\
RIFT & 0.8043 & 1.6044 & 15.7632 & 0.2114 \\
Psiam & 0.6884 & 1.986 & 22.2338 & 95.4723 \\
VSMatch & 0.7174 & \textbf{1.3156} & 20.6323 & 87.155 \\
OSMNet & 0.9168 & 2.3086 & 4.7457 & 0.0568 \\
MARU-Net & 0.9056 & 1.4007 & 5.2601 & \textbf{0.0346} \\
Ours  & \textbf{0.9302} & 1.3496 & \textbf{4.5227} & 0.0621 \\
\bottomrule[1pt]
\end{tabular}%
}
\label{ComSen1-2}%
\end{table}%

\subsection{Comparison with State-of-the-arts}
\noindent
\textbf{\textit{Quantitative comparison. }}
To evaluate the performance of the proposed method, we compare it with state-of-the-art methods on the SEN1-2 dataset and the OSdataset. The compared approaches include traditional methods, NCC \cite{NCC}, NMI \cite{MI}, CFOG \cite{cfog}, RIFT \cite{rift}, and deep-learning-based methods, PSiam \cite{Psiam}, VSMatch \cite{VSmatch}, SCMatch \cite{SCmatch}, OSMNet \cite{explorebeter}, MARU-Net \cite{MARUnet}. 
%in which OSdataset has higher spatial resolution and image resolution with higher matching difficulty.

Table \ref{ComSen1-2} shows the matching performance of state-of-the-art methods on the SEN1-2 dataset. It is seen that the proposed method achieves 93.02\% on CMR(T=5), performing best. Compared to the state-of-the-art approach OSMNet, our method improves by 1.34\%. 
The deep-learning-based method, VSMatch, achieves the best RMSE(T=5) of 1.3156, which is slightly better than that of our algorithm by 0.034.  
For the matching time, the feature-based approaches, CFOG and RIFT, and the deep-learning-based methods, OSMNet, MARU-Net, and our method, take less time. MARU-Net only takes 0.0346s. 

Compared to the SEN1-2 dataset, the overall performance on the OSdataset drops, since the images in the OSdataset have a higher spatial resolution and contain noise. 
Table \ref{ComOS} compares the matching results on the OSdataset. 
It is obvious that the region-based methods, NCC and NMI, perform poorly and take more time. The NCC only obtains 10.83\% on CMR(T=5), and 2.8915 RMSE(T=5). 
Compared to region-based methods, the feature-based methods achieve a large improvement, where RIFT has 75.83\% on CMR(T=5).
% it is higher than the learning-based methods Psiam and VSmatch, and has a shorter matching time. 
The learning-based algorithms, Psiam, SCMNet and VSMatch, are under 80\% on the CMR(T=5). However, SCMNet achieves the best result on RMSE(T=5), reaching 1.318. 
Moreover, the three methods have quite long inference times compared to other learning-based and feature-based methods,  
since they slide the template pixel by pixel over the search image and feed the pairs into the matching model to compute the matching similarity.
In contrast, OSMNet, MARU-Net and our method take a very short matching time. 
Furthermore, our method achieves 84.91\% on CMR(T=5), a gain of 1.34\% compared to the state-of-the-art approach MARU-Net.

%Although our our method has a lower RMSE(T=5) compared to other intensively computed methods such as SCMNet, it has a higher accuracy than the other methods on  the two datasets, and we have a lower RMSE(all) than any of the other methods on the SEN1-2, which implies a lower matching error.




% Table generated by Excel2LaTeX from sheet 'Sheet1'
\begin{table}[t]
\setlength{\tabcolsep}{1.6mm}
\renewcommand{\arraystretch}{0.9} 
\centering
\caption{Comparison with state-of-the-art methods on the OSdataset.}
\scalebox{0.9}{
\begin{tabular}{ccccc}
\toprule[1pt]
Methods & CMR(T=5) & RMSE(T=5) & RMSE(All) & Time(s) \\
\midrule
NCC & 0.1083 & 2.8915 & 79.7331 & 1034.1178 \\
NMI & 0.275 & 2.0746 & 62.1353 & 1250.2496 \\
CFOG & 0.5417 & 1.5922 & 22.9735 & 1.1465 \\
RIFT & 0.7583 & 1.9245 & 17.0226 & 6.6118 \\
Psiam & 0.5128 & 1.8952 & 27.6463 & 519.1755 \\
VSMatch & 0.6496 & 1.7437 & 24.2316 & 451.2569 \\
SCMNet & 0.7833 & \textbf{1.318} & 12.1056 & 63.4261 \\
OSMNet & 0.8043 & 2.4922 & 9.2731 & 0.0913 \\
MARU-Net & 0.8357 & 2.2495 & \textbf{6.9049} & \textbf{0.0895} \\
Ours  & \textbf{0.8491} & 2.29  & 7.61  & 0.1093 \\
\bottomrule[1pt]
\end{tabular}}%
\label{ComOS}%
\end{table}%

\begin{figure}[t]  
\centering  
\scalebox{0.8}{\includegraphics[width=0.485\textwidth]{./Figures/comparison_vision_1.pdf}}  
\caption{The comparison of two samples' matching similarity map.}  
\label{comparison_vision_1}
\end{figure}
\noindent
\textbf{\textit{Qualitative comparison of similarity maps. }}
Figure \ref{comparison_vision_1} qualitatively shows the similarity maps produced by different methods on the two samples. In the similarity map, the higher the response value, the brighter the color. The location corresponding to the peak value is the best matching position. 
The red boxes in the first column 'Ground truth' denote the ground-truth matching areas. The red dots in other approaches represent the ground-truth offset coordinates in the matching map.

%One sample is a rural scene, and the other is an urban area. 
The first-row sample shows a pair of optical-SAR images of rural farmland. The image has little texture and multiple similar regions, which makes optical-SAR matching challenging. The comparison methods obtain similar responses along parallel linear directions, which makes it difficult to produce focused similarity maps. In particular, there exist large highlighted regions in the similarity map obtained by NMI. VSMatch fails to focus on the correct position, and the peak of MARU-Net is relatively unremarkable.
In contrast, the proposed method has low response values in non-matched regions and a relatively prominent single peak.

The second-row sample depicts an urban area with dense geometrically similar buildings. 
The NMI and VSMatch are shifted from the correct matching point, where the VSMatch shows an undesired neighbouring bimodal characteristic. 
The peaks of the proposed method and MARU-Net correspond to the matched regions. 
However, the response values of the similarity map achieved by our method are smoother in the non-matched region, which is due to the fact that the generated same-modality features are more distinguishable.


\begin{figure}[t]  
\centering  
\scalebox{0.75}{\includegraphics[width=0.48\textwidth]{./Figures/comparison_vision_2.pdf}} 
\caption{The visualization comparison of extracted features and similarity maps.}  
\label{comparison_vision_2}
\end{figure}
\noindent
\textbf{\textit{Qualitative comparison of matching features. }}
Figure \ref{comparison_vision_2} visualizes the two samples' template feature maps (Template $O'$) and search feature maps (Search $S'$) obtained by the most representative state-of-the-art method MARU-Net and our approach. 
The first sample is a rural farmland area with less texture, and the second sample is a complex urban road area. 
Comparing the feature maps and similarity maps produced by the two methods, it is observed that both the template and search feature maps of MARU-Net have very sparse texture features (especially for the textureless regions of the first sample) and inconsistent visualization, even though it attempts to map them into a shared feature space. These unreliable features lead to high correlation in non-matching regions. 
In contrast, the template and search feature maps generated by our method maintain all the texture information of the original image. This is due to the fact that our method focuses on generating feature maps of the same modality in latent space, which is more interpretable, and ensures that the similarity maps are single-peaked.


\subsection{Ablation Experiments}
To verify the effect of each proposed component and of the parameters on the matching performance, we conduct ablation experiments on the SEN1-2 dataset.

\noindent
\textbf{\textit{The effect of key components. }}
Based on the LDM, we propose three key components to improve the cross-modal remote sensing image matching performance. To verify the effectiveness of each proposed component, we conduct ablation experiments on the coarse-grained routing block (CRB), the attention calibration block (ACB), and the attention gate (AG). 
For the baseline and our approach, the denoising sampling step $t$ is set to 5, and the cross-attention module is added to feature layers with strides of \{2, 4, 8\}. 
For our method, the sparsity parameter, $\gamma$, is set to (1, 3/4, 3/8). 
%, it and N will be discussed in the later section. 
As shown in Table \ref{ACBAGBtable}, the baseline only achieves 84.57\% on the CMR(T=5). 
Compared to the cross-attention in the baseline, the proposed CRB outperforms the base attention by 2.15\% on the CMR(T=5), reaching 86.72\%. The attention calibration module obtains a remarkable gain of 4.51\% on CMR(T=5), reaching 91.23\%.  
The attention gate module improves 2.59\% on CMR(T=5), achieving 89.31\%. 
The proposed three modules achieve remarkable improvements on the CMR(T=5) and the RMSE(T=5).
Although three modules are introduced, the inference time is not significantly increased, since the proposed modules are lightweight. 
%The experiments show that the proposed two modules are effective and all subsequent experiments will use the proposed two modules.
% Table generated by Excel2LaTeX from sheet 'Sheet1'
\begin{table}[t]
\setlength{\tabcolsep}{1.6mm}
\renewcommand{\arraystretch}{0.85} 
\centering
\caption{The ablation study on the SEN1-2 dataset.}
\scalebox{1.0}{
\begin{tabular}{c|ccccrrr}
\toprule[1pt]
Base & CRB & ACB   & AG   & CMR(T=5) & RMSE(T=5) & Time(s) \\
\midrule
$\surd$ &  &    &   & 0.8457  & 2.1341  &0.1021    \\
& $\surd$ & \multicolumn{1}{c}{} & \multicolumn{1}{c}{} & 0.8672 & 1.8904 & \textbf{0.0542} \\
& $\surd$ & $\surd$     & \multicolumn{1}{c}{} & 0.9123 & 1.5301 & 0.0567 \\
& $\surd$ & \multicolumn{1}{c}{} & $\surd$     & 0.8931 & 1.6577 & 0.0605 \\
& $\surd$ & $\surd$     & $\surd$     & \textbf{0.9302} & \textbf{1.3496} & 0.0621 \\
\bottomrule[1pt]
\end{tabular}%
}
\label{ACBAGBtable}%
\end{table}%

\begin{table}[t]
\setlength{\tabcolsep}{1.5mm}
\renewcommand{\arraystretch}{0.85} 
\centering
\caption{The effect of the sparsity parameter $\gamma$ on the SEN1-2 dataset. GDSCs are added to features with strides of \{2, 4, 8\}.}
\scalebox{1.0}{
\begin{tabular}{ccccc}
\toprule[1pt]
$\gamma$ & CMR(T=5) & RMSE(T=5) & RMSE(All) & Time(s) \\
\midrule
$\gamma$=(1, 1, 1/2) & 0.9175 & 1.4021 & 4.8412 & 0.0937 \\
$\gamma$=(1, 3/4, 3/8) & \textbf{0.9302} & \textbf{1.3496} & \textbf{4.5227} & 0.0621 \\
$\gamma$=(1, 1/2, 1/4) & 0.8843 & 1.6545 & 5.0463 & 0.0441 \\
$\gamma$=(1, 1/4, 1/8) & 0.8254 & 1.9874 & 6.7734 & \textbf{0.0312} \\
\bottomrule[1pt]
\end{tabular}%
}
\label{parameter_N}%
\end{table}%

\noindent
\textbf{\textit{The effect of the number of GDSCs and sample steps. }}
We investigate the effect of the number of GDSCs on the matching accuracy. 
In the denoising U-Net, the gated dynamical sparse cross-attention (GDSC) module is introduced into each of the three feature layers corresponding to feature strides of \{2, 4, 8\}. The sparsity parameters $\gamma$ in the three feature layers are set as 1, 3/4, and 3/8, respectively. 
Figure \ref{layers_steps} illustrates the effect of the number of GDSC modules. It is found that the CMR(T=5) improves as the number of GDSCs increases, because a larger number of attention layers enables the network to capture fine-grained information. 

In addition, it is observed that the denoising sample steps influence the matching performance. The higher the number of sample steps, the higher the value of CMR(T=5).
Nevertheless, the improvement is marginal when the number of sample steps is greater than 5. 
For instance, when the GDSC is introduced into the feature layers with strides of \{2, 4, 8\}, 
the CMR(T=5) only increases by 0.5\% when the sampling steps increase from 5 to 10 (the green line in Figure \ref{layers_steps}). 
To balance the trade-off between speed and performance, the number of sample steps is set to 5 in the experiments.


\noindent
\textbf{\textit{The effect of the sparsity parameter $\gamma$. }}
We further analyze the effect of the sparsity parameter $\gamma$ on the matching accuracy and time. 
As shown in Table \ref{parameter_N}, the model performs best when the sparsity parameter is set to $\gamma$=(1, 3/4, 3/8), where the CMR(T=5) reaches 93.02\% and the RMSE(T=5) is 1.3496. 
% the achieves relatively short matching time. 
Compared to the best performance, the CMR(T=5) slightly decreases by 1.27\% and the RMSE(T=5) increases by 0.0525 when $\gamma$=(1, 1, 1/2), since a larger $\gamma$ tends to introduce more erroneous information and a greater matching delay. 
The CMR(T=5) decreases by 4.59\% when the sparsity parameter $\gamma$=(1, 1/2, 1/4), and even decreases by 10.48\% when $\gamma$=(1, 1/4, 1/8). 
This is because a too small $\gamma$ causes insufficient cross-modal information interaction, which makes the model difficult to converge and thus degrades the performance.

\begin{figure}[t]
\centering
\includegraphics[width=0.34\textwidth]{./Figures/sample_steps.pdf}
\caption{The effect of the number of GDSCs and sample steps on CMR(T=5) with the SEN1-2.}
\label{layers_steps}
\end{figure}

\titlespacing*{\section}{0pt}{3ex}{1ex}
\section{CONCLUSION}
This paper proposes a dynamically conditioned diffusion model to achieve interpretable and robust optical-SAR cross-modal image matching. 
Specifically, the gated dynamic sparse cross-attention module is used to guide the diffusion model to capture information from conditions through efficient long-range cross-modal interactions, thus filtering out outlier matching regions.
In addition, the spatial position consistency constraint promotes the cross-attention features to perceive the spatial corresponding relation in different modalities, and improves the matching accuracy. 
Experimental results on two datasets show that the proposed method outperforms state-of-the-art approaches in terms of the matching accuracy and the interpretability. 
The study provides an exploration for future research on cross-modal image matching or registration with diffusion models. 

\begin{acks}
This work is supported in part by the project of Science and Technology Development Plan in Hangzhou under Grant No. 202202B38, in part by the Fundamental Research Funds for the Central Universities under Grant No. XJSJ24071, in part by the Key Laboratory of Cognitive Radio and Information Processing, Ministry of Education under Grant No. CRKL230204, in part by the Fundamental Research Funds for the Central Universities under Grant No. XJSJ24072, and in part by the National Natural Science Foundation of China under Grant No. 62302355.
\end{acks}

%%
%% The next two lines define the bibliography style to be used, and
%% the bibliography file.
\bibliographystyle{ACM-Reference-Format}
\bibliography{sample-base}

\end{document}
\endinput
%%
%% End of file `sample-sigconf.tex'.
