% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams



\usepackage{bm}
\usepackage{threeparttable}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{caption}
\usepackage{subfigure}
\usepackage{amssymb}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Fine-Grained Matching with Multi-Perspective Similarity Modeling for Cross-Modal Retrieval}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[ ]{{Xiumin Xie}{}}
\author[ ]{Chuanwen Hou}
\author[ ]{Zhixin Li \thanks{Zhixin Li is the corresponding author.}}
% Add affiliations after the authors
\affil[ ]{%
	Guangxi Key Lab of Multi-source Information Mining and Security, Guangxi Normal University, Guilin 541004, China
}
  
  \begin{document}
\maketitle


\begin{abstract}
	Cross-modal retrieval relies on learning inter-modal correspondences. Most existing approaches focus on learning global or local correspondence and fail to explore fine-grained multi-level alignments. Moreover, it remains to be investigated how to infer more accurate similarity scores. In this paper, we propose a novel fine-grained matching with Multi-Perspective Similarity Modeling (MPSM) Network for cross-modal retrieval. Specifically, the Knowledge Graph Iterative Dissemination (KGID) module is designed to iteratively broadcast global semantic knowledge, enabling domain information to be integrated and relevant nodes to be associated, resulting in fine-grained modality representations. Subsequently, vector-based similarity representations are learned from multiple perspectives to model multi-level alignments comprehensively. The Similarity-Relation Graph Reconstruction (SRGR) module is further developed to enhance cross-modal correspondence by constructing similarity relation graphs and adaptively reconstructing them. Extensive experiments on the Flickr30K and MSCOCO datasets validate that our model significantly outperforms several state-of-the-art baselines.
\end{abstract}

\section{Introduction}\label{sec:intro}
Cross-modal retrieval refers to retrieving the most relevant text (image) by utilizing the image (text) as a query. Its core is to detect the potential correlation between different modalities and then measure cross-modal similarity to achieve relatively accurate matching \citep{Hou2021Unsupervised}.

Existing methods mainly learn global or local alignment between image and text for retrieval. The global alignment learning methods \citep{hardoon2004canonical,karpathy2015deep,zheng2020dual} aim to discover correspondences between the entire image and text, but fail to investigate fine-grained correspondence between regions and words. As a result, local alignment learning methods \citep{niu2017hierarchical,chen2019cross,chen2020expressing} have been proposed to model the region-word correspondence. However, global or local alignment alone is one-sided. Therefore, some researchers jointly model global and local correspondences to obtain multi-level alignments. The multi-level alignment methods \citep{huang2018bi,li2020combining,nguyen2021deep,yuan2021vsr++} can provide more complementary and comprehensive semantic cues, thus improving performance. Furthermore, fine-grained alignment should account for both inter-modal and intra-modal correlations. %However, only a few approaches can explore fine-grained multi-level alignment.
\begin{figure}[tbp]
	\centering
	\subfigure[Coarse-grained alignment]{
		\includegraphics[width=1.5in]{fig1(a).pdf}
	}
	\subfigure[Fine-grained alignment]{
		\includegraphics[width=1.5in]{fig1(b).pdf}
	}
	\caption{Illustration of coarse and fine-grained alignment.} %  %大图名称
	\label{fig:1}
\end{figure}

More importantly, similarity representation and learning are key to cross-modal matching. Most approaches \citep{chen2020imram,wang2020consensus,li2019visual} compute scalar-based cosine distances to reflect the cross-modal similarity. However, this merely yields a constant scalar value and fails to adaptively refine the visual-semantic correspondence. Recently, some novel methods \citep{kuang2019fashion,diao2021similarity} have achieved excellent results by representing similarity with vectors rather than scalars. These methods, however, use a single form of similarity representation and fail to learn cross-modal similarity in a fine-grained manner.

In summary, there are several challenges for fine-grained image-text retrieval. First, both global and local alignment need to be considered, as together they capture the interaction between an ``object'' and its ``global context''; for example, Figure~\ref{fig:1} shows the region-global interaction ``women on a tennis court''. Second, existing methods fail to explore fine-grained intra-modal correlations, which can provide richer semantic information. As illustrated in Figure~\ref{fig:1}(b), the interaction between the ``racket'', the ``ball'', and the women's ``arm'' regions corresponds to the phrase ``playing tennis''.
Third, and most importantly, the representation of similarity and its learning should be considered from multiple perspectives. For example, ``A woman is hitting a tennis.'' is semantically related to the image in Figure~\ref{fig:1} (the vectors point in the same direction), yet the two are mismatched (there are numerical differences).

Motivated by these observations, we propose a novel fine-grained matching with Multi-Perspective Similarity Modeling (MPSM) Network for cross-modal retrieval. Specifically, we first construct visual and textual semantic knowledge graphs. Then, we introduce the Knowledge Graph Iterative Dissemination (KGID) module that learns fine-grained intra-modal correlations and modal representations by iteratively propagating the knowledge. Subsequently, we learn vector-based similarity representations from multiple perspectives separately to model multi-level alignment. The proposed similarity representations are learnable and can comprehensively explore image-text correspondences. Furthermore, we design the Similarity-Relation Graph Reconstruction (SRGR) module to achieve more accurate matching by constructing similarity relation graphs and adaptively reconstructing similarity relations. Our main contributions are summarized below:
\begin{itemize}
	\item We propose a KGID module that integrates domain information of nodes and iteratively propagates semantic information to neighboring nodes to capture fine-grained local and global representations.
	\item We learn vector-based similarity from multiple perspectives, which allows for more comprehensive learning of multi-level correspondences. To our knowledge, this is the first work to consider both distance and direction similarity for similarity representation and learning.
	\item We design an SRGR module in which the similarity relational graphs are constructed and reconstructed adaptively to achieve information interaction between multi-level alignments, filter interference and enhance similarity, thus improving matching accuracy.
\end{itemize}

\section{Related Work}
\subsection{Image-Text Retrieval}
Existing methods can be roughly split into global alignment, local alignment and multi-level alignment learning methods.

The global alignment learning methods seek to learn correspondences between the entire image and text. \cite{frome2013devise} were the first to map the full image and text into a common space for semantic alignment. Some approaches are inspired by generative adversarial networks (GANs)~\citep{goodfellow2014generative}; for example, \cite{wang2017adversarial} employed a GAN to produce features. Other methods focus on optimization: \cite{DBLP:conf/bmvc/FaghriFKF18} presented an optimization scheme that increases the distance between samples and hard samples. Furthermore, \cite{wang2018learning} emphasized the need to maintain both inter-modal and intra-modal correspondence. Nevertheless, the above methods neglected fine-grained semantic associations between regions and words, as well as intra-modal associations.

The local alignment learning methods explore region-word correspondence to acquire more accurate similarities. \cite{karpathy2014deep} made the first attempt by combining the alignment of related region-word pairings. \cite{lee2018stacked} used an attention mechanism to align each region with all words, verifying the efficiency of region-word alignment. Many of the following works were based on \cite{lee2018stacked}. For example, \cite{wang2019position} followed \cite{lee2018stacked} to model the region-word relation. Several methods \citep{liu2019focus,zhang2020context,wei2020multi} focused on both inter-modal and intra-modal relations. However, these methods failed to comprehensively explore fine-grained visual-semantic similarity. In contrast, we dynamically explore intra-modal correlations and model multi-level correspondences for more complete alignment.

Recently, researchers have increasingly explored both global and local correspondence to measure cross-modal similarity comprehensively~\citep{qi2018cross,ma2019bidirectional,wen2020learning}. Early approaches \citep{qi2018cross,ma2019bidirectional} tackled image-text matching in a multi-pathway manner, computing the global and local similarities and combining them into the final similarity. However, these approaches ignore that a word or a region may have different semantics in different global contexts, while global contextual information can be used as a clue for semantically similar samples \citep{wei2021integrating,xian2022dual}. Based on this, \cite{ji2021step} implemented local-to-local, global-to-local, and global-to-global cross-modal alignments in turn. Further, \cite{diao2021similarity} computed similarity representations for all local and global representation pairs simultaneously. However, existing multi-level alignment methods are insufficient in learning similarity, capturing only limited information.

\subsection{Similarity Representation Learning}
The core of cross-modal retrieval is the learning of similarity between different modalities. As for global alignment methods, some \citep{DBLP:conf/bmvc/FaghriFKF18,wang2016learning,li2021matching} explored similarity by computing the cosine distance. Another branch \citep{vendrov2015order,huang2018learning,gu2018look} introduced ordered representations. As for local alignment, most methods \citep{liu2019focus,wang2019position} computed scalar-based cosine distance to reflect region-word similarity. Furthermore, most multi-level alignment methods \citep{qi2018cross,ma2019bidirectional} modeled local and global alignments separately by using scalar-based cosine distance and combined them to reflect the similarity. 

The above approaches' similarity representations are scalar values that cannot learn fine-grained visual-semantic correlations adaptively. 
Recently, some innovative approaches to similarity representation and learning have been developed. For instance, \cite{diao2021similarity} learned vector-based similarity representations to explore multi-level alignment and achieved some improvement. However, their similarity representation is one-sided and fails to learn the correlation between vectors in a thorough manner. In contrast, we learn vector-based similarities from multiple perspectives to model multi-level alignment more comprehensively.

\subsection{Differences from Other Methods}
GSMN \citep{liu2020graph} also captures semantic information, but it lacks the mining of fine-grained intra-modal interactions. Instead of conducting basic matching, we consider integrating and spreading semantic knowledge among nodes, which allows for the dynamic mining of intra-modal correlations as well as the capturing of semantically rich features. SGRAF \citep{diao2021similarity} also utilizes graph reasoning. One of the key differences is that we not only mine the rich semantic information within each modality but also adaptively associate relevant nodes. This enables semantically rich intra-modal correlations to be included in inter-modal similarity learning and inference, which SGRAF \citep{diao2021similarity} does not make possible.

In addition, we model multi-level alignments and perform similarity inference from multiple perspectives, whereas most methods consider only a single perspective. To our knowledge, this is the first study to consider vector-based similarity representation and learning from multiple complementary perspectives, which enables more comprehensive learning of cross-modal correspondences.

\section{Methodology}
As shown in Figure~\ref{fig:2}, we first construct the semantic knowledge graphs. The KGID modules are then developed to learn fine-grained modal representations. Subsequently, vector-based similarity representations are learned from two perspectives: distance or direction similarity, to comprehensively explore multi-level correspondences. Finally, the SRGR module promotes the interaction between global and local alignments by constructing and adaptively reconstructing similarity relation graphs.
\begin{figure*}[htb]
	\centering
	\begin{minipage}[t]{1\linewidth}
		\centering
		\includegraphics[width=1\textwidth]{fig2.pdf}
	\end{minipage}%
	\centering
	\caption{An overview of our MPSM. It is made up of four modules: (1) Semantic knowledge graph construction: extracting features and constructing semantic knowledge graphs; (2) KGID module: investigating semantically relevant features; (3) Multi-Perspective Similarity Representation: learning vector-based similarity representation from distance (yellow path) or direction (red path) similarity; (4) SRGR module: enhancing relevant similarities.}
	\label{fig:2}
\end{figure*}

\subsection{Semantic Knowledge Graph Construction}

\subsubsection{Visual Semantic Knowledge Graph}
Given a raw image $\bm{I}$, we use Faster-RCNN, pre-trained on Visual Genome \citep{krishna2017visual}, to detect $ n $ ($ {n} = {36} $) prominent regions. Then, we add a fully connected layer to transform them into $d$-dimensional vectors to obtain region representations $ \bm{V}=\left\{\bm{v}_{1}, \bm{v}_{2}, \ldots, \bm{v}_{n}\right\}$.

Formally, we aim to create an undirected, fully connected visual semantic knowledge graph $\bm{G}_{v}=\left\{\bm{V}_{v}, \bm{E}_{v}\right\}$ for each image, with the detected regions set as nodes, and the edges denoted as a matrix $\bm{E}_{v}$. On the one hand, there are spatial dependencies between regions. For example, ``people on the court'' and ``people outside the court'' reflect the spatial location relationship between ``people'' and ``court''. Thus, we follow \cite{norcliffe2018learning} in modeling spatial dependencies between regions using polar coordinates and representing them as a spatial dependence matrix $\bm{P}_{v}$. On the other hand, there are also potential semantic correlations between regions. For example, the semantic information ``playing tennis'' is formed by associating the three regions: ``women'', ``racket'' and ``tennis''.
Therefore, we calculate the semantic correlation matrix $\bm{r}^{v}$ between regions:
\begin{equation}
	\label{eq2}
	\bm{r}_{i j}^{v}=\frac{\exp \left(\lambda \bm{v}_{i}^\mathsf{T} \bm{v}_{j}\right)}{\sum_{k=1}^{n} \exp \left(\lambda \bm{v}_{i}^\mathsf{T} \bm{v}_{k}\right)},
\end{equation}
where $\lambda$ is the scale factor. $\bm{r}_{i j}^{v}$ denotes the correlation between the $i$-th region and the $j$-th region. The visual semantic knowledge graph is made up of the spatial interdependence and the semantic correlations between regions. We calculate the Hadamard product of $ \bm{r}^v $ and $\bm{P}_{v}$, then apply column L2-normalization to obtain the edge matrix $\bm{E}_{v}$.
\begin{equation}
	\label{eq3}
	\bm{E}_{v}=\left\|\bm{r}^{v} \odot \bm{P}_{v}\right\|_{2},
\end{equation}
where $ \left\| \cdot \right\|_{2} $ denotes column L2-normalization.

\subsubsection{Textual Semantic Knowledge Graph}
For a text $\bm{T}$ comprising $m$ words, we first represent each word as a continuous embedding vector. Then the word vectors are embedded into a bi-directional GRU \citep{cho2014learning}. Finally, textual word feature representation is obtained, denoted as $ \bm{T}=\left\{\bm{t}_{1}, \bm{t}_{2}, \ldots, \bm{t}_{m}\right\}$.

To construct a textual semantic knowledge graph $ \mathrm{\bm{G}}_{t}=\left\{\bm{V}_{t}, \bm{E}_{t}\right\} $, we set words as nodes, which are semantically related to each other. To obtain the syntactic dependency matrix $ \bm{P}_t $, we first utilize Stanford CoreNLP to find syntactic dependencies, and add self-loops (the matrix diagonal is 1). We then compute the correlation $ \bm{r}^t $ between words. As in the visual case, $ \bm{E}_t $ is obtained by applying column L2-normalization to the Hadamard product of $ \bm{r}^t $ and $ \bm{P}_t $:
\begin{align}
	%\bm{r}_{i j}^{t}={softmax}_{j}\left(\lambda \bm{t}_{i}^\mathsf{T} \bm{t}_{j}\right), \label{eq4}\\
	\bm{r}_{i j}^{t}=\frac{\exp \left(\lambda \bm{t}_{i}^\mathsf{T} \bm{t}_{j}\right)}{\sum_{k=1}^{m} \exp \left(\lambda \bm{t}_{i}^\mathsf{T} \bm{t}_{k}\right)}, \label{eq4}\\
	\bm{E}_{t}=\left\|\bm{r}^{t} \odot \bm{P}_{t}\right\|_{2}. \label{eq5}
\end{align}

\subsection{Knowledge Graph Iterative Dissemination}
Since both images and texts use the KGID module for knowledge propagation, we first describe the knowledge propagation process on the visual semantic knowledge graph in detail, and then briefly describe its textual counterpart.

\subsubsection{Visual KGID}
Given a visual semantic knowledge graph $\bm{G}_{v}=\left\{\bm{V}_{v}, \bm{E}_{v}\right\}$, its node representation is denoted as $\bm{V}$. We use the edges to associate each node with the other nodes and propagate knowledge, resulting in a new visual semantic association feature map. Then, the softmax function is used to learn each region's knowledge weight coefficients and update the nodes' own features by element-wise multiplication. We thus obtain the ``knowledgeable'' local representation by
\begin{equation}
	\label{eq6}
	\bm{V}_{k}^{(l)}=\rho\left(\bm{A}^{(l)} \bm{V} \bm{W}^{(l)}\right) \odot \bm{V},
\end{equation}
where $ \bm{A}^{(0)}=\bm{E}_{v} $, $ \bm{W}^{(l)} \in \mathbb{R}^{d \times d} $, and $\rho$ is the softmax function.

In order to aggregate and enhance semantic correlations, we design an adaptive gate mechanism with fusion and reconstruction procedures in the process of knowledge propagation and reasoning. We obtain a gating mask by combining $\bm{V}_{k}^{(l)}$ and $\bm{V}$. Then the gating mask is utilized to control the flow of information between $\bm{V}_{k}^{(l)}$ and $\bm{V}$. Therefore, the image feature representation is dynamically updated by
\begin{align}
	g^{(l)}=\sigma\left(\left(\bm{V}_{k}^{(l)} \odot \bm{V}\right) + \bm{V}\right), \label{eq7}\\
	{\bm{V}_{kg}}^{(l)}=g^{(l)} \odot {\bm{V}_{k}}^{(l)} +\left(1-g^{(l)}\right) \odot \bm{V}, \label{eq8}
\end{align}
where $\sigma$ is the sigmoid function, $\odot$ is the Hadamard product. 

Finally, we perform aggregated inference followed by shortcut connection to enhance the local feature representation:
\begin{equation}
	\label{eq9}
	\bm{V}^{(l+1)}=\operatorname{ReLu}\left(\left(\bm{W}_{kg}^{(l)} \bm{V}_{kg}^{(l)}+\bm{b}^{(l)}\right)+\bm{V}\right),
\end{equation}
where $\bm{W}_{kg}^{(l)}\in \mathbb{R}^{d \times d}$. Moreover, we follow \cite{kuang2019fashion} to update the edge matrix $\bm{A}^{(l+1)}$ using the affinity of the new nodes, i.e., we adaptively update the semantic knowledge through the affinity between regions:
\begin{equation}
	\label{eq10}
	\bm{A}_{ij}^{(l+1)}=\operatorname{softmax}_{j}\left(\lambda \left(\bm{W}_{O}^{(l+1)}\bm{v}_{i}^{(l+1)}\right)^{\mathsf{T}} \left(\bm{W}_{I}^{(l+1)}\bm{v}_{j}^{(l+1)}\right)\right),
\end{equation}
where $\bm{W}_{I}^{(l+1)}\in \mathbb{R}^{d \times d}$ and $\bm{W}_{O}^{(l+1)} \in \mathbb{R}^{d \times d}$ are linear transformations of incoming and outgoing nodes, respectively. $\bm{A}^{(l+1)}$ means that if two regions are highly correlated, edges with higher scores will connect the nodes.

We iteratively infer, update, and aggregate the visual semantic knowledge graph in $N$ steps, and use the output of the last step as the local inference representation $\bm{\tilde{v}}$ of the image.

To obtain the global feature, we perform a self-attentive operation on the local region features. Specifically, given the mean-pooled feature $ \bm{v}_{m}=\frac{1}{n} \sum_{i=1}^{n} \bm{\tilde{v}}_{i} $, with $ \bm{v}_{m}$ as the query, and $\bm{\tilde{v}}$ as the key and value, we first calculate the attention weight distribution $ \bm{a}^{v}=\operatorname{softmax}\left(\bm{W}_{1}^{v}\left(\left(\bm{W}_{2}^{v} \bm{\tilde{v}}\right) \odot\left(\bm{W}_{3}^{v} \bm{v}_{m}\right)\right)\right) $ for all regions, where $\bm{W}_{1}^{v}\in \mathbb{R}^{1 \times d}$, $\bm{W}_{2}^{v}\in \mathbb{R}^{n \times d}$ and $\bm{W}_{3}^{v}\in \mathbb{R}^{n \times d}$. Then, the image global feature is obtained by $\bm{\bar{v}}=\frac{1}{n} \sum_{i=1}^{n} \bm{a}_{i}^{v} \bm{\tilde{v}}_{i}$.

\subsubsection{Textual KGID}
Similarly, given a textual semantic knowledge graph $ \bm{G}_{t}=\left\{\bm{V}_{t}, \bm{E}_{t}\right\} $ whose node representation is denoted as $\bm{T}$, we perform an $N$-step iterative propagation and inference to mine fine-grained local features $\bm{\tilde{t}}$. Subsequently, a self-attentive operation is performed to mine the global features $\bm{\bar{t}}$.

\subsection{Multi-Perspective Similarity Representation Learning}
We learn local and global similarity representations by using distance or directional differences between vectors. It enables a more comprehensive understanding of similarity.  

In order to enhance the visual representation, we use the word information from each sentence as cues to attend to all regions in each image. For each image, we first compute the cosine similarity between each region and each word to establish the relationship matrix $\bm{R}$. The softmax function is then used to calculate the attention weights. Finally, we construct the augmented representation of the image associated with the $j$-th word by $\bm{v}_{j}^{t}=\sum_{i=1}^{n} \operatorname{softmax}\left(R_{i j}\right) \bm{\tilde{v}}_{i}$.

\subsubsection{Distance Similarity}
To learn the distance similarity between vectors, we first calculate the vector-based (element-wise) squared Euclidean distance between vectors $\bm{x} \in \mathbb{R}^{d}$ and $\bm{y} \in \mathbb{R}^{d}$ by $\operatorname{dis}(\bm{x},\bm{y})=\left( \bm{x} -\bm{y} \right)^{2}$. Then, the distance similarity function is defined as
\begin{equation}
	\label{eq11}
	d(\bm{x}, \bm{y}, \bm{W}_d)=\frac{\bm{W}_d \cdot \operatorname{dis}(\bm{x},\bm{y})}{\left\|\bm{W}_d \cdot \operatorname{dis}(\bm{x}, \bm{y})\right\|_{2}},
\end{equation}
where $\bm{W}_{d} \in \mathbb{R}^{m \times d}$ is a learnable parameter matrix.

Using Eq.~\eqref{eq11}, we compute the local similarity representation $\bm{d}_{j}^{l}$ between $\bm{v}_{j}^{t}$ and $\bm{\tilde{t}}_{j}$, and then calculate the global similarity representation $\bm{d}^{g}$ between $\bm{\bar{v}}$ and $\bm{\bar{t}}$:
\begin{align}
	\bm{d}_{j}^{l}=d\left(\bm{v}_{j}^{t}, \bm{\tilde{t}}_{j}, \bm{W}^l_{d}\right), \label{eq12}\\
	\bm{d}^{g}=d\left(\bm{\bar{v}}, \bm{\bar{t}}, \bm{W}^g_{d}\right). \label{eq13}
\end{align}

\subsubsection{Direction Similarity}
We consider the similarity representation learning based on the cosine distance between $\bm{x} \in \mathbb{R}^{d}$ and $\bm{y} \in \mathbb{R}^{d}$. Thus, the ``direction'' similarity representation is defined as
\begin{equation}
	\label{eq14}
	\bm{c}(\bm{x}, \bm{y}, \bm{W}_c)=\frac{\bm{W}_c \operatorname{dir} (\bm{x}, \bm{y})}{\left\|\bm{W}_c \operatorname{dir} (\bm{x}, \bm{y})\right\|_{2}},
\end{equation}
where $\operatorname{dir} (\bm{x}, \bm{y})=\left(\bm{x} \cdot \bm{y}\right)/ \left(\|\bm{x}\| \cdot\|\bm{y}\|\right)$.

We calculate the local similarity representation $\bm{c}_{j}^{l}$ between $\bm{v}_{j}^{t}$ and $\bm{\tilde{t}}_{j}$ with Eq.~\eqref{eq14}, and calculate the global similarity representation $\bm{c}^{g}$ between $\bm{\bar{v}}$ and $\bm{\bar{t}}$:
\begin{align}
	\bm{c}_{j}^{l}=\bm{c}\left(\bm{v}_{j}^{t}, \bm{\tilde{t}}_{j}, \bm{W}^l_{c}\right), \label{eq15}\\
	\bm{c}^{g}=\bm{c}\left(\bm{\bar{v}}, \bm{\bar{t}}, \bm{W}^g_{c}\right). \label{eq16}
\end{align}

\subsection{Similarity-Relational Graph Reconstruction}

\subsubsection{Relational Graph Building}
Formally, we construct a directed weighted relational graph of similarity representations. Specifically, we denote all ``distance'' (or ``direction'') similarity representations as graph nodes $ \mathcal{N}=\left\{\bm{s}^g,\bm{s}_1^l,\bm{s}_2^l,\ldots,\bm{s}_m^l \right\}$, where $\bm{s}^g$ denotes $\bm{d}^{g}$ (or $\bm{c}^{g}$) and $\bm{s}^l_j$ denotes $\bm{d}^{l}_j$ (or $\bm{c}^{l}_j$). For any pair of nodes, the relationship from node $\mathbf{s}_{a}$ to node $\mathbf{s}_{b}$ is defined as a variable edge weight by
\begin{equation} \scriptsize
	\label{eq17}
	\begin{array}{l}
		\bm{E}_{({s}_{a},{s}_{b})}\left(\bm{W}_{out}, \bm{W}_{in}\right)=\frac{\sigma\left(B N\left(\mathbf{W}_{o u t} \mathbf{s}_{a}\right) \oplus BN\left(\mathbf{W}_{i n} \mathbf{s}_{b}\right)\right)}{\sum_{s_{i} \in \mathcal{N}} \sigma\left(B N\left(\mathbf{W}_{o u t} \mathbf{s}_{a}\right) \oplus BN\left(\mathbf{W}_{in} \mathbf{s}_{i}\right)\right)}  \\
	\end{array}
\end{equation}
where $\bm{W}_{out} \in \mathbb{R}^{m \times 1}$ and $\bm{W}_{in} \in \mathbb{R}^{1 \times m}$ are the linear transformations of outgoing and incoming nodes, and $\oplus$ indicates concatenation. $R\left(\bm{s}_{a}, \bm{s}_{b}\right)=BN\left(\bm{W}_{out} \bm{s}_{a}\right) \oplus\left(B N\left(\bm{W}_{in} \bm{s}_{b}\right)\right)$ is the trend score of the node-node relationship, and the edge weights $\bm{E}_{({s}_{a},{s}_{b})}\left(\bm{W}_{out}, \bm{W}_{in}\right)$ can be calculated by using the sigmoid function $\sigma$. Note that $\bm{s}_{a} \rightarrow \bm{s}_{b}$ differs from $\bm{s}_{a} \leftarrow \bm{s}_{b}$, i.e., the edges are directed.


\subsubsection{Relational Graph Reconstruction}
We perform a series of processes, such as propagation of similarity relations and gate mechanisms, to achieve the interaction of similar information and the reconstruction of the similarity-relational graph. Since the relational edges are directed, we take the outgoing and incoming inference, respectively, to implement bi-SRGR:
\begin{equation}
	\label{eq18}
	\begin{array}{l}
		\overrightarrow{\tilde{\bm{s}}_{a}}=\sum_{\bm{s}_{\mathrm{b}} \in \mathcal{N}} \overrightarrow{\bm{E}_{(s_{a},s_{b})}}\left(\bm{W}_{out}, \bm{W}_{in}\right) \cdot \bm{s}_{\mathrm{b}}, \\
		\overleftarrow{\tilde{\bm{s}}_{a}}=\sum_{\bm{s}_{\mathrm{b}} \in \mathcal{N}} \overleftarrow{\bm{E}_{(s_{a},s_{b})}}\left(\bm{W}_{in}, \bm{W}_{out}\right) \cdot \bm{s}_{\mathrm{b}},
	\end{array}
\end{equation}
where $\bm{W}_{out} \in \mathbb{R}^{m \times m}$ and $\bm{W}_{in} \in \mathbb{R}^{m \times m}$. The outgoing and incoming edge weights of $\bm{s}_{a}$ are denoted by $\overrightarrow{\bm{E}_{(s_{a},s_{b})}}\left(\bm{W}_{out}, \bm{W}_{in}\right)$ and $\overleftarrow{\bm{E}_{(s_{a},s_{b})}}\left(\bm{W}_{in}, \bm{W}_{out}\right)$, respectively. $\overrightarrow{\tilde{\bm{s}}_{a}}$ and $\overleftarrow{\tilde{\bm{s}}_{a}}$ denote the results of propagating all similarity information along the outgoing and incoming edges of node $\bm{s}_{a}$, respectively, both of which contain the same node $\bm{s}_{a}$.

Furthermore, to improve the quality of dynamic decision-making, we propose a conditional selection strategy to adaptively filter node information and suppress unnecessary information. Specifically, $\overrightarrow{\tilde{\bm{s}}_{a}}$ and $\overleftarrow{\tilde{\bm{s}}_{a}}$ are first concatenated, followed by a fully connected layer and a sigmoid function to obtain the conditional mask,
\begin{equation}
	\label{eq19}
	\tilde{g}=\sigma\left(\widetilde{\bm{W}}\left(\overrightarrow{\tilde{\bm{s}}_{a}} \oplus \overleftarrow{\tilde{\bm{s}}_{a}}\right)+\tilde{\bm{b}}\right).
\end{equation}
Then, we use the generated conditional mask to control the information flow of the original $\overrightarrow{\tilde{\bm{s}}_{a}}$ and $\overleftarrow{\tilde{\bm{s}}_{a}}$, followed by a shortcut connection to achieve an adaptively filtered and enhanced similarity representation. The reconstructed $\overrightarrow{\bm{s}_{a}^{*}}$ and $\overleftarrow{\bm{s}_{a}^{*}}$ are thus obtained by
\begin{equation}
	\label{eq21}
	\begin{array}{l}
		\overrightarrow{\bm{s}_{a}^{*}}={\bm{W}}^*_{1}\left(\tilde{g} \odot \overrightarrow{\bm{\tilde{s}}_{a}}\right)+\overrightarrow{\bm{\tilde{s}}_{a}}, \\ \overleftarrow{\bm{s}_{a}^{*}}={\bm{W}}^*_{2}\left(\tilde{g} \odot \overleftarrow{\bm{\tilde{s}}_{a}}\right)+\overleftarrow{\bm{\tilde{s}}_{a}}.
	\end{array}
\end{equation}

Furthermore, we aggregate $\overrightarrow{\bm{s}_{a}^{*}}$ and $\overleftarrow{\bm{s}_{a}^{*}}$, followed by a fully connected layer, which is formulated as
\begin{equation}
	\label{eq22}
	\bm{s}_{a}^{*}=\bm{W}^*\left(\overrightarrow{\bm{s}_{a}^{*}}+\overleftarrow{\bm{s}_{a}^{*}}\right)+\bm{b}^*,
\end{equation}
where $\bm{W}^* \in \mathbb{R}^{m \times m}$. Finally, we feed $\bm{s}_{a}^{*}$ into a fully connected layer to predict the final similarity score.

\subsection{Training Objectives and Inference Strategies}
We employ the bidirectional triplet ranking loss as the objective function. Given a representation of the matched image-text pair $(\bm{v}, \bm{t})$, its corresponding negative pairs are denoted as $(\bm{v}^{-}, \bm{t})$ and $(\bm{v}, \bm{t}^-)$. We compute the loss with
\begin{equation}
	\label{eq23}
	\begin{array}{c}
		\mathcal{L}_{\text{dis}}(\bm{v}, \bm{t})=\sum_{(\bm{v}, \bm{t})}\left\{\max \left[0, \gamma-\mathcal{\bm{S}}_{\text{dis}}(\bm{v}, \bm{t})+\mathcal{\bm{S}}_{\text{dis}}\left(\bm{v}, \bm{t}^{-}\right)\right]\right. \\
		\left.+\max \left[0, \gamma-\mathcal{\bm{S}}_{\text{dis}}(\bm{v}, \bm{t})+\mathcal{\bm{S}}_{\text{dis}}\left(\bm{v}^{-}, \bm{t}\right)\right]\right\},
	\end{array}
\end{equation}
where $\gamma$ is the margin hyperparameter and $\mathcal{\bm{S}}_{\text{dis}}(\bm{v}, \bm{t})$ is the similarity prediction function based on the ``distance'' similarity representation. Similarly, we define the ranking loss of MPSM (dir) as $\mathcal{L}_{\text{dir}}$.

In this paper, we use the proposed ``distance'' and ``direction'' similarity representations to investigate two training and inference strategies: joint training and independent training. For joint training, we combine $\mathcal{L}_{\text {dis}}$ and $\mathcal{L}_{\text {dir}}$ to train our MPSM model, i.e., we combine the ``distance'' and the ``direction'' similarity representations for training. For independent training, we train two single models: MPSM (dis), based on the ``distance'' similarity representation, and MPSM (dir), based on the ``direction'' similarity representation. Then, in the inference phase, we average the similarities predicted by MPSM (dis) and MPSM (dir) for retrieval evaluation.

\section{Experiments}

\subsection{Datasets and Implementation Details}
The Flickr30K dataset \citep{plummer2015flickr30k} and the MS-COCO dataset \citep{lin2014microsoft} (1K and 5K test sets) are used to validate the effectiveness of our proposed method. We utilize the typical Recall@K (K=1, 5, 10) as the performance evaluation metric. We train our model with the Adam optimizer for 30 epochs on the Flickr30K dataset and 20 epochs on the MS-COCO dataset. The dimensionality of the similarity representation is set to 256, and the other parameters are set to $l=3$, $\gamma=0.2$, and $\lambda=9$.

\subsection{Quantitative Results}
We compare the proposed MPSM with several state-of-the-art baselines. Note that the majority of these models are ensemble models. Therefore, we provide two versions of MPSM, MPSM (dis) and MPSM (dir), which are based on the ``distance'' and the ``direction'' similarity representations, respectively. Then, we integrate them by averaging their similarity scores and denote the resulting ensemble as MPSM*.

\subsubsection{Results on Flickr30K Dataset}
The quantitative results on the Flickr30K dataset are shown in Table~\ref{tab:1}, and it can be observed that our MPSM model outperforms the state-of-the-art in most assessment measures. Our method outperforms GSMN in all metrics. Unlike GSMN, our approach propagates and aggregates semantic knowledge, rather than performing image-text matching directly. Furthermore, we simulate the interaction of global and local alignments, which yields more comprehensive cross-modal correlations. The improvements show that propagating semantic information to learn fine-grained intra-modal correlations and incorporating them into cross-modal similarity learning improves matching performance significantly. Our proposed method outperforms other models that use the same word feature learning method (i.e., bi-GRU). Compared to CAMERA, our method achieves relative R@1 gains of 2.2\% and 1.2\% for I2T and T2I matching, respectively. However, our method falls behind CAMERA by 1.4\% and 1.6\% in R@5 and R@10 for T2I matching, respectively. This could be because CAMERA employs a pre-trained BERT, which learns feature representations of words from a massive corpus and thus has powerful language representation and sentence processing capabilities.
\begin{table*}[t]  \scriptsize
	\centering  	
	\begin{threeparttable}
		\caption{Results on Flickr30K and MSCOCO. * indicates an ensemble result. The best result is marked in bold.}
		\label{tab:1}
		\setlength{\tabcolsep}{1.1mm}{
			\begin{tabular}{lcccccccccccccccccc}
				\toprule			
				\multirow{3}[2]{*}{Method} & \multicolumn{6}{c}{Flickr30K dataset} & \multicolumn{6}{c}{MSCOCO 1K Test Set} & \multicolumn{6}{c}{MSCOCO 5K Test Set}\\
				\cmidrule(lr){2-7} \cmidrule(lr){8-13} \cmidrule(lr){14-19}
				& \multicolumn{3}{c}{ Image to Text} &\multicolumn{3}{c}{ Text to Image}  & \multicolumn{3}{c}{ Image to Text} &\multicolumn{3}{c}{ Text to Image} & \multicolumn{3}{c}{ Image to Text} &\multicolumn{3}{c}{ Text to Image} \\
				
				&\multicolumn{1}{l}{R@1}	&\multicolumn{1}{l}{R@5}  & \multicolumn{1}{l}{R@10} 		& \multicolumn{1}{l}{R@1}	&\multicolumn{1}{l}{R@5}  & \multicolumn{1}{l}{R@10}	 & \multicolumn{1}{l}{R@1} &\multicolumn{1}{l}{R@5}  & \multicolumn{1}{l}{R@10} & \multicolumn{1}{l}{R@1}	&\multicolumn{1}{l}{R@5}  & \multicolumn{1}{l}{R@10}    & \multicolumn{1}{l}{R@1} 	&\multicolumn{1}{l}{R@5} & \multicolumn{1}{l}{R@10} 	
				& \multicolumn{1}{l}{R@1}	&\multicolumn{1}{l}{R@5}  & \multicolumn{1}{l}{R@10}\cr
				\midrule
				
				VSE++ \citep{DBLP:conf/bmvc/FaghriFKF18}            & 52.9 & 79.1 & 87.2 & 39.6 & 69.6 & 79.5 & 64.6 & 90.0 & 95.7 & 52.0 & 84.3 & 92.0 & 41.3 & 71.1 & 81.2 & 30.3 & 59.4 & 72.4  \\
				MTFN \citep{wang2019matching}            & 65.3 & 88.3 & 93.3 & 52.0 & 80.1 & 86.1 & 74.3 & 94.9 & 97.9 & 60.1 & 89.1 & 95.0 & 48.3 & 77.6 & 87.3 & 35.9 & 66.1 & 76.1  \\
				SCAN* \citep{lee2018stacked}            & 67.4 & 90.3 & 95.8 & 48.6 & 77.7 & 85.2 & 72.7 & 94.8 & 98.4 & 58.8 & 88.4 & 94.8 & 50.4 & 82.2 & 90.0 & 38.6 & 69.3 & 80.4  \\
				VSRN* \citep{li2019visual}            & 71.3 & 90.6 & 96.0 & 54.7 & 81.8 & 88.2 & 76.2 & 94.8 & 98.2 & 62.8 & 89.7 & 95.1 & 53.0 & 81.1 & 89.4 & 40.5 & 70.6 & 81.1  \\
				IMRAM* \citep{chen2020imram}          & 74.1 & 93.0 & 96.6 & 53.9 & 79.4 & 87.2 & 76.7 & 95.6 & 98.5 & 61.7 & 89.1 & 95.0 & 53.7 & 83.2 & 91.0 & 39.7 & 69.1 & 79.8  \\
				MMCA \citep{wei2020multi}             & 74.2 & 92.8 & 96.4 & 54.8 & 81.4 & 87.8 & 74.8 & 95.6 & 97.7 & 61.6 & 89.8 & 95.2 & 54.0 & 82.5 & 90.7 & 38.7 & 69.7 & 80.8  \\
				GSMN* \citep{liu2020graph}            & 76.4 & 94.3 & 97.3 & 57.4 & 82.3 & 89.0 & 78.4 & 96.4 & 98.6 & 63.3 & 90.1 & 95.7 & -    & -    & -    & -    & -    & -     \\
				CAMERA* \citep{qu2020context}   &78.0 &\bf{95.1}	&97.9	&60.3 &\bf{85.9}	&{\bf91.7}	&77.5	&96.3	&98.8		&63.4 &90.9	&95.8	&55.1  &82.9		&91.2		&40.5 &71.7 &82.5\cr
				SMFEA \citep{ge2021structured}            & 73.7 & 92.5 & 96.1 & 54.7 & 82.1 & 88.4 & 75.1 & 95.4 & 98.3 & 62.5 & 90.1 & 96.2 & 54.2 & -    & 89.9 & 41.9 & -    & 83.7  \\
				CASC \citep{xu2020cross}             & 68.5 & 90.6 & 95.9 & 50.2 & 78.3 & 86.3 & 72.3 & 96.0 & 99.0 & 58.9 & 89.8 & 96.0 & 47.2 & 78.3 & 87.4 & 34.7 & 64.8 & 76.8  \\
				SHAN* \citep{ji2021step}            & 74.6 & 93.5 & 96.9 & 55.3 & 81.3 & 88.4 & 76.8 & 96.3 & 98.7 & 62.6 & 89.6 & 95.8 & -    & -    & -    & -    & -    & -     \\
				SGRAF* \citep{diao2021similarity}           & 77.8 & 94.1 & 97.4 & 58.5 & 83.0 & 88.8 & 79.6 & 96.2 & 98.5 & 63.2 & 90.7 & 96.1 & 57.8 & -    & 91.6 & 41.9 & -    & 81.3  \\
				
				\midrule
				\bf{MPSM (dis)}	& 77.5 & 94.0 & 97.0 & 58.7 & 83.6 & 89.1 & 78.4 & 96.0 & 98.5 & 63.1 & 90.0 & 95.6 & 58.1 & 84.3 & 91.4 & 41.5 & 70.9 & 81.4\cr
				
				\bf{MPSM (dir)}	& 76.8 & 94.3 & 97.0 & 57.3 & 82.9 & 88.9 & 78.4 & 96.3 & 98.8 & 63.5 & 90.4 & 95.8 & 57.5 & 84.4 & 91.7 & 41.7 & 71.2 & 81.5\cr
				
				\bf{MPSM*} &{\bf 80.2}	& 94.9	&{\bf 98.0} &{\bf 61.5}	& 84.5	&90.1 	&{\bf80.9}	& \bf{96.5}	&{\bf99.0}		&{\bf65.0}	& \bf{91.1}	&{\bf96.1}	&{\bf60.3}	& \bf{86.1}	&{\bf92.5}		&{\bf43.5}	& \bf{72.8}	&{\bf82.8}\cr	
				
				\bottomrule
		\end{tabular}}
	\end{threeparttable}
\end{table*}

When compared to SGRAF, a multi-level alignment learning method that also employs vector-based similarity representation, our method achieves relative R@1 gains of 2.4\% and 3.0\% for I2T and T2I matching, respectively. Unlike SGRAF, we model the vector-based similarity representation from two perspectives: distance and direction. Furthermore, our SRGR module makes the visual-semantic correspondence more fine-grained. Moreover, our KGID module provides rich semantic information within modalities. These improvements demonstrate that learning similarity from multiple perspectives can help with cross-modal alignment.

It is worth mentioning that, compared with MPSM (dis) and MPSM (dir), MPSM* improves R@1 by 2.7\% and 3.4\% in I2T retrieval and by 2.8\% and 4.2\% in T2I retrieval, respectively. This demonstrates that the MPSM (dis) and MPSM (dir) models can complement and enhance each other, allowing for a more comprehensive exploration of the correspondence between modalities. Furthermore, the retrieval performance of our single models is very competitive, demonstrating the effectiveness of our method.

\subsubsection{Results on MSCOCO Dataset}
Table~\ref{tab:1} shows quantitative results for the larger and more complex dataset MSCOCO (1K and 5K test sets). Our MPSM surpasses existing approaches in all metrics on the 1K test set. Our method outperforms GSMN in all metrics. Compared to SGRAF, our MPSM improves R@1 by 1.3\% and 1.8\% on I2T and T2I retrieval, respectively. The gain in our method's performance on R@5 and R@10 is not as large as it is on R@1, which could be owing to the presence of more interference sources in a large target set for a particular query. Our MPSM maintains its superiority on the 5K test set. Our model outperforms SGRAF by 2.5\% and 1.6\% in R@1 for I2T and T2I retrieval, respectively.

\subsection{Ablation Studies and Analysis}

\subsubsection{Impact of Different Network Structures}
We compare MPSM (dis) and the integrated model MPSM* with four other models (all based on the ``distance'' similarity representation): (1) w/o KGID denotes the removal of the whole KGID module from the model; (2) w/o V-KGID denotes the absence of the visual KGID module; (3) w/o T-KGID denotes the absence of the textual KGID module; and (4) w/o SRGR denotes the absence of the SRGR module.

As shown in Table~\ref{tab:2}, both MPSM (dis) and MPSM* outperform these four types of models. Specifically, when we remove the KGID module, the model performance suffers, which justifies the usage of modality-specific semantic information to investigate fine-grained semantic correlations and modal representations. Note that the performance of w/o V-KGID and w/o T-KGID is better than that of w/o KGID, owing to the inclusion of semantic information from text or image, which can help with cross-modal correspondence investigation. The performance of MPSM (dis) is superior to that of w/o SRGR, demonstrating the effectiveness of the SRGR module. It also demonstrates that in-depth exploration of cross-modal similarity relations can facilitate aggregation and enhance similarity for more accurate matching.
\begin{table}[htb] \scriptsize
	\centering  	
	\begin{threeparttable}
		\caption{Impact of different structures on Flickr30K.}
		\label{tab:2}
		\setlength{\tabcolsep}{2.5mm}{
			\begin{tabular}{lcccccc}
				\toprule
				\multirow{2}{*}{Model}&
				\multicolumn{3}{c}{ Image to Text}&\multicolumn{3}{c}{Text to Image}
				\\ %&\multirow{2}{*}{Rsum}\cr
				\cmidrule(lr){2-4} \cmidrule(lr){5-7}
				&R@1&R@5&R@10&R@1&R@5&R@10\cr
				\midrule
				w/o KGID	&75.8	&93.2	&96.3		&57.0	&81.5	&88.4	\\
				w/o T-KGID	&76.2	&93.9	&96.9		&56.6	&82.5	&88.1 \\
				w/o V-KGID	&75.2	&94.0	&96.6		&56.8	&82.7	&88.7 \\
				w/o SRGR	&75.2	&93.7	&97.1		&57.7	&82.1	&89.0 \\
				MPSM  (dis)	&77.5	&94.0	&97.0	&58.7	&83.6 	&89.1 \\
				
				MPSM*	&80.2	&94.9	&98.0		&61.5	&84.5	&90.1 \\%	
				\bottomrule
		\end{tabular}}
	\end{threeparttable}
\end{table}

\subsubsection{Impact of Different KGID Layers}
We investigated the impact of the number of KGID layers on performance, gradually increasing the number of KGID layers from 0 to 4 for training and evaluation. As can be seen in Table~\ref{tab:3}, increasing the number of KGID layers improves performance. The model performs best when the number of KGID layers is increased to 3, demonstrating that iteratively propagating semantic knowledge is effective in boosting performance. This is because, during knowledge dissemination, the KGID module may integrate nodes' domain information and build connections with related nodes. The performance degrades as the number of KGID layers increases to 4. This could be due to the fact that as the network grows deeper, the noise level rises in tandem with the number of connected nodes, interfering with the learnt correspondence. As a result, we finally set the KGID module to 3 layers.
\begin{table}[htb] \scriptsize
	\centering  	
	\begin{threeparttable}
		\caption{Impact of different KGID layers on Flickr30K.}
		\label{tab:3}
		\setlength{\tabcolsep}{2.5mm}{
			\begin{tabular}{lcccccc}
				\toprule
				\multirow{2}{*}{Model}&
				\multicolumn{3}{c}{ Image to Text}&\multicolumn{3}{c}{Text to Image} \\%&\multirow{2}{*}{Rsum}\cr
				\cmidrule(lr){2-4} \cmidrule(lr){5-7}
				&R@1&R@5&R@10&R@1&R@5&R@10\cr
				\midrule
				w/o KGID	&75.8	&93.2	&96.3		&57.0	&81.5	&88.4 \\
				1KGID	&75.4	&93.7	&96.5		&57.6	&82.9	&88.8 \\	
				2KGID	&75.9	&93.8	&96.5		&57.8 	&82.9	&88.9	\\
				3KGID	&\bf{77.5}	&\bf{94.0}	&\bf{97.0}		&\bf{58.7}		&\bf{83.6}	&\bf{89.1} 	\\
				4KGID	&76.4	&93.6	&96.9		&56.9	&82.5	&88.3	\\
				\bottomrule[1 pt]
		\end{tabular}}
	\end{threeparttable}
\end{table}

\subsubsection{Impact of Different Alignment Strategies}
We investigated three alignment strategies: (1) the global alignment learning strategy, which implies that only global alignment is used in the model; (2) the local alignment learning strategy, which implies that only local alignment is used; and (3) the multi-level alignment learning strategy, 
\begin{figure}[htbp]
	\centering
	\subfigure[Rsum results on Flickr30K]{
		\includegraphics[width=1.5in]{fig3(a).pdf}
	}
	\subfigure[Rsum results on MSCOCO]{
		\includegraphics[width=1.5in]{fig3(b).pdf}
	}
	\caption{Comparison of Rsum results on the Flickr30K and MSCOCO 1K test sets with different alignment strategies.} % overall figure caption
	\label{fig:3}
\end{figure}
which indicates that global and local alignment are combined. As shown in Figure~\ref{fig:3}, the model's performance decreases dramatically when only the global correspondence is considered, without taking into account the relationship between local and global. Moreover, when compared to local alignment learning, the multi-level alignment learning strategy achieves superior performance. It demonstrates that global and local alignments can complement each other's semantic information to achieve more accurate matching through the interaction between ``part'' and ``whole''.

\subsubsection{Impact of Training Strategies}
We design two different training strategies, ``joint training'' and ``independent training with integration'', and compare them. From Table~\ref{tab:5}, we can see that the ``independent training with integration'' strategy achieves better performance than the ``joint training'' strategy. On the one hand, training a single learner tends to cause underfitting or overfitting, resulting in insufficient generalization ability of the joint training strategy. Instead, we train MPSM (dis) and MPSM (dir) separately, and integrate them by averaging their predictions so that they complement each other, resulting in an ensemble model with superior generalization performance. On the other hand, the ``distance'' similarity representation focuses on measuring the magnitude of similarity while ignoring directional differences between images and text; the ``direction'' similarity representation distinguishes the difference between vectors more by direction than by numerical value, and thus fails to quantify the image-text correspondence finely. However, they are complementary. Thus, the ensemble model MPSM* is based on both the ``distance'' and ``direction'' similarity representations, which can facilitate the exploration of fine-grained cross-modal correspondences.
\begin{table}[htb] \scriptsize
	\centering  	
	\begin{threeparttable}
		\caption{Impact of different training strategies.}
		\label{tab:5}
		\setlength{\tabcolsep}{1.0mm}{
			\begin{tabular}{ccccccccc}
				\toprule
				\multirow{2}{*}{Dataset}&\multirow{2}{*}{distance}&\multirow{2}{*}{direction}&\multirow{2}{*}{Joint}&\multirow{2}{*}{Split}&
				\multicolumn{2}{c}{ Image to Text}&\multicolumn{2}{c}{Text to Image}\cr
				\cmidrule(lr){6-7} \cmidrule(lr){8-9}
				&&&&&R@1&R@10&R@1&R@10\cr
				\midrule
				\multirow{4}{*}{Flickr30K}
				&\checkmark &  & &	&77.5	&97.0		&58.7	&89.1\cr
				& &\checkmark  & &	&76.8	&97.0		&57.3	&88.9\cr
				&\checkmark &\checkmark &\checkmark &	&77.1	&97.2	&59.3	&88.7\cr
				&\checkmark &\checkmark & &\checkmark	&\bf{80.2}	&\bf{98.0}	&\bf{61.5}	&\bf{90.1}\cr
				\midrule
				\multirow{4}{*}{MSCOCO 1K}
				&\checkmark &  & &	&78.4	&98.5		&63.1	&95.6\cr
				& &\checkmark  & &	&78.4	&98.8		&63.5	&95.8\cr
				&\checkmark &\checkmark &\checkmark &	&79.5	&\bf{99.0}	&63.6	&95.9\cr
				&\checkmark &\checkmark & &\checkmark	&\bf{80.9}	&\bf{99.0}	&\bf{65.0} &\bf{96.1}\cr
				\bottomrule
		\end{tabular}}
	\end{threeparttable}
\end{table}

\subsection{Qualitative Results and Analysis}
Furthermore, we show the qualitative results of I2T and T2I retrieval on Flickr30K in Figure~\ref{fig:4}. For the I2T retrieval in (a), we show the top-5 retrieved sentences based on our predicted similarity score ranking. 
\begin{figure}[htbp]
	\centering
	\subfigure[image-to-text matching]{
		\includegraphics[width=3.2in]{fig4(a).pdf}
	}
	\subfigure[text-to-image matching]{
		\includegraphics[width=3.2in]{fig4(b).pdf}
	}
	\caption{Visualization of image-text retrieval on Flickr30K.} 
	\label{fig:4}
\end{figure}
Our model can retrieve almost all sentences that match the query image; even the incorrect instances have some similarity. For example, the ``men'' region corresponds to the word ``men'', and the men are wearing uniforms, which corresponds to the phrase ``men in uniform''. Thus, the semantics of the matched sentence ``5'' and the query image are almost identical. This is due to the KGID module, which investigates fine-grained correlations between fragments. In addition, MPSM considers similarity from multiple perspectives, and the SRGR module explores more comprehensive similarity for more precise matching. As for the T2I retrieval, we show the top-3 retrieved images and mark the correct results with green boxes. The top-1 image is the ground truth, and all other results are close to the sentence's semantics. These results demonstrate our model's ability to perform finer-grained matching.

\section{Conclusions}
In this paper, we propose a fine-grained matching network with Multi-Perspective Similarity Modeling (MPSM) for cross-modal retrieval. Specifically, we develop a knowledge graph iterative dissemination module that iteratively propagates semantic knowledge to capture fine-grained intra-modal correlations and modal representations. Then, from multiple perspectives, we learn vector-based similarity representations to adequately learn multi-level correspondences. Further, we design a relationship graph reconstruction module that focuses on aggregating and improving the similarity between similar modalities in order to obtain more accurate matches. Experiments on the Flickr30K and MS-COCO datasets demonstrate the superiority of our network.


\begin{acknowledgements} 
	This work is supported by the National Natural Science Foundation of China (Nos. 61966004, 61866004), Guangxi Natural Science Foundation (No. 2019GXNSFDA245018), Innovation Project of Guangxi Graduate Education (YCSW2022155), Guangxi ``Bagui Scholar'' Teams for Innovation and Research Project, Guangxi Talent Highland Project of Big Data Intelligence and Application, and Guangxi Collaborative Innovation Center of Multi-source Information Integration and Intelligent Processing. (Corresponding author: Zhixin Li.)
\end{acknowledgements}

\bibliography{xie_132}

\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}


\end{document}
