%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{times}
\usepackage{soul}
\usepackage{url}
%\usepackage[hidelinks]{hyperref}
\usepackage[utf8]{inputenc}
\usepackage{caption}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{bbold}
\usepackage{physics}
\usepackage{multirow}
\usepackage{float}
% these packages are added by Kelvin
\usepackage{amsthm,amssymb} % define proof environment
\usepackage{soul} % to strike through text
%\usepackage[pagebackref,breaklinks,colorlinks]{hyperref}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}


%to IEEE algorithm format
%\usepackage{algorithm}
%\usepackage{algorithmic}
%\usepackage{algpseudocode}
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
\newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{blue}{#1}}
\SetCommentSty{mycommfont}

% Support for easy cross-referencing
\usepackage[capitalize]{cleveref}
\crefname{section}{Sec.}{Secs.}
\Crefname{section}{Section}{Sections}
\Crefname{table}{Table}{Tables}
\crefname{table}{Tab.}{Tabs.}

\newtheorem{innercustomgeneric}{\customgenericname}
\providecommand{\customgenericname}{}
\newcommand{\newcustomtheorem}[2]{%
  \newenvironment{#1}[1]
  {%
   \renewcommand\customgenericname{#2}%
   \renewcommand\theinnercustomgeneric{##1}%
   \innercustomgeneric
  }
  {\endinnercustomgeneric}
}
\newcustomtheorem{customthm}{Theorem}
\newcustomtheorem{customlem}{Lemma}
\newcustomtheorem{customexa}{Example}
\newcustomtheorem{customrem}{Remark}

\usepackage{soul}
\usepackage{comment}
\usepackage{xcolor}
\newcommand{\q}[1]{\textcolor{red}{#1}}
\newcommand{\g}[1]{\textcolor{blue}{#1}}
\newcommand{\kl}[1]{\textcolor{orange}{#1}}

% the following package is optional:
%\usepackage{latexsym}

% See https://www.overleaf.com/learn/latex/theorems_and_proofs
% for a nice explanation of how to define new theorems, but keep
% in mind that the amsthm package is already included in this
% template and that you must *not* alter the styling.
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}
\newtheorem{lemma}{Lemma}
\newtheorem{remark}{Remark}
\newtheorem{property}{Property}
\newtheorem{proposition}{Proposition}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 



%\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Contrastive Learning for Supervised Graph Matching \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<gathika.ratnayaka@anu.edu.au>?Subject=Your UAI 2023 paper}{Gathika~Ratnayaka}{}}
\author[1]{{Qing~Wang}}
\author[2]{{Yang~Li}}

% Add affiliations after the authors
\affil[1]{%
    School of Computing\\
    Australian National University\\
    Canberra, Australia
}
\affil[2]{%
    School of Information Technology\\
    Deakin University\\
    Melbourne, Australia
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle
\appendix


\section{Proofs}
\label{apx:proofs}
 
 \begin{customlem}{1}
   Given a Sinkhorn ranking $\mathbf{S}$, Algorithm 1 can always return a matching $\mathbf{M}$ that satisfies the one-to-one mapping constraint: $\sum_{j=1}^{n} \mathbf{M}_{i,j}= 1 $ $\forall i\in V_S$ and $\sum_{i=1}^{m} \mathbf{M}_{i,j}\leq 1$ $\forall {j\in V_T}$.
 \end{customlem}
 
 \begin{proof} We first prove that such a matching $\mathbf{M}$ satisfies $\sum_{j=1}^{n} \mathbf{M}_{i,j}= 1 $ $\forall i\in V_S$ in two steps:

 \begin{itemize}
\item[(1)] We show that each node in $V_S$ can be matched to at most one node in $V_T$. In Algorithm 1, if a node $i \in V_S$ is already matched with some node in $V_T$, the node $i$ is excluded from any further consideration by the condition on Line 3. Further, even if a node $i\in V_S$ is not matched, the algorithm only matches the node $i$ with at most one node in $V_T$ during Lines 4--11. Thus, each node $i \in V_S$ can never be matched to two or more nodes in $V_T$.


\item[(2)] Now we prove by contradiction that each node in $V_S$ must be matched to at least one node in $V_T$. Assume that there exists a node $i\in V_S$ which is not matched with any node in $V_T$. Then, by Algorithm 1, each time the highest ranked node (say $j\in V_T$) is selected from $i$'s preference order, this highest ranked node $j$ does not meet the conditions specified in Line 5 and Line 9. Thus, such a node $j$ is removed from the preference order of $i$ (Line 12). As the node $i$ is not matched, the algorithm must have attempted to match the node $i$ with every node in $V_T$, and each node in $V_T$ failed to meet the conditions specified in Line 5 and Line 9. However, we know that $|V_T|\geq|V_S|$ and each node in $V_S$ can be matched to at most one node in $V_T$ (see the proof in the first step). Hence, the other $|V_S|-1$ nodes in $V_S$ occupy at most $|V_S|-1<|V_T|$ nodes in $V_T$, so at least one node in $V_T$ is unmatched and would have met the condition in Line 5. Thus, this is impossible, contradicting the assumption.
\end{itemize}


We then prove that such a matching $\mathbf{M}$ satisfies $\sum_{i=1}^{m} \mathbf{M}_{i,j}\leq 1$ $\forall {j\in V_T}$ by showing that any  node $j \in V_T$ can be matched with at most one node in $V_S$:

\begin{itemize}
\item  In Algorithm 1, a node $j\in V_T$ can only be matched to a node $i\in V_S$ if node $j$ is not matched yet (Line 5) or if node $j$ ranks $i$ with a higher preference order than its currently matched node $i'\in V_S$ (Line 9). In the first case, the node $j$ is guaranteed to be only matched to $i$. In the second case, the match between $i'$ and $j$ is broken ($\mathbf{M}_{i',j} \leftarrow 0$) before matching $i$ with $j$. These ensure that any node $j \in V_T$ can only be matched with at most one node in $V_S$.
\end{itemize}

Putting the above together, the proof is done.

\end{proof}

\begin{customthm}{1}
  Let $\mathbf{M}$ be a matching returned by Algorithm 1 over a Sinkhorn ranking $\mathbf{S}$. Then $\mathbf{M}$ is stable w.r.t.\ $\mathbf{S}$.
\end{customthm}
\begin{proof}
We prove it by contradiction. Let us assume that a matching $\mathbf{M}$ returned by Algorithm 1 over a Sinkhorn ranking $\mathbf{S}$ is unstable. Then, by Definition 2, there must exist at least one blocking pair $(i,j)$ with $i\in V_S$ and $j\in V_T$ in $\mathbf{M}$ such that the following conditions are all satisfied:

\begin{enumerate}
    \item[(1)] $\mathbf{M}_{i,j}=0$;
    
    \item[(2)] $\exists k\in V_S\backslash\{i\}$ s.t. $\mathbf{M}_{k,j}=1$ and $\mathbf{S}_{i,j}>\mathbf{S}_{k,j}$;
    
    \item[(3)] $\exists k'\in V_T\backslash\{j\}$ s.t. either $\mathbf{M}_{i,k'}=1$ and $\mathbf{S}_{i,j}>\mathbf{S}_{i,k'}$, or $\mathbf{M}_{i,k'}=0$ for all $k'\in V_T\backslash\{j\}$ .

\end{enumerate}

By Condition (1), we know that node $i$ is not matched with node $j$. By Condition (2), there exists another node $k \in V_S$ which is ranked lower than node $i$ in $j$'s preference order but is matched with node $j$. As per Algorithm 1 (Lines 8--11), if $i \in V_S$ attempts to match with $j \in V_T$, the matching attempt would be successful as node $i$ is ranked higher than node $k$ in $j$'s preference order, unless node $i$ is already matched with some node $k' \in V_T\backslash\{j\}$ that is ranked higher than $j$ in $i$'s preference order. However, by Condition (3), this is not possible.



Therefore, $\mathbf{M}$ does not have any blocking pair, contradicting the assumption. The proof is done.


\end{proof} 

\begin{customthm}{2}
\label{theorem:2}
The contrastive loss of Eq. 8 is both positive-informative and negative-informative.
\end{customthm}
\begin{proof}
We prove that the contrastive loss of Eq. 8 has Property 1 and Property 2 in turn.

\begin{itemize}
\item \emph{Property 1}: 
    Given two nodes $\{i,i'\}\subseteq V_S$ with $f^-(i)=f^-(i')$ and $f^+(i)>f^+(i')$, we know that $\frac{f^{+}(i)}{1+f^{-}(i)} > \frac{f^{+}(i')}{1+f^{-}(i')}$. This leads to

    \begin{equation} 
    -\ln \left(\frac{f^{+}(i)}{1+f^{-}(i)}\right) < -\ln \left(\frac{f^{+}(i')}{1+f^{-}(i')}\right).
    \end{equation}
 
Hence, $L_i<L_{i'}$ holds. Property 1 is proved.

\item \emph{Property 2} :
Given two nodes $\{i,i'\}\subseteq V_S$ with $f^+(i)=f^+(i')$ and $f^-(i)>f^-(i')$, we have $\frac{f^{+}(i)}{1+f^{-}(i)} < \frac{f^{+}(i')}{1+f^{-}(i')}$, and hence 

    \begin{equation}
    -\ln \left(\frac{f^{+}(i)}{1+f^{-}(i)}\right) > -\ln \left(\frac{f^{+}(i')}{1+f^{-}(i')}\right).
    \end{equation}
  
 This proves Property 2, i.e., $L_i>L_{i'}$. 
\end{itemize}
    
\end{proof}





\begin{customthm}{3}
Let $\mathbf{M}$ be a matching produced by StableGM. Then $\mathbf{M}$ is dual-optimal if $\mathbf{S}$ is R1-symmetric. 




\end{customthm}


\begin{proof} 



If $\mathbf{S}$ is R1-symmetric, by the first condition of Definition 5, we know that for any given node $i \in V_S$, there must exist one node $j \in V_T$ such that $\mathbf{S}_{i,j}>\mathbf{S}_{i,k}$ holds for all other nodes $ k\in V_T\backslash\{j\}$. This implies that node $j$ is the highest ranked node in $i$'s preference order (Line 2 in Algorithm 1). Then, according to Algorithm 1 (Lines 3-11), when node $i$ attempts to be matched (Line 3), there are two cases: 
\begin{itemize}
    \item[(a)] if node $j$ is not matched yet, then node $i$ is matched with node $j$ (Line 6 in Algorithm 1);
    \item[(b)] if node $j$ is already matched with another node $k\in V_S\backslash\{i\}$, then by the second condition of Definition 5 we know that $\mathbf{S}_{i,j}>\mathbf{S}_{k,j}$. This implies that node $i$ is the highest ranked node in the preference order of $j$. Hence, node $i$ is also matched with node $j$ (Line 11 in Algorithm 1).
\end{itemize}

By Definition 3, we know that a matching $\mathbf{M}$ is {$\mathcal{G}_S$-optimal} w.r.t. $\mathbf{S}$ if $\mathbf{M}_{i,j}=1$ implies $\mathbf{S}_{i,j}>\mathbf{S}_{i,k}$ for all $ k\in V_T\backslash\{j\}$. From the above two cases, we can see that, when Algorithm 1 matches node $i$ with node $j$ (Lines 6 and 11), $\mathbf{S}_{i,j}>\mathbf{S}_{i,k}$ holds for all $ k\in V_T\backslash\{j\}$. Thus $\mathbf{M}$ produced by Algorithm 1 is {$\mathcal{G}_S$-optimal} w.r.t. $\mathbf{S}$. 


Similarly, by Definition 3, we know that a matching $\mathbf{M}$ is {$\mathcal{G}_T$-optimal} w.r.t. $\mathbf{S}$ if $\mathbf{M}_{i,j}=1$ implies $\mathbf{S}_{i,j}>\mathbf{S}_{k,j}$ for all $k\in V_S\backslash\{i\}$. Since Algorithm 1 matches node $i$ with node $j$ (Lines 6 and 11), $\mathbf{S}_{i,j}>\mathbf{S}_{i,k}$ holds for all $ k\in V_T\backslash\{j\}$. By the second condition of Definition 5, we thus have that $\mathbf{S}_{i,j}>\mathbf{S}_{k,j}$ holds for all $ k\in V_S\backslash\{i\}$. Hence, $\mathbf{M}$ produced by Algorithm 1 is also {$\mathcal{G}_T$-optimal} w.r.t. $\mathbf{S}$. 


Since $\mathbf{M}$ is both $\mathcal{G}_S$-optimal and $\mathcal{G}_T$-optimal w.r.t. $\mathbf{S}$, by Definition 4, $\mathbf{M}$ is dual-optimal w.r.t. $\mathbf{S}$.

\end{proof}







\begin{proposition}\label{pro:R1}
      Let $\mathbf{M}$ be a matching produced by StableGM. Then $\mathbf{M}$ is dual-optimal when the  contrastive loss function $L(\cdot)$ is {minimized}. 
\end{proposition}
\begin{proof}
By Eq.~8, we know that $L(\cdot)$ is minimized when the contrastive losses of all nodes in $V_S$ are minimized, and for a given $i \in V_S$, its corresponding contrastive loss $L_{i}$ is minimized when $L_{i}= 0$. Further, according to Eq.~9, $L_{i}= 0$ implies that $f^{+}(i)=1$ and $f^{-}(i)=0$. Since $f^{+}(i) = \mathbf{S}^{2}_{i,\pi(i)}$, we thus know $\mathbf{S}_{i,\pi(i)} = 1$ when $L_{i}= 0$.

As $\mathbf{S} \in \mathbb{S}$ is a rectangular doubly stochastic matrix satisfying the conditions: $\sum_{k=1}^{n} \mathbf{S}_{i,k} = 1 $ and $\sum_{k=1}^{m} \mathbf{S}_{k,\pi(i)} \leq 1$,  $\mathbf{S}_{i,\pi(i)} = 1$ implies $\mathbf{S}_{i,k} = 0 $ for all nodes $k \in V_T\backslash\{\pi(i)\}$ and $\mathbf{S}_{k,\pi(i)} = 0 $ for all nodes  $k \in V_S\backslash\{i\}$. Hence, from the above, we can see that, when $L_{i}$ is minimized, the following properties are preserved for $i \in V_S$:

\begin{itemize}
    \item $\mathbf{S}_{i,\pi(i)} > \mathbf{S}_{i,k}$ holds for all $k \in V_T\backslash\{\pi(i)\}$;
    \item $\mathbf{S}_{i,\pi(i)} > \mathbf{S}_{k,\pi(i)}$ holds for all $k \in V_S\backslash\{i\}$.
\end{itemize}

By Definition 5, we know that $\mathbf{S}$ is R1-symmetric in this case because the following conditions hold for $j=\pi(i)$:

\begin{itemize}
    \item for any $i \in V_S$ there exists some $j \in V_T$ such that $\mathbf{S}_{i,j}>\mathbf{S}_{i,k}$ holds for all $ k\in V_T\backslash\{j\}$;
    \item if $\mathbf{S}_{i,j}>\mathbf{S}_{i,k}$ holds for all $k\in V_T\backslash\{j\}$, then $\mathbf{S}_{i,j}>\mathbf{S}_{k,j}$ also holds for all $k\in V_S\backslash\{i\}$.   
\end{itemize}

By Theorem 3, we know that $\mathbf{M}$ is dual-optimal if $\mathbf{S}$ is R1-symmetric. Therefore, $\mathbf{M}$ is also dual-optimal when the contrastive loss function $L(\cdot)$ is minimized.


\end{proof}





\section{Additional Experiments}

\subsection{Effect of Hardness Attention}


\begin{table}[H]
\centering
\begin{tabular}{|c|clc|cl|ll|}
\toprule
\multirow{2}{*}{$\beta$} & \multicolumn{3}{c|}{Pascal VOC}                                          & \multicolumn{2}{c|}{Spair 71K}                                   & \multicolumn{2}{l|}{Willow Object Class}                                  \\ \cline{2-8} 
                   & \multicolumn{2}{l|}{StableGM - v1}  & \multicolumn{1}{l|}{StableGM - v2} & \multicolumn{1}{l|}{StableGM-v1}    & StableGM-v2                & \multicolumn{1}{l|}{StableGM-v1}    & StableGM-v2                         \\ \toprule
0.0                & \multicolumn{2}{c|}{\textbf{80.88}} & \textbf{81.52}                     & \multicolumn{1}{c|}{82.81}          &     \multicolumn{1}{c|}{83.14}                       & \multicolumn{1}{c|}{98.31}               & \multicolumn{1}{c|}{\textbf{98.52}} \\ \hline
0.1                & \multicolumn{2}{c|}{80.71}          & 81.29                              & \multicolumn{1}{c|}{\textbf{83.05}} & \multicolumn{1}{c|}{\textbf{83.48}} & \multicolumn{1}{c|}{\textbf{98.46}} & \multicolumn{1}{c|}{98.37}          \\ \hline
0.5                & \multicolumn{2}{c|}{80.51}          & 81.03                              & \multicolumn{1}{c|}{83.04}               &  \multicolumn{1}{c|}{83.19}                              &   \multicolumn{1}{c|}{98.27}         &        \multicolumn{1}{c|}{98.31}                             \\ \hline
1.0                & \multicolumn{2}{c|}{80.18}          &          80.92                          & \multicolumn{1}{c|}{82.65}          &    \multicolumn{1}{c|}{83.15}                        & \multicolumn{1}{c|}{98.19}               &                               \multicolumn{1}{c|}{98.00}     \\ \bottomrule
\end{tabular}
\caption{Comparison of matching accuracy (\%) by StableGM-v1 and StableGM-v2 under different values of $\beta\in\{0, 0.1,0.5,1\}$.}
    \label{tab:Hardness_Parameter}
\end{table}

When $\beta = 1$, all the negative samples are considered as hard negatives, which is equivalent to a setting without any explicit hard negative sampling. From Table~\ref{tab:Hardness_Parameter}, it can be seen that when $\beta = 1.0$, the performance is lower when compared to the settings where an explicit hard negative sampling is performed, demonstrating the importance of the proposed ``Hardness Attention'' strategy. We can also see that the best performance for the Pascal VOC keypoint dataset and the Willow object class dataset is produced when $\beta = 0$ for StableGM-v2. When $\beta = 0$, only the negative samples with normalized affinities that are greater than that of related positive samples are considered as hard negatives. For the Spair-71K dataset, the best results are produced when $\beta = 0.1$, demonstrating the importance of the hardness parameter $\beta$. 

\textbf{Hyperparameters:} The Adam optimizer~\citep{kingma2014adam} was used for training with an initial learning rate of $1 \times 10 ^{-5}$ for VGG$_{16}$ and $5 \times 10 ^{-4}$ for other models. The Sinkhorn regularization parameter $\alpha$ was set to 0.005.


\subsection{Comparison of loss functions}

We conducted experiments on the Spair-71K dataset to observe the impact of different loss functions on our StableGM framework. In the experiments, the StableGM-v2 model was modified by replacing the contrastive matching loss with the permutation loss (elementwise cross entropy loss) [4] and the InfoNCE loss, respectively. The results are given in Table~\ref{tab:performance_of_loss_functions}. In the table, $\tau$ denotes the temperature parameter of the InfoNCE loss.

\begin{table}
\centering
\begin{tabular}{|l|l|l|l|l|}
\hline
            & \begin{tabular}[c]{@{}l@{}}Permutation\\ Loss\end{tabular} & \begin{tabular}[c]{@{}l@{}}InfoNCE Loss\\   ($\tau$=1)\end{tabular} & \begin{tabular}[c]{@{}l@{}}InfoNCE Loss\\   ($\tau$=0.1)\end{tabular} & \begin{tabular}[c]{@{}l@{}}Contrastive Matching\\    Loss (ours)\end{tabular} \\ \hline
StableGM-v2 & 83.24                                                      & 81.14                                                                              & 82.35                                                                                & 83.48                                                                        \\ \hline
\end{tabular}
\caption{Comparison of matching accuracy (\%) for different loss functions}
\label{tab:performance_of_loss_functions}
\end{table}

\section{Runtime Analysis}
We conduct experiments to compare the runtime needed for graph matching by different methods.
Table \ref{tab:time_analysis} presents the average runtime taken to process a pair of graphs (i.e., a pair of keypoint annotated images) in the training and test phases for StableGM-v1 (ours), StableGM-v2 (ours), GCAN \citep{jiang2022graph}, NGM-v2 \citep{wang2021neural} and NHGM-v2 \citep{wang2021neural}. 

\begin{table}[H]
\centering
\begin{tabular}{|l|l|l|l|l|l|}
\toprule
Phase & StableGM-v1 & StableGM-v2 & GCAN & NGM-v2 & NHGM-v2 \\ \toprule
Training (milliseconds)          &   45.81          &    48.45         &  55.01    &  56.37      &       322.59  \\ \hline
Testing (milliseconds)        &  20.3           &    21.80         &   24.75   & 25.25      &    171.23     \\ \bottomrule
\end{tabular}
\caption{Comparison of average runtime for processing a pair of keypoint annotated images (milliseconds per pair) by different methods on the Pascal VOC keypoint dataset.}
    \label{tab:time_analysis}
\end{table}

We can see that StableGM-v1 performs best in both training and testing phases. Further, we have the following observations:
\begin{itemize}
    \item The time taken by StableGM-v2 is slightly higher than that of StableGM-v1. This is because StableGM-v2 has additional positional encoding and self-attention layers when compared to StableGM-v1. 
    \item Although StableGM-v2 and GCAN implementations employ the same method to derive normalized cross-graph node-to-node affinities, StableGM-v2 performs faster than GCAN. The superior performance of StableGM-v2 against GCAN demonstrates the efficiency of our Stable Matching algorithm over the branch and bound algorithm used by GCAN.
    \item NGM-v2 and NHGM-v2 first convert two graphs into association graphs and hypergraphs, which results in additional computational overheads. Moreover, NGM-v2 and NHGM-v2 use the Hungarian algorithm to obtain final node correspondence between two graphs, which is slower than our Stable Matching algorithm as discussed in Section B of the main paper. 
\end{itemize}


When evaluating the runtime, the experiments were carried out on a
Linux server (Intel Xeon W-2175 at 2.50\,GHz, 28 cores, NVIDIA RTX A6000 GPU, 512\,GB of main
memory).


\section{Gradient Analysis of the Standard Contrastive Loss Function}

 If the standard contrastive loss is adapted to our setting instead of the contrastive matching loss, $L_{i}$  can be written as,

\begin{equation}
    \label{eq:popular-contrastive-loss} 
    L_{i} =-\ln\left( \frac{\exp(s_{i,\pi(i)}/ \tau)}{\sum_{k=1}^{m}\sum_{j=1}^{n}\mathbf{Z}^{i}_{k,j}\exp(s_{k,j}/\tau)+\exp(s_{i,\pi(i)}/\tau)}\right).
\end{equation}

Then the gradient w.r.t. the affinity of a negative sample $(i,q)$  will be, 

\[\pdv[]{L_{i}}{\mathbf{S}_{i,q}}=\frac{1}{\tau}P_{i,q},\]

where 
\[P_{i,q} = \frac{\mathbf{Z}^{i}_{i,q}\exp(s_{i,q}/ \tau)}{\sum_{k=1}^{m}\sum_{j=1}^{n}\mathbf{Z}^{i}_{k,j}\exp(s_{k,j}/\tau)+\exp(s_{i,\pi(i)}/\tau)}.\]

The gradient w.r.t. a positive sample will be,

\[\pdv[]{L_{i}}{\mathbf{S}_{i,\pi(i)}}=-\frac{1}{\tau}\sum_{k=1}^{m}\sum_{j=1}^{n}P_{k,j}.\]

Here $\tau$ indicates the temperature parameter used in the standard contrastive loss. 

As the magnitude of the gradient with respect to a positive sample is equal to the sum of the gradients of all the negative samples that are considered, the gradients propagated from the standard contrastive loss will ignore the affinity between a node pair in a positive sample.  Moreover, in an explicit hard negative sampling strategy like ours, the propagated gradient can become 0 even if the affinity between a node pair in a positive sample can be further improved.  These are undesirable properties that occur when applying the standard contrastive loss directly to supervised graph matching.



\bibliography{ratnayaka_648}







\end{document}
