\documentclass{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage{bm}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{array}

\usepackage{url,subfigure,amsmath,amssymb,epsfig,verbatim,booktabs,graphicx,epstopdf}
% \usepackage[colorlinks=false,linkcolor=black, citecolor=blue, urlcolor=black, pdfborder={0 1 0}]{hyperref}
\usepackage{mathtools}
\usepackage{threeparttable}
\usepackage{multirow}
\usepackage{makecell}
% \usepackage{todonotes}
\usepackage{bbding}

\usepackage{color}
\usepackage{xcolor}
\newcommand{\colorb}[1]{\textcolor{blue}{#1}}
\newcommand{\colorg}[1]{\textcolor{green}{#1}}
\newcommand{\colorr}[1]{\textcolor{red}{#1}}
\newcommand{\colorp}[1]{\textcolor{violet}{#1}}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-main}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Multi-View Graph Contrastive Learning for Solving Vehicle Routing Problems\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix

\section{Multi-hop Random Walk}
Regarding the subgraph samples used in our graph contrastive learning (as mentioned in the subsection \textbf{Node-level representation learning} of the main paper), we first generate an $n_q$-neighbourhood-subgraph (i.e., $n_q = 20$ in our setting) around an anchor node, and then apply the multi-hop random walk (MHRW) \citep{zhang2013random} to generate subgraphs as the augmented samples (i.e., $g^{q}$ and $g^{k}$), which are fed into the GNN encoders. We elaborate on MHRW as follows.


%In the section of \textbf{Node-level representation learning} in the main paper, we define subgraph samples used in our graph contrastive learning (GCL). We first generated a $n_q$-neighbourhood-subgraph around each node, and then apply multi-hop random walk (MHRW) \cite{zhang2013random} to sample subgraphs as the augmented samples (i.e. $g^{q}$ and $g^{k}$), which are fed into the GNN encoder. We elaborate MHRW as follows.

In general, a VRP graph is a weighted undirected graph $G$ with $n$ nodes, where the cost $c_{ij}$ ($c_{ij} > 0$) denotes the length of the edge $e_{i,j}$ between node $v_i$ and node $v_j$, and we define $c_{ii} = 0$. A random walk on the graph $G$ can be viewed as rolling a die at the current node to decide which edge is traversed next, which leads to a new node. In order to sample a walk, we start from the anchor node, and then iteratively move from the current node to another node $v_j$ within the $n_q$-neighbourhood-subgraph of the anchor node. In our implementation, the walk sampling depends on parameters that specify the probabilities of walking to each node at the current step, i.e., the transit probability matrix $T = \{t_{ij}\}$. %In other words, instead of being completely random, our walk sampling will now have a tendency to behave a certain way. 
As stated in the main paper, we intuitively encourage more aggregation of the structural information from the vicinity than from the non-vicinity. Therefore, in our walk sampling, we set the transit probability $t_{ij}$ from node $v_i$ to node $v_j$ to be proportional to $1/c_{ij}^{\alpha}$, where $\alpha$ is the hyperparameter controlling the importance of the edge cost during the walk (i.e., $\alpha = 1$ in our setting). With the defined $T$, we iteratively visit the neighbourhood of the current node until $\frac{3}{4}n_q$ nodes are collected for constructing a subgraph. Meanwhile, we want the random walk to concentrate around the anchor node, and thus introduce a positive probability $r$ (i.e., $r = 0.8$ in our setting) at each step of leading the walk back to the anchor node, as done by \citet{tong2006fast}. 

%In doing so, the sampled MHRW subgraphs of the same node may contain similar customer sets and structures and thus facilitate the learning of mining local patterns on the routing graphs. 
During MHRW sampling, we collect the visited nodes to build the node set of the subgraph, and keep all edges between these nodes from the original graph. The generated subgraphs are then used as augmented samples to be processed by the GNN encoders for contrastive learning.

\section{Details of POMO and Active Search}
\subsection{POMO}
In training, we adopt the state-of-the-art neural heuristic, i.e., Policy Optimization with Multiple Optima (POMO) \citep{kwon2020pomo}, to learn the solution construction step by step. Since POMO is developed on top of another popular neural heuristic, i.e., Attention Model\footnote{https://github.com/wouterkool/attention-learn-to-route} (AM) \citep{kool2018attention}, we first introduce AM before POMO.

AM is a specialized encoder-decoder neural architecture for VRPs. The encoder mainly consists of multiple self-attention layers, which encode each node with its relationship to other nodes into a vector (i.e. embedding). Then, the decoder creates a route sequence (the solution) in a step-by-step manner, by utilizing the node embeddings and context embedding (from the encoder) to compute the query, key and value vectors for the dot-product attention mechanism.

%The majority of computation takes place inside a $L$-layered self-attention encoder, which embeds each node's information as a vector along with its relationship to other nodes. Following that, the decoder creates an autoregressive solution sequence utilizing these vectors as the keys for its dot-product attention mechanism.

Given node embeddings $\{x_i\}_{i=1}^n$ for each node $v_i$ (note: in our work, they are the learned node representations from the pre-trained GNN), the encoder of AM first computes $d_h$-dimensional embeddings $h_{i}^{(0)}$ through two linear projections, i.e., $h_{i}^{(0)}=W^2(W^1 x_{i}+b^1) + b^2$. Then the node embeddings are processed by $L$ self-attention layers. The embeddings produced by layer $\ell$ ($\ell\in \{1,\dots,L\}$) are denoted as $h_{i}^{(\ell)}$. Specifically, each self-attention layer consists of two sub-layers, i.e., a multi-head attention (MHA) layer that executes message passing between the nodes, and a node-wise fully connected feed-forward (FF) layer, as follows, 
\begin{equation}\label{eq:attention}
\begin{aligned}
&\hat{{h}}_{i} =\operatorname{BN}^{\ell}\left({h}_{i}^{(\ell-1)}+\operatorname{MHA}_{i}^{\ell}\left({h}_{1}^{(\ell-1)}, \ldots, {h}_{n}^{(\ell-1)}\right)\right), \\
&{h}_{i}^{(\ell)} =\operatorname{BN}^{\ell}\left(\hat{{h}}_{i}+\operatorname{FF}^{\ell}\left(\hat{{h}}_{i}\right)\right),
\end{aligned}
\end{equation}
where each sub-layer is also equipped with the skip-connection and batch normalization (BN) for stabilizing the training. Afterwards, node embeddings $h_{i}^{(L)}$ from the last layer of the encoder are fed into the decoder. The decoder sequentially calculates the probabilities of visiting each unvisited node with the attention layer followed by a Softmax function, which is used to construct the solution in a node-by-node manner. 
POMO is essentially developed on top of AM. During policy optimization, POMO samples a set of solution trajectories $\left\{\boldsymbol{\tau}^{1}, \boldsymbol{\tau}^{2}, \ldots, \boldsymbol{\tau}^{n}\right\}$, one starting from each of the $n$ nodes, and gathers the return $R\left(\boldsymbol{\tau}^{i}\right)$ of each trajectory (i.e., the negative of the tour length). During training, it maximizes the expected return $J$ with the REINFORCE algorithm \citep{williams1992simple}, with gradients computed as below,
\begin{equation}\label{eq:reinforce}
 \hspace{-1mm}\nabla_{\theta} J(\theta) \approx \frac{1}{n} \sum_{i=1}^{n}\left(R\left(\boldsymbol{\tau}^{i}\right)-b^{i}(s)\right) \nabla_{\theta} \log p_{\theta}\left(\boldsymbol{\tau}^{i} \mid s\right), 
 \end{equation}
where $p_{\theta}\left(\boldsymbol{\tau}^{i} \mid s\right)$ denotes the probability produced by AM for the solution $\boldsymbol{\tau}^{i}$, given the instance $s$. Additionally, POMO uses a shared baseline $b^{i}(s)$ in the above gradients to reduce the variance, as below, 
 \begin{equation}\label{eq:sharedbs}
 b^{i}(s)=\frac{1}{n} \sum_{j=1}^{n} R\left(\boldsymbol{\tau}^{j}\right), \quad \text { for all } i \text {. }
 \end{equation}

During the inference, POMO produces multiple greedy trajectories by rotating each input instance and starting the trajectory from each of all nodes. The final solution is specified as the best one among all the sampled trajectories.


\subsection{Active Search}

%The inference mechanism that we adopted to better generalize the trained model on unseen instances, called active search \cite{Bello2017WorkshopT}, because the model actively updates its parameters while searching for candidate solutions.
Active search, an inference-boosting mechanism originally proposed by \citet{Bello2017WorkshopT}, actively updates the parameters of a model while it is used to infer the solution to an instance. Specifically, the inference starts with a trained model and iteratively optimizes its parameters with the inferred solutions to an individual testing instance, while keeping track of the best one generated during the search (inference). This approach has been verified to be competitive given a long runtime, since it focuses on updating the parameters for each individual instance. However, updating all parameters of the model as done by \citet{Bello2017WorkshopT} is expensive and impractical. For example, it may cost several days to infer 10000 TSP100 instances. To tackle this issue, Efficient Active Search (EAS) is proposed in \citep{hottung2022efficient} to only adjust a subset of parameters during inference, %to a single instance during the search, 
while keeping all other parameters fixed. 

We adopt a technique similar to EAS, which adds instance-specific residual layers before the output layer of the attention decoder (within POMO) and only updates the parameters of these layers. In our MVGCL, the residual layers accept both the node embeddings from the last attention layer and the graph embedding $x_g$ from the pre-trained encoder $f_q$, as formulated in Eq. (3) in the main paper. The instance-specific layers are updated with the aforementioned REINFORCE algorithm in POMO, but we also use the imitation loss $J_{IL}$ to increase the log-likelihood of generating the incumbent solution, whose gradient is
\begin{equation}\label{eq:jil}  \nabla_{\theta^\prime} J_{IL}(\theta^\prime)= \nabla_{\theta^\prime} \log p_{\theta^\prime}\left(\overline{\boldsymbol{\tau}} \mid s\right),   \end{equation}
where $\theta^\prime$ represents parameters of the instance-specific layers and $\overline{\tau}$ is the incumbent solution so far, i.e., the best solution till the current search iteration.

%Particularly, the $l$-th trainable layer is inserted into the decoder as follows, 

%\begin{equation}    \begin{split} h_{l}&=\hat{h}+\left(\left(\operatorname{ReLu}\left(\hat{h} W^{1}_{l}+b^{1}_{l}\right) W^{2}_{l}+b^{2}_{l}\right)\right., \\  \hat{h}&=\left\{ \begin{aligned} & [h_n,x_g]  &,~l = 1, \\ & h_{l-1} &,~l > 1, \end{aligned} \right.
%\end{split} \end{equation} where $W^{1}$ and $W^{2}$ are the weight matrix; $b^{1}$ and  $b^{2}$ are bias vectors;  $\hat{h}$ to the first layer is the concatenation of the output of the last attention layer $h_n$ and the graph embedding $x_g$ from the pre-trained GNN.



%In summary, the active search direction is guided by the instance-specific information to adjust for each unseen instance accordingly, which is favourable for achieving better generalization, as shown in our experiments. 




\section{Implementation Details}
% We present the details of experimental setup for training and inference all the methods in this section. All the experiments conduct on a single NVIDIA GTX 2080Ti GPU with 14 CPUs to perform the training and inference. We generate the training and testing instances with various distributions by TSPGEN \footnote{https://github.com/jakobbossek/tspgen}.

All the experiments, including (pre-)training and inference, are conducted on a single NVIDIA RTX 2080Ti GPU and an i9-10940X CPU with 14 cores. The implementation details of the baselines and our method are described below.

\subsection{Baselines}
We calculate the optimal solution for TSP with the Concorde solver.\footnote{https://www.math.uwaterloo.ca/tsp/concorde.html}
Regarding CVRP, the best solution is calculated with a Python wrapper\footnote{https://github.com/chkwon/PyHygese} for the Hybrid Genetic Search (HGS) algorithm. Regarding the neural heuristic baselines, we adopt their open-sourced code on GitHub and keep most of their original hyperparameters unchanged, except that we reduce the batch size in POMO \citep{kwon2020pomo} to 56 for CVRP100 due to the memory limit. The original HAC was tested on TSP50 \citep{zhang2022learning}, and we only change the node number to train a TSP100 model for its evaluation. %\colorr{Maybe a fine-tuned HAC for TSP100 can achieve better results, but this is out of the scope of this work.} 
In addition, we also modify the data loading process in each baseline so that they can be compatible with the data format in our experiments.

\subsection{MVGCL}
We adopt a 5-layer Graph Isomorphism Network\footnote{https://github.com/weihua916/powerful-gnns} (GIN) \citep{xu2018powerful} with 64 units per layer as the graph encoders in our implementation. In MHRW, we sample subgraphs with $\alpha = 1$, $n_q = 20$ and $r = 0.8$. Regarding MoCo, we set the momentum $m = 0.999$ and the InfoNCE temperature to 0.7.

%The implementation of neural heuristic in MVGCL reference to the old\_version POMO code\footnote{https://github.com/yd\-kwon/POMO/tree/master/OLD\_ipynb\_ver}  and Eas\_lay\footnote{https://github.com/ahottung/EAS/blob/main/source/eas\_lay.py}. 
Our MVGCL is developed on top of POMO\footnote{\url{https://github.com/yd-kwon/POMO/tree/master/OLD_ipynb_ver}} and active search\footnote{\url{https://github.com/ahottung/EAS/blob/main/source/eas_lay.py}}.
For a fair comparison, we use the same neural architecture in POMO as the one in our MVGCL, where the dimension for node embeddings is 128 and the dimension for hidden units in the feed-forward layer is 512. The multi-head attention uses 8 heads and the dimension for the key in attention layers is 16. Besides, we set the logit clipping value to 10 and the weight decay factor to 1e-6 for the policy network. For the distribution-preserved augmentation, we set $(p1,p2,p3)$ to $(0.7,0.2,0.1)$. During training, we apply \emph{early stopping} when the gap reduction is not significant. Regarding active search, the iteration number for each instance is fixed to 200. We add only one instance-specific residual layer before the output layer of the decoder for faster inference, though we find more layers for active search slightly improve the performance at the expense of significantly longer runtime. \textbf{The implementation code of our MVGCL will be made publicly available.}

\section{Experiments on Uniform distribution}
\begin{table*}[t]  \small
\setlength{\tabcolsep}{2.8pt}
	\centering
	%	\vspace{-4mm}

\begin{threeparttable}
	    \scalebox{0.9}{
\begin{tabular}{lc||c|c|c|c|c|c|c||c|c|c|c|c|c|c}
\toprule
			\multicolumn{2}{c||}{\textbf{Problem}}& \multicolumn{7}{c||}{\textbf{TSP50}} & \multicolumn{7}{c}{\textbf{TSP100}}   \\\midrule
\textbf{Distribution}           & \textbf{Metric} & Concorde & AM &   POMO &  LCP & HAC & DROP & MVGCL & Concorde & AM &  POMO &  LCP & HAC & DROP & MVGCL\\ \midrule\midrule
\multirow{2}{*}{\textbf{Uniform}} & Len.&   5.72	& 5.91	& 5.86	& 5.86	& 5.87	& 5.87	& \textbf{5.82}
& 7.78	& 8.10	& 7.93	& 7.94	& 8.08	& 7.97	& \textbf{7.92}
\\  
                           & Gap  &  0.00\% & 3.32\%	& 2.45\%	& 2.45\%	& 2.62\%	& 2.62\%	& \textbf{1.75\%}
                           &  0.00\% & 4.11\%	& 1.93\%	& 2.06\%	& 3.86\%	& 2.44\%	& \textbf{1.80\%}
\\ \midrule
\multicolumn{2}{c||}{\textbf{Avg. Inf. Time (s)}} &    0.08&	0.07&	0.01&	0.53&	0.08&	0.01&	0.87

&0.50	&0.22&	0.02 &	1.50 &	0.23&	0.03&	3.70
  \\  \midrule  \midrule            
\multicolumn{2}{c||}{\textbf{Problem}}& \multicolumn{7}{c||}{\textbf{CVRP50}} & \multicolumn{7}{c}{\textbf{CVRP100}}   \\\midrule
\textbf{Distribution}           & \textbf{Metric} & \textbf{HGS} & AM &   POMO &  LCP & \textbf{LKH} & DROP & MVGCL & \textbf{HGS} & AM &  POMO &  LCP & \textbf{LKH} & DROP & MVGCL\\ \midrule\midrule


\multirow{2}{*}{\textbf{Uniform}} & Len. & 10.55	& 10.82	& 10.71	& 10.76	&10.56
 & 10.74	& \textbf{10.56}	
 
& 15.68	& 16.26	& 16.10	& 16.20 &15.78	& 16.18	& \textbf{15.72}
	

\\  
& Gap  &  0.00\% & 2.56\%	& 1.52\%	& 1.99\% & 0.09\% 	& 1.80\%	& \textbf{0.09\%}	

&  0.00\% & 3.70\%	& 2.68\%	& 3.32\% & 0.64\%	& 3.19\%	& \textbf{0.26\%}

\\ \midrule
\multicolumn{2}{c||}{\textbf{Avg. Inf. Time (s)}} &    30       &   0.22   & 0.01   &   2.83 &18.2  &     0.01    &   1.07 
&  30    &  0.29    &  0.03  &   5.85 &33.6  &   0.05 &  4.43
  \\
                           \bottomrule
\end{tabular}}
\end{threeparttable}
\caption{Results of tour lengths and gaps for TSP and CVRP on the Uniform distribution, where LKH3 is also included.}
\label{tb:uniform}
\end{table*}


We randomly generate 2000 instances of the uniform distribution for the two VRP variants, i.e., TSP50, TSP100, CVRP50 and CVRP100. We test the same trained models as used in the subsections \textbf{Generalization on TSP} and \textbf{Generalization on CVRP} of the main paper. We also include LKH3 \citep{helsgaun2017extension} with 3000 trials as a baseline on CVRP50 and CVRP100. The results are gathered in Table~\ref{tb:uniform}, which shows that our MVGCL also delivers superior performance on the uniform distribution, where the other neural heuristic baselines cannot generalize well. For example, our MVGCL achieves gaps of 1.75\% and 1.80\% on TSP50 and TSP100, respectively, while the other baselines report gaps of around 2\%--4\%. Besides, the solutions of our MVGCL on CVRP50 and CVRP100 are also fairly close to those of HGS, with gaps of 0.09\% and 0.26\%, respectively, which outperforms the other neural heuristic baselines and even LKH3 on CVRP100.


%Particularly, for TSP100, the gap of HAC on uniform distribution is lower than those on the five distributions used in the main paper. \colorr{The reason might be that their mixture Gaussian generator can only train HAC with hard Gaussian instances adjusted from the uniform distribution, thus the trained model cannot generalize to others.} We conclude from these observations that, 1) only training on a few distributions may impair the generalization ability on other distributions; 2) simply training on data with more distributions cannot ensure a good generalization performance. On the other hand, our MVGCL effectively exploits the information from mixed distribution by pretraining with graph contrastive learning and significantly improves the generalization performance. 

\begin{table*}[ht]\small
% \setlength{\tabcolsep}{1pt}
\renewcommand{\arraystretch}{1.2}
\centering
\vspace{-7mm}
\scalebox{0.8}{
\begin{tabular}{lc||c|c|c|c|c|c|c}
\toprule
\textbf{Instance}           & \textbf{Metric} & Opt. & AM & POMO &  LCP & HAC & DROP & MVGCL \\ \midrule\midrule
\multirow{2}{*}{\textbf{a280}} & 	Len. & 	2579 &	3132 &	3024 &	3031 &	3390 &	3038 &	\textbf{2765}	\\
&	Gap &	0.00\% &	21.44\% &	17.27\% &	17.52\% &	31.43\% &	17.80\% &	\textbf{7.20\%}	\\ \midrule
\multirow{2}{*}{\textbf{berlin52}} & 	Len. & 	7542 &	7978 &	7681 &	7593 &	7749 &	7575 &	\textbf{7573}	\\
&	Gap &	0.00\% &	5.78\% &	1.84\% &	0.68\% &	2.75\% &	0.44\% &	\textbf{0.41\%}	\\ \midrule
\multirow{2}{*}{\textbf{bier127}} & 	Len. & 	118282 &	126705 &	120501 &	121492 &	122108 &	123110 &	\textbf{119429}	\\
&	Gap &	0.00\% &	7.12\% &	1.88\% &	2.71\% &	3.23\% &	4.08\% &	\textbf{0.97\%}	\\ \midrule
\multirow{2}{*}{\textbf{ch130}} & 	Len. & 	6110 &	6360 &	6156 &	\textbf{6140} &	6258 &	6262 &	6157	\\
&	Gap &	0.00\% &	4.10\% &	0.75\% &	\textbf{0.50\%} &	2.42\% &	2.49\% &	0.76\%	\\ \midrule
\multirow{2}{*}{\textbf{ch150}} & 	Len. & 	6528 &	6780 &	6600 &	6626 &	6986 &	6670 &	\textbf{6572}	\\
&	Gap &	0.00\% &	3.87\% &	1.11\% &	1.51\% &	7.01\% &	2.17\% &	\textbf{0.68\%}	\\ \midrule
\multirow{2}{*}{\textbf{d198}} & 	Len. & 	15780 &	18037 &	16660 &	16494 &	21455 &	17185 &	\textbf{16203}	\\
&	Gap &	0.00\% &	14.30\% &	5.58\% &	4.53\% &	35.96\% &	8.90\% &	\textbf{2.68\%}	\\ \midrule
\multirow{2}{*}{\textbf{eil101}} & 	Len. & 	629 &	659 &	648 &	\textbf{643} &	665 &	644 &	644	\\
&	Gap &	0.00\% &	4.73\% &	2.98\% &	\textbf{2.26\%} &	5.69\% &	2.42\% &	2.43\%	\\ \midrule
\multirow{2}{*}{\textbf{eil51}} & 	Len. & 	426 &	438 &	435 &	440 &	437 &	435 &	\textbf{431}	\\
&	Gap &	0.00\% &	2.88\% &	2.08\% &	3.24\% &	2.53\% &	2.06\% &	\textbf{1.10\%}	\\ \midrule
\multirow{2}{*}{\textbf{eil76}} & 	Len. & 	538 &	567 &	557 &	554 &	554 &	565 &	\textbf{552}	\\
&	Gap &	0.00\% &	5.33\% &	3.50\% &	2.97\% &	3.05\% &	5.08\% &	\textbf{2.55\%}	\\ \midrule
\multirow{2}{*}{\textbf{gil262}} & 	Len. & 	2378 &	4254 &	3226 &	3232 &	5117 &	3236 &	\textbf{2649}	\\
&	Gap &	0.00\% &	78.91\% &	35.65\% &	35.91\% &	115.18\% &	36.08\% &	\textbf{11.38\%}	\\ \midrule
\multirow{2}{*}{\textbf{kroA150}} & 	Len. & 	26524 &	28634 &	27126 &	26973 &	29045 &	26971 &	\textbf{26842}	\\
&	Gap &	0.00\% &	7.96\% &	2.27\% &	1.69\% &	9.50\% &	1.69\% &	\textbf{1.20\%}	\\ \midrule
\multirow{2}{*}{\textbf{kroB150}} & 	Len. & 	26130 &	26898 &	26632 &	26391 &	28337 &	26642 &	\textbf{26302}	\\
&	Gap &	0.00\% &	2.94\% &	1.92\% &	1.00\% &	8.44\% &	1.96\% &	\textbf{0.66\%}	\\ \midrule
\multirow{2}{*}{\textbf{lin105}} & 	Len. & 	14379 &	15272 &	14604 &	14718 &	15632 &	\textbf{14513} &	14536	\\
&	Gap &	0.00\% &	6.21\% &	1.56\% &	2.36\% &	8.72\% &	\textbf{0.93\%} &	1.09\%	\\ \midrule
\multirow{2}{*}{\textbf{pr107}} & 	Len. & 	44303 &	45681 &	44933 &	46595 &	49153 &	45865 &	\textbf{44416}	\\
&	Gap &	0.00\% &	3.11\% &	1.42\% &	5.17\% &	10.95\% &	3.53\% &	\textbf{0.25\%}	\\ \midrule
\multirow{2}{*}{\textbf{pr152}} & 	Len. & 	73682 &	79293 &	\textbf{74902} &	76145 &	79990 &	77069 &	75061	\\
&	Gap &	0.00\% &	7.61\% &	\textbf{1.66\%} &	3.34\% &	8.56\% &	4.60\% &	1.87\%	\\ \midrule
\multirow{2}{*}{\textbf{pr226}} & 	Len. & 	80369 &	87801 &	83754 &	85406 &	92527 &	85437 &	\textbf{81896}	\\
&	Gap &	0.00\% &	9.25\% &	4.21\% &	6.27\% &	15.13\% &	6.31\% &	\textbf{1.90\%}	\\ \midrule
\multirow{2}{*}{\textbf{pr264}} & 	Len. & 	49135 &	57716 &	54589 &	54735 &	71243 &	54445 &	\textbf{50713}	\\
&	Gap &	0.00\% &	17.46\% &	11.10\% &	11.40\% &	44.99\% &	10.81\% &	\textbf{3.21\%}	\\ \midrule
\multirow{2}{*}{\textbf{pr299}} & 	Len. & 	48191 &	59850 &	53926 &	54920 &	67690 &	53479 &	\textbf{51354}	\\
&	Gap &	0.00\% &	24.19\% &	11.90\% &	13.96\% &	40.46\% &	10.97\% &	\textbf{6.56\%}	\\ \midrule
\multirow{2}{*}{\textbf{pr76}} & 	Len. & 	108159 &	108582 &	\textbf{108404} &	111196 &	109787 &	108572 &	109826	\\
&	Gap &	0.00\% &	0.39\% &	\textbf{0.23\%} &	2.81\% &	1.50\% &	0.38\% &	1.54\%	\\ \midrule
\multirow{2}{*}{\textbf{rat195}} & 	Len. & 	2323 &	2620 &	2490 &	2490 &	2693 &	2490 &	\textbf{2374}	\\
&	Gap &	0.00\% &	12.78\% &	7.18\% &	7.19\% &	15.93\% &	7.17\% &	\textbf{2.18\%}	\\ \midrule
\multirow{2}{*}{\textbf{rat99}} & 	Len. & 	1211 &	1263 &	1237 &	1256 &	1297 &	1255 &	\textbf{1232}	\\
&	Gap &	0.00\% &	4.33\% &	2.18\% &	3.74\% &	7.13\% &	3.62\% &	\textbf{1.76\%}	\\ \midrule
\multirow{2}{*}{\textbf{rd100}} & 	Len. & 	7910 &	8030 &	8004 &	8132 &	8140 &	7996 &	\textbf{7922}	\\
&	Gap &	0.00\% &	1.52\% &	1.19\% &	2.80\% &	2.91\% &	1.08\% &	\textbf{0.16\%}	\\ \midrule
\multirow{2}{*}{\textbf{st70}} & 	Len. & 	675 &	693 &	685 &	691 &	693 &	683 &	\textbf{678}	\\
&	Gap &	0.00\% &	2.67\% &	1.51\% &	2.39\% &	2.65\% &	1.14\% &	\textbf{0.43\%}	\\ \midrule
\multirow{2}{*}{\textbf{tsp225}} & 	Len. & 	3916 &	4352 &	4159 &	4209 &	4588 &	4215 &	\textbf{4024}	\\
&	Gap &	0.00\% &	11.15\% &	6.21\% &	7.47\% &	17.17\% &	7.63\% &	\textbf{2.75\%}	\\ \midrule
\multirow{2}{*}{\textbf{u159}} & 	Len. & 	42080 &	44908 &	42888 &	44062 &	47500 &	42751 &	\textbf{42623}	\\
&	Gap &	0.00\% &	6.72\% &	1.92\% &	4.71\% &	12.88\% &	1.59\% &	\textbf{1.29\%}	\\ \midrule
\midrule


\multicolumn{2}{c||}{\textbf{Avg. Gap}}    &     0.00\%    &  10.53\%&  5.16\%&  5.92\% & 16.75\% &  5.79\%   &     \textbf{1.58}\%      \\ \midrule
\multicolumn{2}{c||}{\textbf{Avg. Inf. Time (s)}}             &     -     &   0.48   &  0.47  & 69.26  & 0.48  &  0.35  &   48.11    \\ \bottomrule
\end{tabular}}
\caption{Results on TSPLib}
\label{tb:tspplib}
% \vspace{-2mm} %remove it in preprint
\end{table*}

\section{Detailed results on TSPLib}
Here we display the detailed results of the 25 instances from TSPLib \citep{reinelt1991tsplib}. These instances are gathered from various sources, with different node distributions and problem sizes, which makes them desirable for assessing the generalization performance of neural heuristics. Specifically, we solve the instances with 51--299 nodes by the MVGCL model trained on TSP100 with the mixed distributions. As shown in Table~\ref{tb:tspplib}, our method significantly outperforms the neural heuristic baselines, and most gaps to the optimal solutions are within 2.5\%. This suggests that our MVGCL is effective in tackling realistic instances from TSPLib, which are completely unseen during training.




% \section{Additional simulation results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\bibliography{uai2023-ref}

\end{document}
