% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% \usepackage[pdftex]{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bm}
\usepackage{amsfonts}
\usepackage{mathtools}
\usepackage{array}
\usepackage{url,subfigure,amsmath,amssymb,epsfig,verbatim,booktabs,graphicx,epstopdf}

\usepackage{threeparttable}
\usepackage{multirow}
\usepackage{makecell}
% \usepackage{todonotes}
\usepackage{bbding}

% \usepackage[ruled,linesnumbered]{algorithm2e}
% \usepackage{caption}
\usepackage{subfigure}
\usepackage{color}
\usepackage{xcolor}
\newcommand{\colorb}[1]{\textcolor{blue}{#1}}
\newcommand{\colorg}[1]{\textcolor{green}{#1}}
\newcommand{\colorr}[1]{\textcolor{red}{#1}}
\newcommand{\colorp}[1]{\textcolor{violet}{#1}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Multi-View Graph Contrastive Learning for Solving Vehicle Routing Problems}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jiang.yuan@ntu.edu.sg>?Subject=MVGCL UAI2023}{Yuan~Jiang}{}}
\author[2]{\href{mailto:<zhiguangcao@outlook.com>?Subject=MVGCL UAI2023}{Zhiguang~Cao}{}}
\author[3*]{\href{mailto:<y.wu2@tue.nl>?Subject=MVGCL UAI2023}{Yaoxin~Wu}{}}
\author[1]{\href{mailto:<zhangj@ntu.edu.sg>?Subject=MVGCL UAI2023}{Jie~Zhang}{}}
% Add affiliations after the authors
\affil[1]{%
    School of Computer Science and Engineering\\
    Nanyang Technological University\\ 
    Singapore
}
\affil[2]{%
    School of Computing and Information Systems, Singapore Management University\\
    Singapore
}
\affil[3]{%
    Department of Industrial Engineering \& Innovation Sciences, Eindhoven University of Technology\\
    Netherlands
  }
\affil[*]{%
    Corresponding Author
  }  
\begin{document}
\maketitle

\begin{abstract}
Recently, neural heuristics based on deep learning have reported encouraging results for solving vehicle routing problems (VRPs), especially on independent and identically distributed (i.i.d.) instances, e.g. \emph{uniform}. However, in the presence of a distribution shift for the testing instances, their performance becomes considerably inferior. In this paper, we propose a multi-view graph contrastive learning (MVGCL) approach to enhance the generalization across different distributions, which exploits a graph pattern learner in a self-supervised fashion to facilitate a neural heuristic equipped with an active search scheme. Specifically, our MVGCL first leverages graph contrastive learning to extract transferable patterns from VRP graphs to attain the generalizable multi-view (i.e. node and graph) representation. Then it adopts the learnt node embedding and graph embedding to assist the neural heuristic and the active search (during inference) for route construction, respectively. Extensive experiments on randomly generated VRP instances of various distributions, and the ones from TSPLib and CVRPLib show that our MVGCL is superior to the baselines in boosting the cross-distribution generalization performance.
\end{abstract}

\section{Introduction}
\label{sec:Introduction}


The vehicle routing problem (VRP) is essentially a combinatorial optimization problem (COP) with many important real-world applications, especially in logistics~\citep{bernhard2008combinatorial}. In reality, vehicle routing tasks that share similar problem structures and differ only in data are often carried out repeatedly. For example, a logistics company may dispatch a fleet of trucks to pick up or deliver packages for customers in the same city on a daily basis, with only discrepancies in customer locations and/or demands. However, conventional heuristic methods always treat each of those tasks independently, which may yield limited computation efficiency and/or solution quality. Hence, developing \emph{neural} heuristics based on deep learning has become a sought-after alternative for solving VRPs, which aim to improve the performance by exploiting the underlying patterns in the instances~\citep{bengio2021machine}.
%As such, there are  growing interests in developing \emph{neural} heuristics based on deep learning to exploit the common pattern in the instances~\cite{bengio2021machine}, so as to improve the overall performance of solving VRPs.  

The early neural heuristics for VRPs primarily fall into the supervised category, which requires the optimal solution as supervised labels~\citep{vinyals2015pointer, joshi2019efficient}. The resulting performance highly relies on the quality of the labels, rendering the supervised methods less favourable since it is computationally expensive to attain optimal solutions due to the NP-hardness. Moreover, it is also difficult for supervised methods to generalize to problem sizes different from the training ones. In contrast, the neural heuristics based on (deep) reinforcement learning only need a reward (e.g. the current tour length) rather than the optimal solution at each decision step, to indicate whether a move or a selection is favourable or not~\citep{Bello2017WorkshopT, kool2018attention, kwon2020pomo, kim2021learning}. Meanwhile, it is also relatively easier for them to generalize to different problem sizes than the supervised ones.

Although the neural heuristics have reported many encouraging results for VRPs, most of their underlying models are trained and evaluated on independent and identically distributed (i.i.d.) instances with respect to the node locations, especially the \emph{uniform} distribution. An ideal neural heuristic should be able to generalize to various distributions, since the real-world instances may follow different and sometimes even unknown distributions. Unfortunately, directly applying existing neural heuristics trained on the uniform distribution to instances of other distributions will result in considerably inferior solutions~\citep{geisler2022generalization, zhang2022learning}, which may hinder their applications. On the other hand, some preliminary studies have been conducted to alleviate this generalization issue, which leverage group distributionally robust optimization (DRO) \citep{jiang2022learning} or adaptive hardness assisted curriculum learning (HAC)~\citep{zhang2022learning} to train the model. However, the former needs to label typical and atypical instances, and the latter is mainly extended to Gaussian distribution only. 

Motivated by the facts that, 1) a VRP instance can always be represented as a graph, 2) the VRP solution depends on the pattern of the graph (e.g. the distribution of nodes) \citep{chen2020dynamic, wu2021learning, hudson2022graph}, we postulate that transferable structural patterns across diverse graphs could be helpful to improve the generalization against different distributions \citep{qiu2020gcc,JureLeskovec2005GraphsOT}. Especially, similar local patterns may distribute across graphs even if those graphs belong to different distributions. On the other hand, recent advances in  computer vision (CV) and natural language processing (NLP) \citep{he2020momentum, chen2020improved, giorgi2020declutr, gao2021simcse, hassani2020contrastive, you2021graph} have testified that pre-training an encoder network in a contrastive learning manner can produce more informative and transferable representation for downstream tasks.

With the above principle, in this paper, we propose a multi-view graph contrastive learning (MVGCL) approach to foster the generalization capability of neural heuristics for VRPs, through mining the underlying patterns across graphs. 
Specifically, given a collection of graphs of VRP instances of various distributions, our MVGCL exploits contrastive learning with a weighted random walk augmentation to identify the local transferable patterns, and a distribution-preserved augmentation to identify the global distribution across these graphs.
This pre-trained graph neural network (GNN) acts as the encoder, which learns the representation in two views, 1) a \emph{node} embedding with respect to the local structural similarity; 2) a \emph{graph} embedding with respect to the overall distribution information. Subsequently, the two learnt embeddings are employed to facilitate a neural heuristic (i.e. POMO~\citep{kwon2020pomo}) and its active search scheme~\citep{hottung2022efficient} (in the inference phase) for downstream route construction, respectively. In this way, our approach not only learns the transferable pattern across various distributions, but also adjusts itself with individual instances. We conduct extensive experiments on two widely studied VRP variants, i.e., the travelling salesman problem (TSP) and capacitated vehicle routing problem (CVRP). Results on randomly generated instances and benchmark ones (i.e. TSPLib and CVRPLib) verify its effectiveness.

% The remainder of the paper is organized as follows. Section~\ref{sec:related} briefly reviews existing works. Section~\ref{sec:problem} introduces the standard 3D BPP and its conventional formulation. Section \ref{sec:method} presents our end-to-end DRL agent in great detail. Section \ref{sec:method} explains the integration of the DRL agent and the CP solver. Section \ref{sec:experiments} provides the computational experiments and analysis. Finally, section \ref{sec:conclusion} concludes the paper and points out the future direction.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Related Works}\label{sec:related}
% In this section, we review the studies on neural methods for VRPs and graph contrastive learning. 

\subsection{Neural heuristics for VRPs} 

In recent years, the neural heuristics based on deep (reinforcement) learning for routing problems have been extensively explored. The pointer network (PtrNet)~\citep{vinyals2015pointer}, as the first modern deep architecture for VRPs, is essentially developed from the encoder-decoder-based sequence-to-sequence model for NLP. \citet{Bello2017WorkshopT} propose to train PtrNet with reinforcement learning since ground-truth labels are computationally expensive. \citet{kool2018attention} further boost the performance by introducing a self-attention encoder~\citep{vaswani2017attention} and an attentive decoder, which stands as the well-known attention model (AM). \citet{kwon2020pomo} propose the POMO model on top of AM by augmenting the input instances and starting the inference from multiple nodes. \citet{hottung2022efficient} conceive three different strategies to integrate the active search with POMO during the inference phase, which deliver noticeably superior performance. Differently, a concurrent line of \emph{improvement} methods~\citep{chen2019learning, lu2019learning, da2020learning, wu2021learning, kim2021learning} emphasize improving an initial but complete solution via iterative local operations. Among them, \citet{kim2021learning} propose to learn collaborative policies (LCP) with a seeder to explore large solution space and a reviser to improve the solution for local segments. On the other hand, GNNs or their variants such as graph convolutional networks (GCNs) and graph attention networks (GATs) are also exploited on VRP graphs \citep{khalil2017learning, deudon2018learning, joshi2019efficient}. Besides, \citet{li2018combinatorial} propose guided tree search by leveraging GCN embeddings. \citet{fu2021generalize} use a graph convolutional residual network and a Monte Carlo tree to generalize to larger instances on TSP.

The aforementioned neural methods have exhibited impressive performance when the training and testing instances share the same distribution regarding the node locations, e.g., uniform~\citep{kwon2020pomo, kool2022deep}. However, \citet{geisler2022generalization} and \citet{zhang2022learning} show that simply applying those neural heuristics to other distributions may cause considerably inferior solutions. To generalize the neural heuristics beyond the single (uniform) distribution used in training, \citet{jiang2022learning} exploit group distributionally robust optimization (DRO) to train deep models (i.e. POMO and GCN) across multiple distributions, which needs to label typical and atypical instances. \citet{zhang2022learning} propose a curriculum learning-based AM trained on instances of different hardness. Those instances are generated by a hardness-adaptive generator with mixed-Gaussian distribution, which limits its generalization to other distributions.


\subsection{Graph contrastive learning}

Contrastive learning (CL) is a type of self-supervised learning. It is usually used to identify the similarity among the unlabeled data and learn inherent representations across instances, which has been widely explored in CV \citep{he2020momentum, tian2020contrastive, chen2020improved} and NLP \citep{mikolov2013distributed, giorgi2020declutr}. 

To tackle the graphic data, graph contrastive learning (GCL) has been proposed \citep{qiu2020gcc, hassani2020contrastive, liu2022graph}, and a series of augmentation techniques to generate contrastive samples based on the original graph have also been accordingly developed~\citep{ you2021graph, yin2022autogcl, zhou2022data}, such as attribute removing, edge adding/masking, and subgraph/graph diffusion. Among them,~\citet{qiu2020gcc} define the contrastive samples as the r-ego sub-network of the input nodes and then apply the pre-trained GNN on tasks of node or graph classification. \citet{hassani2020contrastive} contrastively learn embeddings from the first-order neighbours and graph diffusion. GraphCL~\citep{you2020graph} handpicks ad-hoc augmentations (node dropping, edge perturbation, attribute masking and subgraph sampling) to provide specific contrastive samples for graph-level representation learning, while this augmentation selection is made automated in its subsequent work~\citep{you2021graph}. Similarly,~\citet{yin2022autogcl} propose adaptive augmentation to remove edges. 

However, the augmentation methods of existing graph contrastive learning are not directly applicable to our routing tasks in that, 1) nodes in VRPs like TSP only have coordinates as attributes (unlike social networks with rich attributes such as age, gender and country of a person), which makes it harder to distinguish nodes from each other by attribute masking~\citep{you2020graph};
2) VRP graphs are fully connected, so augmentations like node dropping or edge perturbation~\citep{wu2021self, liu2022graph} may violate the connectedness.
Worse still, the original graph distribution could be distorted by altering its structure. Although recent works~\citep{yin2022autogcl, zhang2022costa} attempt to preserve graph class labels during augmentation, it is impractical to acquire all distribution labels for VRPs. Furthermore, unlike traditional network embeddings \citep{BryanPerozzi2014DeepWalkOL, JianTang2015LINELI, grover2016node2vec} or recent works that pre-train GNNs with attributed graphs and then directly apply them to instances of the same domain \citep{hu2019strategies}, our goal is to pre-train a GNN for learning local structure across distributions and global graph embedding of the instance, which allows the neural heuristic to solve VRPs of various distributions effectively. 

%Therefore, in this paper, we proposes two augmentation methods to tackle the above issues for solving VRPs.  

%In short, existing graph augmentation techniques are not suitable for our scenarios because 1) Removing nodes or edges violates the basic problem setting of VRPs; 2) The structural information and semantics of the graphs vary significantly and thus cannot guarantee distribution consistency after augmentation. Therefore, this work proposes two augmentation methods that are specially designed for routing problems.  

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Methodology}

\label{sec:method}

% \begin{figure*}
%   \centering
%   \includegraphics[width=\linewidth]{architecture.pdf}
%   % not allowed \vspace{-1cm}
%   \caption{The illustration of our MVGCL, where the pre-trained embedding will be used to facilitate the training and inference. }
%   \label{fig:architecture}
% \end{figure*}

\begin{figure}
  \centering
  \includegraphics[width=\linewidth]{inference.pdf}
  % not allowed \vspace{-1cm}
  \caption{The illustration of our MVGCL, where the multi-view embeddings from the pre-trained GNN will be used to facilitate the subsequent training and inference. }
  \label{fig:architecture}
\end{figure}

We first present the graph representation of TSP and CVRP, followed by our multi-view graph contrastive learning (MVGCL) approach for solving the two problems.

\subsection{Preliminary} 
We consider the two classic VRP variants, i.e., travelling salesman problem (TSP) and capacitated vehicle routing problem (CVRP) in the Euclidean space. Particularly, a problem instance with $n$ nodes is represented as an undirected graph $G = (V, E)$ with complete connections. The cost $c_{ij}$ for edge $e_{ij}$ ($e_{ij}\in E$) denotes the distance between node $v_i$ and $v_j$ ($v_i, v_j \in V$). The vehicle in TSP needs to visit each node once and return to the starting one. In CVRP, a vehicle with capacity $Q$ begins and ends its round trips at the depot node, while visiting each of other nodes once to satisfy customer demand $d_i$. The vehicle has to be fully replenished at the depot if the remaining capacity is not enough to serve any unvisited node. The optimal solution is defined as the route with the shortest length that complies with all the above constraints.

%  each customer in $V \\ {1}$ is visited exactly once by exactly one vehicle; • all routes start and end at the depot; • the sum of customer demand within a route does not exceed the vehicles’ capacity; • the sum of costs of all routes is minimal given the constraints above; 

\subsection{Multi-view GCL for VRPs}
When solving real-world VRP instances, an inevitable issue faced by neural heuristics is how to generalize the trained models to different distributions. In fact, most existing neural heuristics \citep{kool2018attention, joshi2019efficient, kwon2020pomo} for VRPs did not explicitly consider such matter and thus the underlying models trained on one distribution often yield inferior cross-distribution generalization. 

To tackle this issue, we present a multi-view graph contrastive learning (MVGCL) approach for solving VRPs. Given graphs of VRP instances, our key idea is pre-training a GNN encoder to learn useful cross-distribution representations of nodes and graphs, which can be used to enhance the generalization of the neural heuristics. Unlike CV \citep{tian2020contrastive, chen2020improved} or NLP \citep{giorgi2020declutr, gao2021simcse}, in which the common patterns could be similar image patches or words, the common patterns in VRP graphs are more obscure. From the geographic view, 1) a pattern could be clusters where customers concentrate on small areas; 2) it could also be hollows where only a few or even no customers exist. Rather than dominating the whole graph, these patterns may exist locally as small subgraphs  across various distributions. Regarding the neural heuristics for routing problems~\citep{kool2018attention,kwon2020pomo}, explicitly (pre-)training over such patterns may help them learn more informative representation and potentially alleviate overfitting, thus fostering the generalization capability. To this end, in our MVGCL, we pre-train a GNN encoder on the corpus with various instance distributions for learning the representation of local structural patterns of nodes and global patterns of graphs. Then the learnt node embedding is employed to facilitate the neural heuristic (i.e. POMO~\citep{kwon2020pomo}) trained with reinforcement learning, and the learnt graph embedding is employed to facilitate the active search~\citep{hottung2022efficient} equipped to the neural heuristic during inference. The overall framework of our MVGCL is depicted in Figure~\ref{fig:architecture}. 


%Considering practical routing problems with diverse distributions, an inevitable problem is that how the trained model can generalize to well solve instances with different distributions. Most of existing neural solvers \cite{kool2018attention, joshi2019efficient, kwon2020pomo} for routing problems did not explicitly consider such generalization issue \colorb{and thus the trained model on one distribution often cause inferior cross-distribution generalization performance.} To tackle the issue, we present a multi-view graph contrastive learning (MVGCL) approach for solving VRPs in this paper. Given graphs that represent routing problems, our key idea is pre-training \colorr{an GNN encoder} to mine useful cross-distribution (node or graph) representation, which can be used to facilitate the neural solver for routing problems, in terms of generalization. Unlike CV \cite{tian2020contrastive, chen2020improved} and NLP \cite{giorgi2020declutr, gao2021simcse}, in which the common pattern could be similar image patches or words, the common pattern in routing graphs are more obscurate. From the geographic view: one pattern could be clusters where customers concentrate on small areas due to some attractive factor; it also could be hollows where very a few or even no customers exist in certain areas due to resource shortage or extreme environment. These patterns usually exist locally as small subgraphs in graphs from various distributions instead of dominating the whole graph. For neural heuristics of routing problems~\cite{kool2018attention,kwon2020pomo}, training on top of such patterns may help the model escape from local minima on a broader class of distributions and foster the generalization capacity. To find the common patterns for solving routing problems, we propose a MVGCL approach to pre-train a GNN encoder on a corpus with various instance distributions for learning representations of structural patterns of nodes and the overall representation of the graph. 
% Then the learnt node embedding is employed to facilitate the neural heuristic for VRPs (i.e. POMO~\cite{kwon2020pomo}) trained with reinforcement learning, and the learnt graph embedding is employed to facilitate the active search~\cite{hottung2022efficient} equipped to the neural heuristic during inference. The overall framework of the proposed MVGCL is depicted in Figure~\ref{fig:architecture}. 

%and used the pre-trained model as a node feature encoder to facilitate a deep reinforcement learning model $\boldsymbol{M}$, POMO here. In the inference phrase, we actively adjust learnable layers with graph embedding for each unseen instance to achieve better generalization ability.

\begin{figure}
  \centering
  \includegraphics[width=\linewidth]{pretrain.pdf}
  % not allowed \vspace{-1cm}
  \caption{The framework of our node-level and graph-level contrastive learning, where the node augmentation in (a) and graph augmentation in (b) will share the same pre-training paradigm in (c) to attain the GNN $f_q$ ($f_k$ will be discarded).}
  \label{fig:pretrain}
\end{figure}

\subsection{Pre-training with GCL}
In order to identify local patterns for nodes and global patterns for graphs, we exploit graph contrastive learning (GCL) to pre-train graph neural networks (GNN) in a self-supervised fashion. A contrastive learning framework for a specific problem usually comprises a properly defined contrastive objective and query/key samples. Regarding the former, we exploit the InfoNCE for subgraph as the objective function~\citep{wu2018unsupervised, he2020momentum, qiu2020gcc}, given its fit to our problem. 
%With the InfoNCE, contrastive learning is
%GCL is usually viewed as a "looking up a dictionary" task with InfoNCE.
Regarding the latter, typically, there will be an encoded query $\boldsymbol{q}$ (a local or global pattern) and a dictionary with a set of $K+1$ encoded keys $\mathcal{K} = \{\boldsymbol{k}_i\}_{i=0}^{K}$, where both positive keys (similar patterns) and negative keys (dissimilar patterns) exist. The similarity between the query and a key is measured by the dot product $\boldsymbol{q}^{\top} \boldsymbol{k}_{i}$. The positive key $\boldsymbol{k}_{+}$ should match the query $\boldsymbol{q}$, i.e., it shares a similar pattern with $\boldsymbol{q}$ and produces a high value for $\boldsymbol{q}^{\top} \boldsymbol{k}_{+}$. The similarity between $\boldsymbol{q}$ and negative keys should be low. Accordingly, the loss function of InfoNCE in our method is expressed as follows,
\begin{equation}\label{eq:infonce}
 \mathcal{L}_q =-\log \frac{\exp \left(\boldsymbol{q}^{\top} \boldsymbol{k}_{+} / \tau\right)}{\sum_{i=0}^{K} \exp \left(\boldsymbol{q}^{\top} \boldsymbol{k}_{i} / \tau\right)}, 
\end{equation}
where $\tau$ is a hyper-parameter of temperature that regulates the weights of penalties on negative samples~\citep{wu2018unsupervised}. Intuitively, the above $(K+1)$-element softmax loss function encourages the model to classify $\boldsymbol{q}$ as $\boldsymbol{k}_{+}$, which allows our GNN encoder to learn similar representations for nodes or graphs with similar patterns in the VRP instances.
%\colorr{In the following subsection, we elaborate the construction of positive and negative pairs (i.e. query-positive key pairs and query-negative key pairs) for each node to conduct contrastive learning.}

\subsubsection{Node-level representation learning} \label{sec:node}

The performance of contrastive learning highly relates to how contrastive samples are defined for producing query $\boldsymbol{q}$ and keys $\mathcal{K}$. Pertaining to the general graphic tasks, it is natural to define samples as nodes with rich attributes or r-ego subgraphs produced based on the connectivity of nodes \citep{you2020graph,qiu2020gcc}. However, these ideas cannot be directly applied to routing tasks. On the one hand, nodes in VRPs may not carry rich attributes, e.g., only coordinates for TSP (plus demands for CVRP). On the other hand, the connectivity is not informative for
%subtly informative with 
\emph{complete} graphs of VRP. Given these points, we resort to geographic information to define the samples for contrastive learning.  

Intuitively, the decision of visiting the next node in VRP is often subject to its geographical neighbourhoods~\citep{joshi2019efficient, fu2021generalize}, where the probability for visiting a node in the same cluster should be higher than visiting others. It motivates us to strengthen the cross-distribution generalization by mining the local structural patterns. Instead of directly feeding coordinates of nodes into deep models as done in existing neural heuristics, we expect that a pre-trained GNN encoder could empower the neural heuristic with informative local structure representations of each node. In this sense, subgraphs around a node are deemed as natural choices to acquire positive and negative samples for contrastive learning. Therefore, we propose to discriminate subgraphs for different nodes as our pre-training task, where we feed them to the GNN encoder to produce representations for local structural patterns of the nodes.

% the input to the GNN encoder is subgraphs around nodes and the GNN is trained to discover informative representations of local structural patterns for the nodes.
%discriminate similar subgraphs from dissimilar ones.


%From the perspective of contrastive learning, we describe the augmentation process for positive and negative samples here. 
We present the process of collecting positive and negative pairs of samples for each node. Specifically, as demonstrated in Figure~\ref{fig:pretrain} (a), we first extract an $n_q$-nearest-neighbours subgraph for a node and then apply the multi-hop random walk (MHRW) \citep{zhang2013random} on this subgraph to further generate MHRW subgraphs as the augmented samples, which are then fed into the GNN encoder. Since the MHRW subgraphs of the same node may contain similar customer sets and structures, we specify two samples augmented from the same $n_q$-nearest-neighbours subgraph as a positive pair, and those sampled from different nodes as negative ones. For our VRPs, we further specify the weight of walking from node $v_i$ to $v_j$ as $1/c_{ij}$ in MHRW, which is inversely proportional to the distance between them. This setting encourages aggregating more structural information from the vicinity than from the non-vicinity, and such local patterns may potentially foster the generalization across distributions. The details of our adapted MHRW can be found in Appendix A.

Meanwhile, we adopt the Momentum Contrast (MoCo)~\citep{he2020momentum, chen2020improved} mechanism to pre-train the encoder with augmented samples in a contrastive way. In our approach, the MoCo includes an online GNN $f_{q}$ and a smoothly-varying momentum GNN $f_{k}$ as encoding networks with $ \boldsymbol{q}=f_{q}\left(g^{q}\right)$ and $\boldsymbol{k}=f_{k}\left(g^{k}\right)$ ~(Figure \ref{fig:pretrain} (c)), where $g^{q}$ and $g^{k}$ are the MHRW subgraphs sampled from the input graph. To keep consistent dictionary and stable training, the parameters $\theta_{k}$ of $f_{k}$ are updated according to $\theta_{q}$ of $f_{q}$ with a momentum coefficient $m \in [0,1)$ as follows,
\begin{equation}
\theta_{k} \leftarrow m \theta_{k}+(1-m) \theta_{q}.
\end{equation}

% mechanism to build a large and consistent dictionary for contrastive pre-training. To keep the dictionary large, MoCo enqueues samples from preceding mini-batches in each epoch. The MoCo in our method includes an online network $f_{q}$ and a smoothly-varying momentum network $f_{k}$ as encoding networks with $ \boldsymbol{q}=f_{q}\left(g^{q}\right)$ and $\boldsymbol{k}=f_{k}\left(g^{k}\right)$, where $g^{q}$ and $g^{k}$ are the MHRW subgraphs sampled from the input graph. To keep the dictionary consistent, the parameters $\theta_{k}$ of $f_{k}$ are updated according to $\theta_{q}$ of $f_{q}$ with a momentum coefficient $m \in [0,1)$ as follows,

%According to MoCo, we maintain a large queue $\mathbb{K}$ that only enqueues new $\boldsymbol{k}$ from each epoch. In each epoch, we sample a batch of $g^{q}$ and $g^{k}$ augmented from the same node as the positive pairs. The InfoNCE loss in Eq.~(\ref{eq:infonce}) is calculated with the sampled positive pairs and the negative keys (i.e. the key of negative pairs) from $\mathbb{K}$, and it only back-propagates to $f_q$, as shown in Figure~\ref{fig:pretrain} (c).

In each epoch, our approach samples a batch of $g^{q}$ and $g^{k}$ augmented from the same node as the positive pairs to produce $q$ and $k_+$. The MoCo enqueues keys (i.e., the representations produced by $f_k$) from preceding mini-batches in each epoch to maintain a large dictionary without additional back-propagation costs. The InfoNCE loss in Eq.~(\ref{eq:infonce}) is calculated with $q$, $k_+$, and the key representations from the dictionary, and it only back-propagates to $f_q$. Then the node embeddings $\{x_i\}_{i=1}^n$ from this pre-trained GNN $f_q$ will be used to facilitate generalizing the neural heuristic to various local structures of nodes for potentially reducing the route length on unseen VRP graphs.

% , while GNN $f_k$ (in  Figure~\ref{fig:pretrain} (c)) will be discarded}. 
%differentiating between neighbouring nodes for potentially reducing the route length and escaping from the local minima, while GNN $f_k$ (in  Figure~\ref{fig:pretrain} (c)) will be discarded

%As a result, it provides structural information to guide the solution searching direction towards promising areas of the searching space and escape from local minima.


\subsubsection{Graph-level representation learning} \label{sec:graph}
From the multi-view perspective, the node embedding is more specific for local information, while the graph embedding may carry more useful global information. As verified in previous works on the selection of combinatorial optimization solvers~\citep{sievers2019deep, zhao2021towards}, the embedding of the global graph is important for the decision-making to a targeted instance. In our MVGCL, we propose a new augmentation mechanism for the graph embedding (as shown in Figure~\ref{fig:pretrain}~(b)), which could limit the variance of distribution shift to a reasonable level and avoid altering the distribution significantly as encountered in the general GCL augmentations~\citep{you2020graph, zhou2022data}.

First, we normalize the coordinates of each node in the VRP graph to $[0,1]^2$. To generate an augmented graph, for each node of the input VRP graph, we sample a perturbation level $\eta$ from a categorical distribution, i.e., $\eta \in \{0.1, 0.2, 1\}$ with $\eta \sim \text{Categorical}(p_1, p_2, p_3)$, where $p_1+p_2+p_3=1$ and $p_1>p_2 \gg p_3$. Then, the perturbation of the coordinate $\mathbf{v}_i$ of node $v_i$ can be described as follows,
\begin{equation}
\mathbf{v}_i^{\prime}=\mathbf{v}_i+\eta \cdot \Delta \mathbf{v}_i ; \quad \Delta \mathbf{v}_i \sim \mathcal{U}\left[-\eta, \eta \right]^2 ,
\end{equation}
where $\Delta \mathbf{v}_i$ is uniformly sampled in a square with side length $2\eta$. In this way, most (about $p_1 n$) nodes are relocated within their adjacent area $\Delta \mathbf{v}_i \sim \mathcal{U}\left[-0.1, 0.1  \right]^2$; a small part (about $p_2 n$) of nodes are relocated within a larger adjacent area $\Delta \mathbf{v}_i \sim \mathcal{U}\left[-0.2, 0.2  \right]^2$; only very few (about $p_3 n$) nodes are likely relocated to a farther area $\Delta \mathbf{v}_i \sim \mathcal{U}\left[-1, 1  \right]^2$. The idea behind this is that we keep the majority of nodes around their original positions and thus the overall distribution is preserved after perturbation. To boost the generalization of this pre-trained model, we increase augmentation diversity by allowing the minority of nodes to shift farther.

Next, we randomly rotate the perturbed graphs. Then augmentations from the same input graph are deemed as positive pairs and those from different ones are negative pairs. Note that the graph representation learning shares the same training paradigm as the node representation learning (i.e., Figure~\ref{fig:pretrain}~(c)). During the training of MoCo~\citep{he2020momentum}, we feed those pairs into the encoders. Finally, GNN $f_q$ learns to produce representations that are close in the embedding space for similar distributions and also invariant to perturbations.

%\colorp{After pre-training, we use the GNN $f_q$ as the encoder for the subsequent training of the neural heuristic.} 
For solving VRPs, the inference based on active search iteratively updates the model parameters for each individual instance~\citep{Bello2017WorkshopT, hottung2022efficient}, and the incorporation of global graph-aware information has the potential to further boost the performance. To this end, in the inference phase of the neural heuristic, the GNN encoder $f_q$ passes the graph embedding $x_g$ of the input graph, as the auxiliary information to favourably guide the active search.
%, while GNN $f_k$ (in Figure~\ref{fig:pretrain} (c)) will also be discarded.

\subsection{Solving VRPs with pre-trained GNN}

The goal of solving VRPs is to attain a valid trajectory for the given problem instance. Taking TSP as an example, the policy in reinforcement learning iteratively outputs an action $a_t$ to select the next node to visit at step $t$, until all nodes have been included in the final trajectory $\pi=\{a_1,\cdots,a_n\}$. And the pre-trained GNN can be used in both policy optimization and active search in inference. 


\subsubsection{Policy optimization with node embedding} In the training phase, we use Policy Optimization with Multiple Optima (POMO) \citep{kwon2020pomo} as the neural heuristic to learn route construction step by step. POMO employs a self-attention encoder and an attention decoder to generate the solution (route) autoregressively. In our approach, instead of directly feeding coordinates into the attention network~\citep{kwon2020pomo,kim2021learning,zhang2022learning}, we pass the node embeddings $\{x_i\}_{i=1}^n$ from the pre-trained GNN encoder $f_q$ as the input to the self-attention encoder of POMO. In doing so, it is supposed to strengthen the generalization of POMO since more useful local structural information is injected. The attention decoder in POMO accepts the output of its self-attention encoder as the keys and values for its multi-head attention layers. Finally, a softmax layer computes the probabilities of visiting each node at the next step. To preserve desirable training performance, we adopt the techniques of multiple greedy trajectories and instance augmentation with the REINFORCE algorithm \citep{williams1992simple}, as done in the original POMO.

% \subsubsection{Policy optimization with node embedding} In the training phase, we use Policy Optimization with Multiple Optima (POMO) \cite{kwon2020pomo} as the policy optimization method to construct solutions step by step. POMO employs a self-attention encoder and an attention decoder (AttNet) ~\cite{kool2018attention} to generate the solution autoregressively. In our approach, instead of directly feeding coordinates into the attention network as did in previous works \cite{kwon2020pomo,kim2021learning,zhang2022learning}, we pass the node embedding $\boldsymbol{q}$ from the pre-trained GNN encoder $f_q$ as the input to the self-attention encoder of POMO. In this way, it is supposed to strengthen the generalization of POMO since more useful local structural information are injected from the pre-trained GNN. The attention decoder in POMO accepts the output of its self-attention encoder as the keys and values for its multi-head attention layers. Finally, a softmax layer computes the probabilities of visiting each node at the next step. To preserve desirable training performance, we adopt the techniques of multiple greedy trajectories and instance augmentation with the REINFORCE algorithm \cite{williams1992simple}, as did in original POMO~\cite{kwon2020pomo}. 

\subsubsection{Active search with graph embedding}
In the inference phase, to better generalize the trained model on unseen instances, we exploit active search \citep{Bello2017WorkshopT} to dynamically adjust the parameters for each target instance. Since updating all parameters of the model is expensive and impractical during inference, we adopt a technique similar to that of \citet{hottung2022efficient}, which adds instance-specific residual layers before the output layer of the decoder in POMO, and we only update the parameters of these residual layers. In our multi-view approach, the residual layers accept both node embeddings from preceding layers and the graph embedding $x_g$ from the pre-trained encoder, as illustrated in Figure~\ref{fig:architecture}.
Therefore, the representation of the entire graph could be exploited by the instance-specific layers to capture high-level information (e.g. the distribution of nodes) of the targeted VRP graph, so as to more effectively guide the active search to solve the corresponding instance.
Particularly, the $l$-th trainable residual layer is inserted into the attention decoder as 
\begin{equation}   
\begin{split}
h_{l}&=\hat{h}+\left(\operatorname{ReLU}\left(\hat{h} W^{1}_{l}+b^{1}_{l}\right) W^{2}_{l}+b^{2}_{l}\right), \\ 
\hat{h}&=\left\{
\begin{aligned}
& [h_n,x_g]  &,~l = 1, \\
& h_{l-1} &,~l > 1,
\end{aligned}
\right.
\end{split}
\end{equation}
where $W^{1}_{l}$ and $W^{2}_{l}$ are weight matrices; $b^{1}_{l}$ and $b^{2}_{l}$ are bias vectors; and $\hat{h}$ of the first layer is the concatenation of the output $h_n$ of the last attention layer and the graph embedding $x_g$ from the pre-trained GNN.
%In summary, the active search direction is guided by the instance-specific information to adjust for each unseen instance accordingly, which is favourable for achieving better generalization, as shown in the next section. 
For more details of POMO and active search, please refer to Appendix B.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Experiments}
\label{sec:experiments}


Our MVGCL is proposed to pre-train a GNN encoder to assist the neural heuristic in boosting the cross-distribution generalization performance. We evaluate our MVGCL on synthetic TSP and CVRP instances of various distributions, as well as those from the benchmark datasets, i.e., TSPLib and CVRPLib. We also conduct ablation studies to verify the respective key components of our MVGCL. 


\subsection{Experimental Settings}

\paragraph{Baselines.} For TSP, we compare with the exact solver Concorde~\citep{applegate2006concorde} and representative neural heuristics including Attention Model (AM)~\citep{kool2018attention}, Policy Optimization with Multiple Optima (POMO)~\citep{kwon2020pomo}, Learning Collaborative Policies (LCP)~\citep{kim2021learning}, Distributionally Robust Optimization with POMO (DROP)~\citep{jiang2022learning}, Hardness-Adaptive Curriculum (HAC)~\citep{zhang2022learning}, and Efficient Active Search (EAS)~\citep{hottung2022efficient}. Among them, DROP and HAC are recent works for tackling the generalization issue pertaining to routing problems. For CVRP, since it is hard for existing solvers to attain optimal solutions in a reasonable time, we instead use the strong meta-heuristic HGS~\citep{vidal2022hybrid} as a conventional baseline, which reported superior performance to LKH3~\citep{hottung2019neural}. Moreover, since HAC is originally designed for TSP only, we do not consider it for CVRP. We refer to the result of EAS in Table~\ref{tb:ablation} for the ablation study.


\begin{table*}[h]  \small
\setlength{\tabcolsep}{2.8pt}
\centering
%	\vspace{-4mm}
\caption{Results of tour lengths and gaps to Concorde solver on various distributions (TSP)}
\label{tb:tsp}

\begin{threeparttable}
	    \scalebox{0.9}{
\begin{tabular}{lc||c|c|c|c|c|c|c||c|c|c|c|c|c|c}
\toprule
			\multicolumn{2}{c||}{\textbf{Problem}}& \multicolumn{7}{c||}{\textbf{TSP50}} & \multicolumn{7}{c}{\textbf{TSP100}}   \\\midrule
\textbf{Distribution}           & \textbf{Metric} & Concorde & AM &   POMO &  LCP & HAC & DROP & MVGCL & Concorde & AM &  POMO &  LCP & HAC & DROP & MVGCL\\ \midrule\midrule
\multirow{2}{*}{\textbf{Explosion}} & Len.&   4.74 &	4.88 &	4.84 &	4.85 &	4.85 &	4.88 &	\textbf{4.80} 
& 6.09	& 6.31	& 6.22	& 6.23	& 6.38	& 6.27	& \textbf{6.17}
\\  
                           & Gap  &  0.00\% &  2.95\% &	2.11\% &	2.32\% &	2.32\% &	2.95\% &	\textbf{1.27\%} 
                           &  0.00\%& 3.61\%	& 2.13\%	& 2.30\%	& 4.76\%	& 2.96\%	& \textbf{1.31\%}
\\ \midrule
\multirow{2}{*}{\textbf{Compression}} & Len.& 5.22 &	5.37 &	5.33 &	5.32 &	5.35 &	5.34 &	\textbf{5.30} 
& 6.89	& 7.16	& 7.06	& 7.07	& 7.18	& 7.12	& \textbf{7.02}
\\ 
                           & Gap    &  0.00\% &  2.87\% &	2.11\% &	1.92\% &	2.49\% &	2.30\% &	\textbf{1.53\% }
                           &  0.00\%& 3.92\%	& 2.47\%	& 2.61\%	& 4.21\%	& 3.34\%	& \textbf{1.89\%}
\\ \midrule
\multirow{2}{*}{\textbf{Cluster}} & Len.&  5.37 &	5.56 &	5.50 &	5.54 &	5.53 &	5.52 &	\textbf{5.47} 
& 7.26	& 7.58	& 7.45	& 7.48	& 7.63	& 7.46	& \textbf{7.40}
\\ 
                           & Gap  &  0.00\% &  3.54\% &	2.42\% &	3.17\% &	2.98\% &	2.79\% &	\textbf{1.86\%} 
                           &  0.00\%& 4.41\%	& 2.62\%	& 3.03\%	& 5.10\%	& 2.75\%	& \textbf{1.93\%}
\\ \midrule
\multirow{2}{*}{\textbf{Expansion}} & Len.&   4.44 &	4.58 &	4.55 &	4.56 &	4.58 &	4.60 &	\textbf{4.52} 
& 5.57	& 5.80	& 5.72	& 5.78	& 5.88	& 5.74	& \textbf{5.68}
\\  
                           & Gap &  0.00\% &  3.15\% &	2.48\% &	2.70\% &	3.15\% &	3.60\% &	\textbf{1.80\% }
                           &  0.00\%& 4.13\%	& 2.69\%	& 3.77\%	& 5.57\%	& 3.05\%	& \textbf{1.97\%}
\\  \midrule
\multirow{2}{*}{\textbf{Rotation}} & Len.&   4.54 &	4.69 &	4.64 &	4.68 &	4.66 &	4.64 &	\textbf{4.61} 
& 6.02	& 6.28	& 6.17	& 6.20	& 6.34	& 6.23	& \textbf{6.13}
\\ 
                           & Gap     &  0.00\% &  3.30\% &	2.20\% &	3.08\% &	2.64\% &	2.20\% &	\textbf{1.54\%} 
                           &0.00\% & 4.32\%	    & 2.49\%	& 2.99\%	& 5.32\%	& 3.49\%	& \textbf{1.83\%}
\\
                           \midrule
\multicolumn{2}{c||}{\textbf{Avg. Inf. Time (s)}} &    0.08&	0.07&	0.01&	0.53&	0.08&	0.01&	0.87

&0.50	&0.22&	0.02 &	1.50 &	0.23&	0.03&	3.70
  \\                            
                           \bottomrule
\end{tabular}}
\end{threeparttable}

\end{table*}



\paragraph{Implementation.} We adopt the Graph Isomorphism Network (GIN) \citep{xu2018powerful} as the encoders for $f_{q}$ and $f_{k}$, while it is also free to try other GNN variants. We use the same hyperparameters as the original POMO \citep{kwon2020pomo}, except that the batch size is reduced from 64 to 56 for CVRP100 due to the memory limit. During training, we apply \emph{early stopping} when the gap reduction is not significant. We set the number of iterations in active search for each instance to 200. The details of our implementation, including hardware, hyperparameters and network architecture, are presented in Appendix C.



\paragraph{Dataset.}
Instead of solely testing on uniform distribution as most existing neural heuristics did,
% Different from most existing neural heuristics which are merely tested on the uniform distribution, 
we evaluate all methods on various distributions, i.e., Explosion, Compression, Cluster, Expansion and Rotation, respectively. These distributions are more visually and quantitatively diverse than the uniform one \citep{bossek2019evolving}, 
%compared to the rest distributions described in \cite{bossek2019evolving}, 
thus intensifying the hardness for neural heuristics to generalize.
To guarantee the essential diversity of local structural patterns and ensure that the (pre-)training instances are unseen in testing, we generate (pre-)training instances from mixed distributions by 1) sampling an instance uniformly, and 2) then randomly applying three non-repetitive mutation operators\footnote{\url{https://github.com/jakobbossek/tspgen/tree/master/R}} from TSPGEN on this instance. We generate 6M such instances as the pre-training corpus of GCL and another 1.2M for training in our method. For other baselines without pre-training, we use all those 7.2M instances in their training phases. After training, we evaluate the models on 2000 instances from each of the above five testing distributions (i.e., 10000 instances in total). For HAC, we use its built-in data generator (i.e., hardness-adaptive generator) for the best performance. We specify the last applied mutation as the class label for each instance, since the training of DROP needs this label.




\subsection{Generalization on TSP}

In Table \ref{tb:tsp}, we display the average values of tour lengths, gaps to the optimal solutions (attained by Concorde solver) and the inference time, on the unseen instances from the five distributions for TSP50 and TSP100. Overall, the exact solver Concorde performs the best in terms of the tour length, since it is highly specialized for TSP. Among neural heuristic methods, our MVGCL achieves the smallest gap and significantly improves the generalization performance on the five testing distributions. For example, our MVGCL reduces the gap by 2.49\% (1.83\% vs 4.32\%) on Rotation distribution of TSP100 compared to AM, and even for the strong neural baseline POMO, our MVGCL brings 0.82\% (1.31\% vs 2.13\%) reduction of the gap on Explosion distribution of TSP100. While %the generalization-specialized 
HAC fails to generalize well on TSP100 instances with relatively large gaps (4.21--5.57\%), MVGCL consistently delivers smaller gaps (1.31--1.97\%). Meanwhile, DROP exhibits unstable generalization performance, which might be caused by the training on instances of mixed distributions without clear class division. Hence, our MVGCL outperforms the two state-of-the-art methods on cross-distribution generalization. Regarding the efficiency, Concorde, LCP and MVGCL iteratively improve solutions or adjust parameters, and thus incur longer runtimes.


 
%The computation speed of MVGCL can be further improved by running inference with more memory than ours (11G) or with a smaller iteration number.


\begin{table*}[hbt!]\small
\centering
\caption{Results of tour lengths and gaps to HGS solver on various distributions (CVRP)}	
\label{tb:cvrp}
\setlength{\tabcolsep}{5.9pt}
\scalebox{0.9}{
\begin{tabular}{lc||c|c|c|c|c|c||c|c|c|c|c|c}
\toprule
			\multicolumn{2}{c||}{\textbf{Problem}}& \multicolumn{6}{c||}{\textbf{CVRP50}} & \multicolumn{6}{c}{\textbf{CVRP100}}   \\\midrule
\textbf{Distribution}           & \textbf{Metric} & HGS & AM &   POMO &  LCP  & DROP & MVGCL & HGS & AM &  POMO &  LCP  & DROP & MVGCL\\ \midrule\midrule
\multirow{2}{*}{\textbf{Explosion}} & Len.&   9.79	& 10.02	& 9.92	& 9.97	& 9.91	& \textbf{9.81}
 
& 14.30	& 14.79	& 14.65	& 14.73	& 14.70	& \textbf{14.33}	\\  
                           & Gap&   0.00\% & 2.35\%	& 1.33\%	& 1.84\%	& 1.23\%	& \textbf{0.20\%}
                           &   0.00\% & 3.43\%	& 2.45\%	& 3.01\%	& 2.80\%	& \textbf{0.21\%}   \\ \midrule
\multirow{2}{*}{\textbf{Compression}} & Len. & 10.14	& 10.39	& 10.28	& 10.35	& 10.32	& \textbf{10.15}
& 14.82	& 15.36	& 15.21	& 15.30	& 15.32	& \textbf{14.85}	\\ 
                           & Gap&   0.00\%  & 2.47\%	& 1.38\%	& 2.07\%	& 1.78\%	& \textbf{0.10\%}
 
                           &   0.00\% & 3.64\%	& 2.63\%	& 3.24\%	& 3.37\%	& \textbf{0.20\%}    \\ \midrule
\multirow{2}{*}{\textbf{Cluster}} & Len.& 10.35	& 10.60	& 10.49	& 10.57	& 10.49	& \textbf{10.37}
& 15.44	& 15.99	& 15.83	& 15.89	& 15.90	& \textbf{15.48}	 \\ 
                           & Gap&   0.00\%   & 2.42\%	& 1.35\%	& 2.13\%	& 1.35\%	& \textbf{0.19\%}
                           &   0.00\% & 3.56\%	& 2.53\%	& 2.91\%	& 2.98\%	& \textbf{0.26\%}    \\ \midrule
\multirow{2}{*}{\textbf{Expansion}} & Len.& 9.40	& 9.64	& 9.53	& 9.60	& 9.61	& \textbf{9.42}
& 13.70	& 14.18	& 14.02	& 14.14	& 14.19	& \textbf{13.74}	 \\  
                           & Gap&   0.00\%    & 2.55\%	& 1.38\%	& 2.13\%	& 2.23\%	& \textbf{0.21\%}
                           &   0.00\% & 3.50\%	& 2.34\%	& 3.21\%	& 3.58\%	& \textbf{0.29\%}   \\  \midrule
\multirow{2}{*}{\textbf{Rotation}} & Len.& 9.43	& 9.66	& 9.56	& 9.64	& 9.58	& \textbf{9.45}
& 13.97	& 14.46	& 14.30	& 14.42	& 14.39	& \textbf{14.00}	\\ 
                           & Gap&   0.00\%    & 2.44\%	& 1.38\%	& 2.23\%	& 1.59\%	& \textbf{0.21\%}
                           &   0.00\% & 3.51\%	& 2.36\%	& 3.22\%	& 3.01\%	& \textbf{0.21\%}  \\ 
                           \midrule
\multicolumn{2}{c||}{\textbf{Avg. Inf. Time (s)}} &    30       &   0.22   & 0.01   &   2.83   &     0.01    &   1.07 
&  30    &  0.29    &  0.03  &   5.85   &   0.05 &  4.43 \\                               
                           \bottomrule
\end{tabular}}

\end{table*}

\subsection{Generalization on CVRP} 
\label{sec:experiments2}

We display the results for CVRP50 and CVRP100 in Table~\ref{tb:cvrp}. As aforementioned, it is much harder to find the optimal solution for CVRP, thus we specify the heuristic solver HGS with runtime 30s as the baseline to compute the gaps. 
%Compared to the desirable results of baselines in their original paper (i.e. they compared with LKH3 rather than HGS), the performance of AM, POMO and LCP obviously deteriorate when generalizing to different distributions. 
Despite the good performance of AM, POMO and LCP on the uniform distribution (as reported in their original papers), we observe that they drastically deteriorate when generalizing to other distributions. For example, the strong neural baseline POMO reported gaps of 0.45\% and 0.32\% for CVRP50 and CVRP100 on the uniform distribution in its original paper, whereas its gaps increase to about 1.4\% and 2.5\%, respectively, on the five distributions. In fact, generalizing to different distributions on CVRP100 is challenging for neural baselines, which yield large gaps around 2.3\%--3.7\%. However, our MVGCL achieves significantly smaller gaps (by up to $10\times$) than those of neural baselines, which are comparable to HGS that runs much longer. The reason might be that our MVGCL is less sensitive to the varied distributions as it exploits the universal local patterns.
%While LCP also iteratively improves the solution, \colorp{it fails to generalize to harder distributions since its parameters are fixed during inference}. In contrast, our MVGCL is more flexible by optimizing additional neural layers during inference and yields stable performance.
Furthermore, we also present results on TSP and CVRP instances of uniform distribution in Appendix D. To sum up, our method attains higher-quality solutions over most of the distributions.

 \begin{table}[t]\small
\setlength{\tabcolsep}{1pt}
\renewcommand{\arraystretch}{1.2}
\centering
\caption{Results on TSPLib and CVRPLib}
\label{tb:cvrplib}
\scalebox{0.86}{
\begin{tabular}{lc||c|c|c|c|c|c|c}
\toprule
\textbf{Dataset}           & \textbf{Metric} & Opt. & AM & POMO &  LCP & HAC & DROP & MVGCL \\ \midrule\midrule
\multirow{2}{*}{ \textbf{TSPLib} } & Len.   &     6.86      &   8.02  &  7.44  &  7.48   & 8.65    &  7.48     &     \textbf{7.05}      \\  
                           & Gap         &     0.00\%    &  10.53\%&  5.16\%&  5.92\% & 16.75\% &  5.79\%   &     \textbf{1.58\%}      \\ \midrule
\multicolumn{2}{c||}{\textbf{Avg. Time (s)}}             &     -     &   0.48   &  0.47  & 69.26  & 0.48  &  0.35  &   48.11    \\ \midrule\midrule
\multirow{2}{*}{ \textbf{CVRPLib} } & Len.   &     16.97      &   17.82   &  17.71  &  17.83    & - & 17.84   &  \textbf{17.09}         \\  
                           & Gap              &     0.00\%     &  6.05\%    &  4.52\%  &  5.23\%   & -&  5.25\%   &      \textbf{0.70\%}      \\ \midrule
\multicolumn{2}{c||}{\textbf{Avg. Time (s)}}             &     -     &  0.29    &  0.03  &  7.22  &- &  0.05   &      4.8 \\\bottomrule
\end{tabular}}

% \vspace{-2mm} %remove it in preprint
\end{table}
 
\subsection{Results on Benchmarks}
\label{sec:experiments3}
We continue to evaluate our MVGCL on public benchmark datasets, i.e., TSPLib \citep{reinelt1991tsplib} and CVRPLib \citep{queiroga202110}, to demonstrate that our method is also effective in addressing more realistic distributions. Regarding TSPLib, we solve the instances with 51--299 nodes. Regarding CVRPLib, we solve XML100, which contains 10000 CVRP100 instances with heterogeneous distributions. The coordinates in each of the above instances are normalized to $[0, 1]$ for a fair comparison. Instead of training a new model for each testing instance set with identical distribution~\citep{hottung2022efficient}, we directly use the models trained on TSP100 and CVRP100 (with mixed distributions) to solve those instances.


%TSPLib is a library of TSP instances from various sources and of various types, which is desirable for testing generalization. 

The upper half of Table \ref{tb:cvrplib} shows the average results on TSPLib instances. It is revealed that our MVGCL can generalize well to real-world distributions and varied sizes, with a low gap (1.58\%), which is significantly smaller than those of neural baselines (i.e. 5.16\%-16.75\%). The advantage of our MVGCL over HAC suggests that training with similar distributions (e.g. Gaussian mixture distributions) may limit the generalization performance, while pre-training with diverse patterns from various heterogeneous distributions could be more beneficial. The lower half of Table \ref{tb:cvrplib} shows the average results on CVRPLib instances, which reveal that our MVGCL can also generalize well to miscellaneous instances,  %with different distributions of depot location and demand, 
%even though our method is not explicitly trained on them. 
which are completely unseen in training.
% More detailed results on TSPLIB can be found in Appendix E.


%CVRPLib XML100 is a benchmark of 10000 CVRP instances with heterogeneous distributions. Specially, XML100 contains 378 distinct distribution groups for 10000 CVRP100 instances, which is desirable for testing generalization.

\begin{table}[t] \small
\setlength{\tabcolsep}{5.4pt}
\renewcommand{\arraystretch}{1}
\centering 
%\vspace{-4mm}
\caption{Ablation studies on MVGCL}
\label{tb:ablation}
\scalebox{0.93}{	
\begin{tabular}{c||c|c|c||c|c}
\toprule
& \multicolumn{3}{c||}{\textbf{Component}}                                                            & \multicolumn{2}{c}{\textbf{TSP100}} \\ 
Name              & Node Embed.              & Graph Embed.             & EAS                         & Len.        & Gap.         \\ \midrule\midrule
M1 & \XSolidBrush & \XSolidBrush & \XSolidBrush & 6.760       & 2.41\%       \\\midrule
M2 &\Checkmark & \XSolidBrush & \XSolidBrush & 6.732       & 1.98\%       \\\midrule
M3 &\Checkmark  & \XSolidBrush &    \Checkmark   & 6.722       & 1.83\%       \\\midrule
M4 &\XSolidBrush & \XSolidBrush &      \Checkmark      & 6.729       & 1.94\%       \\\midrule
M5 &\XSolidBrush &     \Checkmark    &    \Checkmark         & 6.725       & 1.88\%       \\\midrule
M6$^{\prime}$ &      \Checkmark   &   \Checkmark\kern-1.1ex\raisebox{1.2ex}{\rotatebox[origin=c]{125}{--}}        &      \Checkmark          & 6.720       & 1.80\%  \\
\midrule
M6 &      \Checkmark   &         \Checkmark       &      \Checkmark          & \textbf{6.717}       & \textbf{1.76\%}  \\
 \bottomrule
\end{tabular}}


% \vspace{-2mm}
\end{table}

\subsection{Ablation Studies}
We further conduct ablation studies to verify the effectiveness of key components in our MVGCL, where we take TSP100 as an exemplary case. In Table~\ref{tb:ablation}, we ablate three components and report the average results over all 10000 instances of the five distributions. The comparison between M1 and M2 shows that the node embedding from our node-level GCL is helpful for generalization, which reduces the gap (2.41\%) of the original POMO (M1) by 0.43\%. The comparison between M2 and M3 shows that the active search with additional neural layers (referred to as EAS) \citep{hottung2022efficient} can further reduce the gap by 0.15\%. To verify the effectiveness of the distribution-preserved augmentation in our graph-level GCL, we use the pooling result of all node embeddings produced by the node-level GCL as the graph embedding $x_g$ (M6$^\prime$). We can only see a slight difference compared to M3, which implies that solely providing local information (M6$^\prime$) cannot capture the overall distribution for more effective active search. In contrast, we can observe from M3 vs.\ M6 and M4 vs.\ M5 that guiding the active search by the graph embedding produced by our graph-level GCL (M5 and M6) can significantly improve the generalization performance. Finally, our MVGCL with all components (M6) achieves the highest gap reduction (0.65\%) compared to M1, which empirically verifies the importance of learning universal local patterns and the effectiveness of global graph embedding for active search.


% \subsection{Additional Experiments}
% We provide more experiments to comprehensively evaluate the generalization performance of MVGCL. In general, we experiment on large-scale instances, which show...; we experiment with GCN, which show... Please refer to Appendix XXX for details.

\section{Conclusions and Future Works} \label{sec:conclusion}

In this paper, we propose a multi-view graph contrastive learning approach to leverage node-level local patterns and graph-level global representation for neural heuristics equipped with active search to solve VRPs. Extensive experiments on synthetic instances and benchmark instances (TSPLib and CVRPLib) of various distributions show that our MVGCL significantly improves the cross-distribution generalization performance.
%Generalization to VRPs with various distributions is challenging for neural heuristics. In this paper, we propose to exploit multi-view graph contrastive learning to discover generalizable node-level local patterns and graph-level global representation for the neural routing heuristic and the active search, respectively. Extensive experiments on synthetic VRP instances and realistic datasets (TSPLib and CVRPLib) from various distributions show that our MVGCL significantly improves cross-distribution generalization performance. 
%Our method performs favourably against the baselines, including generalization-specialized ones, which verifies the effectiveness of the contrastive learning scheme in fostering the generalization ability of neural heuristics. 
In the future, we plan to further improve the inference efficiency of our MVGCL.

% \begin{acknowledgements} % will be removed in pdf for initial submission,
% 						 % (without ‘accepted’ option in \documentclass)
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.
 
%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% References
\bibliography{uai2023-ref}
\end{document}
