% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example




% customized package
\usepackage{textcomp}
\usepackage{stfloats}
\usepackage{url}
\usepackage{verbatim}
\usepackage{graphicx}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{array}
\usepackage{hyperref}
\usepackage{subfigure}
\usepackage{epsfig}
\usepackage{wrapfig}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{kantlipsum}
\usepackage{amsfonts}
\usepackage{amsthm}  % bold theorem
\usepackage[justification=centering]{caption}
% customized command
\newcommand{\colorr}[1]{\textcolor{black}{#1}}
\newcommand{\rebuttal}[1]{\textcolor{black}{#1}}
\newcommand{\colorb}[1]{\textcolor{black}{#1}}
\hypersetup{
    colorlinks=true,
    citecolor=blue,
    linkcolor=blue,
}
\newsavebox\CBox
\def\textBF#1{\sbox\CBox{#1}\resizebox{\wd\CBox}{\ht\CBox}{\textbf{#1}}}





\title{Learning Topological Representations with Bidirectional Graph Attention Network for Solving Job Shop Scheduling Problem}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Cong Zhang}
\author[2]{Zhiguang Cao}
\author[3]{Yaoxin Wu}
\author[4]{Wen Song$^{\dagger, }$}
\author[1]{Jin Sun}
% Add affiliations after the authors
\affil[1]{Nanyang Technological University, Singapore}
\affil[2]{Singapore Management University, Singapore}
\affil[3]{Department of Industrial Engineering \& Innovation Sciences, Eindhoven University of Technology}
\affil[4]{Institute of Marine Science and Technology, Shandong University, China}
\affil[1,4]{\texttt{\{cong.zhang92@gmail.com, wensong@email.sdu.edu.cn\}}}

  
\begin{document}
\maketitle
\def\thefootnote{$\dagger$}\footnotetext{corresponding author.}


\begin{abstract}
%The job shop scheduling problem (JSSP) is a well-known NP-hard combinatorial optimization problem with extensive applications in modern manufacturing production systems. The existing neural methods for solving JSSP formulate it as a Markov decision process and leverage the underlying disjunctive graphs (DGs) to represent the states. Then, a deep reinforcement learning (DRL) agent based on the graph neural network (GNN) is adopted to construct or search for optimal solutions. However, they usually use off-the-shelf GNN models tailored to undirected graphs and neglect the rich and meaningful topological structures of DGs. This paper proposes the topology-aware bidirectional graph attention network (TBGAT), a novel GNN architecture based on the attention mechanism, to embed the DG for solving JSSP in a local search framework. Specifically, TBGAT embeds the DG from a forward and a backward view, respectively, where the messages are propagated by following the different topologies of the views and aggregated via graph attention. Then, we propose a novel operator based on the message-passing mechanism to calculate the forward and backward topological sorts of the DG, which are the features for characterizing the topological structures and exploited by our model. In addition, we theoretically and experimentally show that TBGAT has linear computational complexity to the number of jobs and machines, respectively, which strengthens the practical value of our method. Besides, extensive experiments on five synthetic datasets and seven classic benchmarks show that TBGAT achieves new SOTA results by outperforming a wide range of neural methods by a large margin.
% The job shop scheduling problem (JSSP) is a well-known NP-hard combinatorial optimization problem with extensive applications in modern manufacturing production systems. The existing neural methods for solving JSSP formulate it as a Markov decision process and leverage the underlying disjunctive graphs (DGs) to represent the states. Then, a deep reinforcement learning (DRL) agent based on the graph neural network (GNN) is adopted to construct or search for optimal solutions. However, they usually use off-the-shelf GNN models tailored to undirected graphs and neglect the rich and meaningful topological structures of DGs. This paper proposes the topology-aware bidirectional graph attention network (TBGAT), a novel GNN architecture based on the attention mechanism, to embed the DG for solving JSSP in a local search framework. Specifically, TBGAT embeds the DG from a forward and a backward view, respectively, where the messages are propagated by following the different topologies of the views and aggregated via graph attention. Then, we propose a novel operator based on the message-passing mechanism to calculate the forward and backward topological sorts of the DG, which are the features for characterizing the topological structures and exploited by our model. In addition, we theoretically and experimentally show that TBGAT has linear computational complexity to the number of jobs and machines, respectively, which strengthens the practical value of our method. Besides, extensive experiments on five synthetic datasets and seven classic benchmarks show that TBGAT achieves new SOTA results by outperforming a wide range of neural methods by a large margin.
Existing learning-based methods for solving job shop scheduling problems (JSSP) usually use off-the-shelf GNN models tailored to undirected graphs and neglect the rich and meaningful topological structures of disjunctive graphs (DGs). This paper proposes the topology-aware bidirectional graph attention network (TBGAT), a novel GNN architecture based on the attention mechanism, to embed the DG for solving JSSP in a local search framework. Specifically, TBGAT embeds the DG from a forward and a backward view, respectively, where the messages are propagated by following the different topologies of the views and aggregated via graph attention. Then, we propose a novel operator based on the message-passing mechanism to calculate the forward and backward topological sorts of the DG, which are the features for characterizing the topological structures and exploited by our model. In addition, we theoretically and experimentally show that TBGAT has linear computational complexity with respect to the number of jobs and machines, respectively, strengthening our method's practical value. Besides, extensive experiments on five synthetic datasets and seven classic benchmarks show that TBGAT achieves new SOTA results by outperforming a wide range of neural methods by a large margin. All the code and data are publicly available online at \url{https://github.com/zcaicaros/TBGAT}.
\end{abstract}

\section{Introduction}\label{sec:intro}
\rebuttal{Production and logistics are integral components of contemporary manufacturing systems. Developing intelligent solutions for complex challenges in these domains through the application of machine learning (especially deep learning) techniques holds significant potential to advance the manufacturing industry and has become a topic of growing interest. Vehicle routing problems (VRP)~\cite{golden2008vehicle} in logistics have become increasingly popular. In contrast, the job shop scheduling problem (JSSP)~\cite{garey1976complexity}, which is more complex but has substantial applications in modern production systems, receives relatively less attention.}
% \rebuttal{Production and logistics are integral components of contemporary manufacturing systems. Developing intelligent solutions for complex challenges in these domains through the application of machine learning (especially the deep learning) techniques holds significant potential to advance the manufacturing industry and has become a topic of growing interest}
% \colorr{Learning to solve vehicle routing problems (VRP)~\cite{golden2008vehicle} has become increasingly popular. In contrast, the job shop scheduling problem (JSSP)~\cite{garey1976complexity}, which is more complex, receives relatively less attention. }

In most existing research on VRP, fully connected undirected graphs are commonly employed to model the interrelationships between nodes (customers and the depot). This graphical representation enables a range of algorithms to leverage graph neural networks (GNN) for learning problem representations, subsequently facilitating the resolution of VRP~\cite{kool2018attention, xin2021multi, joshi2022learning}. However, this densely connected topological structure is not applicable to JSSP, as it cannot represent the widespread precedence constraints among operations in a job. As a result, the mainstream research on JSSP utilizes disjunctive graphs (DG)~\cite{blazewicz2000disjunctive}, a sparse and directed graphical model, to depict instances and (partial) solutions. Presently, there are two emerging neural approaches for learning to solve JSSP based on disjunctive graphs. The first neural approach, predominantly featured in existing literature, focuses on learning construction heuristics, which adhere to a dispatching procedure that incrementally develops schedules from partial ones~\cite{NEURIPS2020_11958dfe, park2021learning, park2021schedulenet}. Nonetheless, this method is ill-suited for incorporating diverse work-in-progress (WIP) information (e.g., current machine load and job status) into the disjunctive graph~\cite{zhang2024Deep}. The omission of such crucial information negatively impacts the performance of these construction heuristics. The second neural approach involves learning improvement heuristics for JSSP~\cite{zhang2024Deep}, wherein disjunctive graphs represent complete solutions to be refined, effectively transforming the scheduling problem into a graph optimization problem so as to bypass the issues faced by partial schedule representation.

A prevalent approach in the aforementioned research involves utilizing canonical graph neural network (GNN) models, originally designed for undirected graphs, as the foundation for learning disjunctive graph embeddings. We contend that this approach may be inadvisable. Specifically, disjunctive graphs were initially introduced as a class of directed graphs, wherein the arc directions represent the processing order between operations. Notably, when modelling complete solutions, disjunctive graphs transform into directed acyclic graphs (DAGs), with their topological structures (node connectivity) exhibiting a bijective correspondence with the solution (schedule) space~\cite{blazewicz2000disjunctive}. Learning these structures can substantially assist GNNs in acquiring in-depth knowledge of disjunctive graphs, enabling the differentiation between high-quality and inferior solutions. Nevertheless, conventional GNN models for undirected graphs lack the components necessary to accommodate these topological peculiarities during the learning of representations.
% As a result, this inappropriate application of GNN models may lead to the loss of crucial information and yield suboptimal learning outcomes for addressing JSSP.

% This paper introduces an end-to-end neural local search algorithm for addressing JSSP \colorr{with} disjunctive graphs \colorr{representing} complete solutions. We propose a novel topology-aware bidirectional graph attention network (TBGAT) for learning disjunctive graph embeddings to facilitate local move selection. \colorr{In contrast} to GNN models designed for undirected graphs, TBGAT effectively captures the unique topological features of disjunctive graphs. TBGAT comprises two independent graph attention modules, which learn the forward and backward views of the disjunctive graphs, respectively. Specifically, the forward view propagates messages from the \colorr{root} to the \colorr{leaf} of the graph, incorporating forward topological sorts and the earliest starting time for all operations. In this forward-view message passing, each operation receives messages from its precedent peers (those completed earlier), considered as the historical context. Conversely, the backward view propagates messages from the \colorr{leaf} to the \colorr{root} of the schedule, wherein each operation obtains messages from its succeeding neighbours (those processed later), encompassing future schedule information inclusive of backward topological sorts and the latest starting time. We demonstrate that the forward topological order of nodes in disjunctive graphs is equivalent to the global processing orderings of all operations and present a simple yet effective algorithm for calculating them based on the message-passing mechanism \colorr{to facilitate GPU computing}. To promote exploration, we implement the REINFORCE algorithm, regularized with the policy's entropy, for training our agent. Importantly, we provide theoretical analysis and experimental evidence to show that TBGAT exhibits linear computational complexity concerning the number of jobs and machines, a crucial attribute for a practical JSSP solver.

\begin{figure*}[!ht]
    \centering
    \subfigure[A JSSP instance of size $3 \times 3$.]{\includegraphics[width=.32\textwidth]{fig/dg_instance.pdf}\label{fig1a}}
    \hspace{.5em}% Space between image
    \subfigure[A possible solution.]{\includegraphics[width=.32\textwidth]{fig/dg_sol.pdf}\label{fig1b}}
    \hspace{.5em}% Space between image
    \subfigure[A critical path and the critical blocks on it.]{\includegraphics[width=.32\textwidth]{fig/critical_path_and_critical_blocks.pdf}\label{fig1c}}
    \caption{Disjunctive graph representations for JSSP instance and solution.}
    \label{fig1}
\end{figure*}

\colorr{This paper introduces an end-to-end neural local search algorithm for solving JSSP using disjunctive graphs to represent complete solutions. We propose a novel topology-aware bidirectional graph attention network (TBGAT) tailored for disjunctive graphs, effectively capturing their unique topological features. TBGAT utilizes two independent graph attention modules to learn forward and backward views, incorporating forward and backward topological sorts. The forward view propagates messages from the root to the leaf, considering historical context, while the backward view propagates messages in the opposite direction, incorporating future schedule information. We show that forward topological order in disjunctive graphs corresponds to global processing orderings and present an algorithm for efficient GPU computation. For training, we \rebuttal{design a deep reinforcement learning (DRL)-based algorithm, specifically the REINFORCE algorithm with entropy regularization, to train the TBGAT network}. Theoretical analysis and experiments demonstrate TBGAT's linear computational complexity with respect to the number of jobs and machines, a key attribute for practical JSSP solving.}

We evaluate our proposed method against a range of neural approaches for JSSP, utilizing five synthetic datasets and seven classic benchmarks. Comprehensive experimental results demonstrate that our TBGAT model attains new state-of-the-art (SOTA) performance across all datasets, significantly surpassing all neural baselines.
% In addition, our method outperforms the highly optimized constraint programming solver, CP-SAT, from Google OR-Tools suite~\cite{ortools}, particularly on large-scale instances. In a comparison with L2S~\cite{zhang2024Deep}, \colorr{a} neural improvement heuristic \colorr{baseline} for JSSP, we observe that TBGAT substantially outperforms L2S while maintaining \colorr{better} sample efficiency. The advances can be attributed to the more effective graph embedding facilitated by the TBGAT network.





\section{Related Literature}

% Due to the flourishing of artificial intelligence, a trend has recently emerged for solving scheduling-related problems from a machine-learning perspective~\cite{dogan2021machine}. For JSSP, deep reinforcement learning is the most popular machine learning paradigm. Most existing DRL-based methods for JSSP are construction heuristics that learn to construct solutions in a sequential decision-making process. L2D~\cite{NEURIPS2020_11958dfe} is a pioneering and representative study of this idea. In L2D, a GIN-based~\cite{xu2018how} policy learns the latent embeddings of partial solutions represented by disjunctive graphs and selects operations to dispatch to corresponding machines at each construction step. A similar dispatching procedure is also witnessed in RL-GNN~\cite{park2021learning} and ScheduleNet\cite{park2021schedulenet}. To incorporate the current status of machines when making decisions, RL-GNN and ScheduleNet introduce artificial machine nodes with machine-progress information into the disjunctive graph. They treat the augmented disjunctive graphs as undirected ones and propose a type-aware GNN model with two independent modules to extract the embedding of the machine and task nodes. Despite considerable improvement against L2D, the performance is still relatively far from optimality. DGERD~\cite{chen2022deep} follows a similar procedure of L2D, whereas the embedding network is modified from Transformer~\cite{vaswani2017attention}. A recent work, MatNet~\cite{NEURIPS2021_29539ed9}, employs an encoding-decoding framework to learn construction heuristics for flexible flow shop problems. However, it assumes independent machine groups for operations from each stage, which is too strong for JSSP, where operations share the same machine across all stages. JSSenv~\cite{tassel2021reinforcement} is a delicately designed and well-optimized simulator for JSSP, extended from the OpenAI gym's environment suite~\cite{brockman2016openai}. 
Instead of employing the disjunctive graph, JSSenv models and represents the states of partial schedules with Gantt charts~\cite{jain1999deterministic}. JSSenv then proposes a DRL agent to solve JSSP instances individually in an online fashion. However, it is an online method that performs training for each individual instance, hence is slower in computation compared with other works, which can quickly infer the solutions to given instances after being trained offline.

The rapid advancement of artificial intelligence has spurred a growing interest in addressing scheduling-related problems from a machine learning (especially deep learning) perspective~\cite{dogan2021machine}. For JSSP, neural methods based on deep reinforcement learning (DRL) have emerged as the predominant machine learning paradigm. The majority of existing neural methods for JSSP are construction heuristics that sequentially construct solutions through a decision-making process. L2D~\cite{NEURIPS2020_11958dfe} represents a seminal and exemplary study in this area, wherein a GIN-based~\cite{xu2018how} policy learns latent embeddings of partial solutions, represented by disjunctive graphs, and selects operations for dispatch to corresponding machines at each construction step. A similar dispatching procedure is observed in RL-GNN~\cite{park2021learning} and ScheduleNet~\cite{park2021schedulenet}. To incorporate machine status in decision-making, RL-GNN and ScheduleNet introduce artificial machine nodes with machine-progress information into the disjunctive graph. These augmented disjunctive graphs are treated as undirected, and a type-aware GNN model with two independent modules is proposed for extracting machine and task node embeddings. Despite considerable improvements over L2D, the performance remains suboptimal. DGERD~\cite{chen2022deep} follows a procedure similar to L2D but with a Transformer-based embedding network~\cite{vaswani2017attention}. A recent work, MatNet~\cite{NEURIPS2021_29539ed9}, employs an encoding-decoding framework for learning construction heuristics for flexible flow shop problems; however, its assumption of independent machine groups for operations at each stage is overly restrictive for JSSP. JSSenv~\cite{tassel2021reinforcement} presents a carefully designed and well-optimized simulator for JSSP, extending from the OpenAI gym environment suite~\cite{brockman2016openai}. 
Rather than utilizing disjunctive graphs, JSSenv models and represents partial schedule states using Gantt charts~\cite{jain1999deterministic}, and also proposes a DRL agent to solve JSSP instances individually in an online fashion. However, its online nature, requiring training for each instance, results in slower computation than offline-trained methods.

L2S~\cite{zhang2024Deep} significantly narrows the optimality gaps by learning neural improvement heuristics for JSSP, thereby transforming the scheduling problem into a graph structure search problem. Specifically, L2S employs a straightforward local search framework in which a GNN-based agent learns to select pairs of operations for swapping, thus yielding new solutions. The GNN architecture comprises two modules based on GIN~\cite{xu2018how} and GAT~\cite{velivckovic2018graph}, focusing on the disjunctive graph and its subgraphs with different contexts separately. This design introduces two potential issues. Firstly, it is unclear whether GIN can maintain the same discriminative power for directed graphs as for undirected graphs. Secondly, the GAT network cannot allocate distinct attention scores to different neighbours during the representation learning since each node possesses only a single neighbour in either context subgraph, thus rendering the attention mechanism ineffective.

\rebuttal{Existing attention-based GNN variants are readily available to embed DGs, e.g.,~\cite{wang2019heterogeneous}. However, they either adopt a random-walk-based approach for aggregating the neighbourhood information~\cite{iyer2021bi}, which neglects the precedent constraints and machine processing orders and thus leads to inferior performance, or have computational complexity that is not linear (w.r.t.\ the number of jobs and machines), making them less suitable for JSSP.}

% Existing GNN-based approaches overlook the distinct characteristics of disjunctive graphs as directed acyclic graphs (DAGs) during the graph embedding learning process. These methods treat disjunctive graphs as undirected and employ off-the-shelf GNN models, which may not be well-suited for this specific context. Consequently, such an oversight may limit the expressiveness of GNN-based policies, adversely impacting the performance of those neural methods. To address this limitation, we propose a bidirectional GNN embedding network based on the attention mechanism, specifically designed \colorr{to process disjunctive graphs effectively.}

% Apart from disjunctive graphs, alternative representations for JSSP exist. For instance, edgeDQN~\cite{lin2019smart} introduces a DQN-based algorithm for addressing JSSP. By processing feature vectors engineered from JSSP instances, edgeDQN selects priority dispatching rules (PDR) for each machine from a pool of candidate rules. Notably, edgeDQN outperforms the manually designed rules. Other recent studies focus on enhancing the performance or generalization ability of existing methods through advanced search strategies~\cite{hottung2022efficient} or curriculum training strategies~\cite{iklassov2022learning}. While these approaches achieve significant performance improvements, the computational cost is often traded off, rendering the analytical evaluation of these costs challenging.

\begin{figure*}[!ht]
    \centering
    \includegraphics[width=0.8\textwidth]{fig/local_search_framework.pdf}
    \caption{The local search procedure with TBGAT network.}
    \label{fig2}
\end{figure*}

\vspace{-10pt}
\section{Prerequisite}

\subsection{The job shop scheduling problem.} 

A JSSP instance of size $|\mathcal{J}| \times |\mathcal{M}|$ comprises a set of jobs $\mathcal{J}$ and a set of machines $\mathcal{M}$. Each job $j \in \mathcal{J}$ must be processed by each machine $m \in \mathcal{M}$ following a predefined order $O_{j1} \rightarrow \cdots \rightarrow O_{ji} \rightarrow \cdots \rightarrow O_{j{|\mathcal{M}|}}$, where $O_{ji} \in \mathcal{O}$ represents the $i$th operation of job $j$. Each operation $O_{ji}$ is allocated to a machine $m_{ji}$ with a processing time $p_{ji} \in \mathbb{N}$. Let $\mathcal{O}_j$ and $\mathcal{O}_m$ denote the collections of all operations for job $j$ and machine $m$, respectively. The operation $O_{ji}$ can be processed only when all its preceding operations in $\{O_{jk} \mid k < i\} \subset \mathcal{O}_j$ have been completed, which constitutes the precedent constraint. The objective is to identify a schedule $\eta: \mathcal{O} \rightarrow \mathbb{N}$, i.e., the starting time for each operation, that minimizes the makespan $C_{\max} = \max_{O_{ji} \in \mathcal{O}}(\eta(O_{ji}) + p_{ji})$ without violating the precedent constraints.

\subsection{The disjunctive graph representation.} 

Disjunctive graphs~\cite{blazewicz2000disjunctive} can comprehensively represent JSSP instances and solutions. As illustrated in Fig.~\ref{fig1a}, a $3 \times 3$ JSSP instance is represented by its corresponding disjunctive graph $G=\langle \mathcal{O}, \mathcal{C}, \mathcal{D} \rangle$. The artificial operations $O_S \in \mathcal{O}$ and $O_T \in \mathcal{O}$, which possess zero processing time, denote the start and end of the schedule, respectively. The solid arrows represent \textit{conjunctions} ($\mathcal{C}$), which indicate precedent constraints for each job. The two-headed arrows signify \textit{disjunctions} ($\mathcal{D}$) that mutually connect all operations belonging to the same machine, forming machine cliques with distinct colors. Discovering a solution is tantamount to assigning directions to disjunctive arcs such that the resulting graph is a directed acyclic graph (DAG)~\cite{balas1969machine}. For instance, a solution to the JSSP instance in Fig. \ref{fig1a} is presented in Fig. \ref{fig1b}, where the directions of all disjunctive arcs are determined, and the resulting graph is a DAG. Fig. \ref{fig1c} emphasizes a critical path of the solution in Fig. \ref{fig1b}, i.e., the longest path from the source node $O_S$ to the sink node $O_T$, with critical blocks denoted by red frames (\rebuttal{by longest we mean the total processing time of all operations along the path, except $O_S$ and $O_T$, is the largest among all paths from $O_S$ to $O_T$}). The critical blocks are groups of operations belonging to the same machine on a critical path. The sum of the processing times of operations along the critical path represents the makespan of the solution. Identifying a solution with a smaller makespan is equivalent to finding a disjunctive graph with a shorter critical path.





\section{The local search algorithm with proposed TBGAT network}


We adopt a neural local search framework akin to L2S~\cite{zhang2024Deep}, as depicted in Fig. \ref{fig2}. The process commences with an initial solution produced by a construction heuristic (e.g., the dispatching rule), which is preserved as the current seed and the incumbent (the best-so-far solution). Subsequently, the disjunctive graph representation of the present seed is constructed, and the candidate moves (neighbours) are determined by employing the $N_5$ neighbourhood structure~\cite{nowicki1996fast}. This structure generates a candidate solution by exchanging the first or last pair of operations in a critical block along a critical path of the disjunctive graph, as demonstrated in Fig. \ref{fig3}. Following~\cite{nowicki1996fast}, the first pair of operations in the initial critical block and the last pair of operations in the final critical block are excluded. Furthermore, in the presence of multiple critical paths, a random critical path is chosen~\cite{zhang2024Deep,nowicki1996fast}. The TBGAT network subsequently ingests the disjunctive graph of the current seed as input and produces one of the candidate moves. Ultimately, a new seed solution is acquired by swapping the operations in the disjunctive graph according to the selected operation pair, superseding the incumbent if superior. This search procedure persists until a stopping criterion is met, such as reaching a predetermined horizon, e.g., 5000 steps.


The aforementioned local search procedure can be recast in the framework of the Markov decision process (MDP). Specifically, the state, action, reward, and state transition are delineated as follows. 

\textbf{State.} The state $s_t$ at time step $t$ represents the disjunctive graph representation of the seed solution at $t$. \textbf{Action.} The action set $A_t$ at $t$ comprises the candidate moves of $s_t$ calculated by applying the $N_5$ neighborhood structure. It is worth noting that $A_t$ may be dynamic and contingent on different seed solutions, where $A_t=\emptyset$ indicates that the current seed solution is the optimal one~\cite{nowicki1996fast}. \textbf{Reward.} The step-wise reward between any two consecutive states $s_t$ and $s_{t+1}$ is computed as $r(s_t, a_t) = \max\left(C_{\max}(s^*_t) - C_{\max}(s_{t+1}), 0\right)$, with $s^*_t$ denoting the incumbent at step $t$ (so $s^*_0 = s_0$). This is well-defined, as maximizing the cumulative reward is tantamount to maximizing the improvement to the initial solution, since the sum telescopes to $\sum_{t=0}^{T-1} r(s_t, a_t) = C_{\max}(s_0) - C_{\max}(s^*_T)$. \textbf{State transition.} The state $s_t$ deterministically transits to the subsequent state $s_{t+1}$ by executing the selected action $a_t \in A_t$ at $s_t$, i.e., exchanging the operation pair in $s_t$. The episode terminates if $A_t=\emptyset$, beyond which the state transition is ceased.

\begin{figure}[h]
    \centering
    \includegraphics[width=.3\textwidth]{fig/N5_moves.pdf}
    \caption{The $N_5$ neighborhood structure.}
    \label{fig3}
\end{figure}

\subsection{The forward and backward views of DGs}
Disjunctive graphs (DGs) constitute a specific class of DAGs, exhibiting unique features. Firstly, by adhering to the direction of conjunctive and disjunctive arcs, each node $O_{ji}$ (barring $O_S$ and $O_T$) has neighbours from two directions, namely, the predecessors pointing to $O_{ji}$ and the successors pointed to by $O_{ji}$. Nevertheless, existing GNN-based models either neglect the latter neighbors~\cite{NEURIPS2020_11958dfe,zhang2024Deep} or fail to differentiate the orientations of neighbors~\cite{park2021learning}. In contrast, we recognize that neighbours from both directions are crucial and contain complementary information. Secondly, any $O_{ji}$ in a schedule cannot commence processing earlier than its predecessors, owing to the precedent constraints and the determined machine processing order. The earliest timestamp at which $O_{ji}$ may begin without violating these constraints is defined as the earliest starting time $EST_{ji}$. However, $O_{ji}$ is not mandated to start precisely at $EST_{ji}$, as long as starting later does not delay the overall makespan. The latest timestamp at which it may start without doing so is its latest starting time, denoted as $LST_{ji}$. The $EST_{ji}$ and $LST_{ji}$ collectively determine a schedule, and they can be calculated recursively from a \textit{forward} and a \textit{backward} perspective of the disjunctive graph~\cite{jungnickel2005graphs}, respectively, as follows,

\begin{equation}
    EST_{ji} = \max_{O_{nl} \in \mathcal{P}_{O_{ji}}}(EST_{nl}+p_{nl}),
\label{eq1}
\end{equation}

\begin{equation}
    LST_{ji} = \min_{O_{nl} \in \mathcal{S}_{O_{ji}}}(LST_{nl}-p_{ji}),
\label{eq2}
\end{equation}
where $\mathcal{P}_{O_{ji}}$ and $\mathcal{S}_{O_{ji}}$ represent the sets of predecessors and successors of $O_{ji}$, respectively. In the forward perspective, the computation traverses each $O_{ji}$ by adhering to the directions of conjunctive and disjunctive arcs, which are inverted in the backward perspective. The forward and backward perspectives of message flow are illustrated in Fig.~\ref{fig4}.

\begin{figure}[!ht]
    \centering
    \includegraphics[width=.3\textwidth]{fig/forward_and_backward_view.pdf}
    \caption{The forward and backward view of the DG.}
    \label{fig4}
\end{figure}

The processing order of operations intrinsically determines the quality of a schedule, as it defines the message flows of both forward and backward perspectives, represented by the connections of nodes and the orientations of arcs within the graph, i.e., the graph topology. Furthermore, a one-to-one correspondence exists between the space of disjunctive graph topologies and the space of feasible schedules. In other words, any two disjunctive graphs possessing distinct topologies correspond to distinct schedules, which generally differ in quality. Hence, enabling the agent to leverage such topological features to learn discriminative embeddings for various schedules is highly advantageous, as it aids in distinguishing superior schedules from inferior ones. To achieve this, we employ the topological sort~\cite{wang2009electronic}, an ordering of the nodes in a DAG that respects their connectivity dependencies, as the topological features. The formal definition of the topological sort of nodes in the disjunctive graph is provided in Definition~\ref{def1} below.

\newtheorem{definition}{Definition}
\begin{definition} (\textbf{topological sort})
    Given any disjunctive graph $G = \langle \mathcal{O}, \mathcal{C}, \mathcal{D} \rangle$, there is a topological sort $\Phi: \mathcal{O} \rightarrow \mathbb{Z}$ such that for any pair of operations $O$ and $O'$ if there is an arc (disjunctive or conjunctive) connecting them as $O \rightarrow O'$, then $\Phi(O) < \Phi(O')$ must hold.
    \label{def1}
\end{definition}

Furthermore, within the context of JSSP, for any two operations $O, O' \in \mathcal{O}$, if $O$ is a prerequisite operation of $O'$, that is, $O$ must be processed prior to $O'$, then $O$ is required to rank ahead of $O'$ in the topological sort, i.e., $\Phi(O) < \Phi(O')$. This demonstrates that the topological sort serves as an alternative representation depicting the processing orders defined by the precedence constraints and the processing sequence of machines. Formally,

\newtheorem{lemma}{Lemma}
\begin{lemma}
    For any two operations $O_{ji}, O_{mk} \in \mathcal{O}$, if $O_{ji}$ is a prerequisite operation of $O_{mk}$, then $\overrightarrow{\Phi}(O_{ji}) < \overrightarrow{\Phi}(O_{mk})$ and $EST_{ji} < EST_{mk}$, where $\overrightarrow{\Phi}: \mathcal{O} \rightarrow \mathbb{Z}$ is the topological sort calculated from the \textbf{forward} view of the disjunctive graph.
    \label{lem1}
\end{lemma}

\colorr{The proof is in Appendix~\ref{proof_lemma1}}. A parallel conclusion can be derived for the backward view of the disjunctive graph, as presented below.

\newtheorem{corollary}{Corollary}
\begin{corollary}
    For any two operations $O_{ji}, O_{mk} \in \mathcal{O}$, if $O_{ji}$ is a prerequisite operation of $O_{mk}$, then $\overleftarrow{\Phi}(O_{ji}) > \overleftarrow{\Phi}(O_{mk})$ and $LST_{ji} < LST_{mk}$, where $\overleftarrow{\Phi}: \mathcal{O} \rightarrow \mathbb{Z}$ is the topological sort calculated from the \textbf{backward} view of the disjunctive graph.
    \label{cor1}
\end{corollary}

% \colorr{The proof is in Appendix~\ref{proof_corollary1}}.The forward and backward topological sorts are key features our GNN model aims to exploit while learning the latent representations of disjunctive graphs. These features not only reflect the topology of the disjunctive graph but also the global processing orders of operations (i.e., Lemma \ref{lem1} and Corollary \ref{cor1}). However, employing traditional algorithms to calculate these two features can be computationally demanding for training the DRL agent in light of three practical reasons. Firstly, DRL algorithms often utilize batch processing to expedite and stabilize the training process, in which a batch of instances is gathered and processed concurrently. However, it is challenging to simultaneously calculate the forward and backward topological sorts for multiple disjunctive graphs within a batch using traditional algorithms, as they rely on serial processing. Secondly, DRL agent training typically occurs on GPUs, while traditional methods compute topological sorts on CPUs, resulting in substantial communication overhead between the two devices and further impairing training efficiency. Thirdly, the well-known data inefficiency of DRL agent training~\cite{dorner2021measuring} exacerbates the computational burden, as it necessitates an immense amount of data to converge. To address these difficulties, we introduce a novel algorithm called MPTS, based on the \textit{message-passing} mechanism inspired by GNN computation. This algorithm enables batch computation of forward and backward topological sorts and is compatible with GPU processing.
\colorr{The proof is in Appendix~\ref{proof_corollary1}. Our GNN model leverages forward and backward topological sorts to learn latent representations of disjunctive graphs. These sorts capture graph topology and global processing orders (Lemma \ref{lem1} and Corollary \ref{cor1}). However, traditional algorithms for these sorts pose challenges for DRL agent training due to batch processing complexities, GPU-CPU communication overhead, and DRL data inefficiency. To overcome these issues, we propose MPTS, a novel algorithm based on a message-passing mechanism inspired by GNN computation. MPTS facilitates batch computation of forward and backward topological sorts, ensuring compatibility with GPU processing for more efficient training.}

In fact, MPTS is universally applicable to any directed acyclic graph (DAG). Specifically, consider a $DAG=\langle \mathcal{O}^*, \mathcal{E}^* \rangle$ with $\mathcal{O}^*$ and $\mathcal{E}^*$ denoting the sets of nodes and arcs, respectively. Let $\Bar{\mathcal{O}} \subset \mathcal{O}^*$ be the set of nodes with zero in-degrees, and $\Tilde{\mathcal{O}} = \mathcal{O}^* \setminus \Bar{\mathcal{O}}$ be the set of the remaining nodes, where $\Tilde{\mathcal{O}'} \subset \Tilde{\mathcal{O}}$ is the set of nodes with zero out-degrees. We assign a message $m^t_x$ to each node $O_x \in \mathcal{O}^*$, initialized as $m^0_{\Bar{x}}=\rebuttal{1}$ for nodes $O_{\Bar{x}} \in \Bar{\mathcal{O}}$ and $m^0_{\Tilde{x}}=\rebuttal{0}$ for nodes $O_{\Tilde{x}} \in \Tilde{\mathcal{O}}$. Next, we define a message-passing operator $MPO: m^{t}_x \rightarrow m^{t+1}_x$, which calculates and updates the message for each node $O_x$ as $m^{t+1}_x = \min_{O_y \in \mathcal{N}_x}(m^t_y)$, where $\mathcal{N}_x$ is the neighborhood of $O_x$ containing all nodes $O_y$ pointing to $O_x$ (nodes with $\mathcal{N}_x=\emptyset$ keep their messages). Hence, the message of $O_x$ turns to $1$ only once all of its predecessors have message $1$. Finally, letting $L$ denote the length of the global longest path in $DAG$ and $L_x$ the length of a longest path from any node in $\Bar{\mathcal{O}}$ to $O_x$, we can demonstrate the following.

\newtheorem{theorem}{Theorem}
\begin{theorem}
    After applying MPO for $L_x$ times, $m^{L_x}_x = 1$ for all nodes $O_x \in \mathcal{O}^*$. Moreover, for any pair of nodes $O_x$ and $O_z$ connected by a path, if $L_x < L_z \leq L$ then $\Phi(O_x) < \Phi(O_z)$.
    \label{thm1}
\end{theorem}

\begin{figure*}[!ht]
    \centering
    \includegraphics[width=1\textwidth]{fig/forward_module_and_backward_module.pdf}
    \caption{The architecture of the policy network.}
    \label{fig5}
\end{figure*}



\colorr{The proof is in Appendix~\ref{proof_theorem1}}. Theorem~\ref{thm1} suggests an efficient method for computing the topological sort, in which we can iteratively apply the MPO on any $DAG$ and gather the nodes $O_x$ with $m^t_x=1$ at each iteration $0 \leq t \leq |\mathcal{O}^*|$. Consequently, nodes collected in earlier iterations must hold higher ranks than those in later iterations within the topological sort. Since disjunctive graphs also belong to the class of DAGs, Theorem~\ref{thm1} can be directly applied to compute the forward topological sort $\overrightarrow{\Phi}$, as stated in Lemma~\ref{lem1}, and the backward topological sort $\overleftarrow{\Phi}$, as indicated in Corollary~\ref{cor1}, respectively. Given that the MPO operator can be readily implemented on a GPU to leverage its powerful parallel computation capabilities, MPO is anticipated to be more efficient than traditional algorithms when handling multiple disjunctive graphs concurrently. To substantiate this assertion, we provide an empirical comparison in the experimental section.


\subsection{Graph embedding with TBGAT}
In order to effectively learn graph embeddings by leveraging the topological features of disjunctive graphs, we introduce a novel bidirectional graph attention network designed to embed the forward and backward views of the disjunctive graph using two independent modules, respectively. For each view, message propagation adheres to the topology of the respective view, and aggregation is accomplished through an attention mechanism, which distinguishes our model from previous works. The overarching architecture of the proposed TBGAT is depicted in Fig.~\ref{fig5}.


\subsubsection{The forward embedding module}
In the forward view of the disjunctive graph, each node $O_x \in \mathcal{O}$ is associated with a three-dimensional raw feature vector $\overrightarrow{\mathbf{h}}^0_x = (p_x, est_x, \overrightarrow{\Phi}_x) \in \mathbb{R}^3$. Here, $p_x$ represents the processing time of node $O_x$, $est_x$ represents the earliest starting time, and $\overrightarrow{\Phi}_x$ represents the forward topological sort of node $O_x$. The forward embedding module (FEM) is a graph neural network that consists of $L$ layers. Each layer produces a new message for a node by aggregating the messages of this node and its neighbors from the previous layer. The aggregation operator used for updating the message of node $O_x$ is a weighted sum, where the weights are determined by attention scores that indicate the importance of each message. In other words, the aggregation operator for updating the message of node $O_x$ is expressed as follows,

% \vspace{-8pt}
\begin{equation}
    \overrightarrow{\mathbf{h}}^{l+1}_x = \alpha^l_{x,x}\mathbf{\theta}^l_{ag}\overrightarrow{\mathbf{h}}^{l}_x +\hspace{-3mm}\sum_{O_y \in \mathcal{N}(O_x)}\hspace{-3mm}\alpha^l_{x,y}\mathbf{\theta}^l_{ag}\overrightarrow{\mathbf{h}}^{l}_y, 0 \leq l \leq L-1.
\label{eq3}
\end{equation}

The above attention scores $\alpha^l_{x,x}$ and $\alpha^l_{x,y}$ are used as weights in the aggregation operator for updating the message of node $O_x$ in the forward embedding module (FEM). Specifically, $\alpha^l_{x,x}$ and $\alpha^l_{x,y}$ are the attention scores for the messages of node $O_x$ itself and its neighbor $O_y$, respectively; $\mathbf{\theta}^l_{ag}$ and $\mathcal{N}(O_x)$ denote the learnable parameters for the aggregation operator and the neighborhood of node $O_x$, respectively. Particularly, the two attention scores are computed through a widely-used practice in graph neural networks as follows,
%The attention scores are computed using a widely-used practice in graph neural networks.
\begin{equation}
    \alpha^l_{x,*} =
        \frac{
        \exp\left(\mathrm{LReLU}\left([\mathbf{a}^l]^{\top}
        [\mathbf{\theta}^l_{at}\overrightarrow{\mathbf{h}}^l_x \, \Vert \, \mathbf{\theta}^l_{at}\overrightarrow{\mathbf{h}}^l_*]
        \right)\right)}
        {\sum_{k \in \mathcal{N}'(O_x)}
        \exp\left(\mathrm{LReLU}\left([\mathbf{a}^l]^{\top}
        [\mathbf{\theta}^l_{at}\overrightarrow{\mathbf{h}}^l_x \, \Vert \, \mathbf{\theta}^l_{at}\overrightarrow{\mathbf{h}}^l_k]
        \right)\right)},
\label{eq4}
\end{equation}

\noindent where $\Vert$ denotes the concatenation operator; $*$ refers to $x$ or $y$; $\mathcal{N}'(O_x)=\mathcal{N}(O_x) \cup \{O_x\}$; $\mathrm{LReLU}$ is the LeakyReLU activation~\cite{radford2015unsupervised}; and $\mathbf{a}^l$ and $\mathbf{\theta}^l_{at}$ are the learnable parameters.

\subsubsection{The backward embedding module}
The backward embedding module (BEM) has an architecture that is similar to that of the forward embedding module. However, there is a key difference in the raw feature used for each node. Specifically, the raw feature for each node $O_x$ is substituted with the backward hidden state vector $\overleftarrow{\mathbf{h}}^0_x = (p_x, lst_x, \overleftarrow{\Phi}_x) \in \mathbb{R}^3$, where $p_x$ represents the processing time of node $O_x$, $lst_x$ represents the latest starting time of node $O_x$, and $\overleftarrow{\Phi}_x$ represents the backward topological sort of node $O_x$. This allows the backward embedding module to encode the information about the temporal dependencies between nodes in the reverse order of the forward embedding module.


\input{table/result-table-classic-testing}
\input{table/result-table-classic-generalization}

\subsubsection{Merging the forward and backward embeddings}
The merged embedding for a given node $O_x$ is derived by concatenating the output vector of the last layer of the forward embedding module (FEM) and the last layer of the backward embedding module (BEM), which results in a single vector that encodes both the forward and backward temporal dependencies of the node as 

\begin{equation}
    \mathbf{h}^L_x = \overrightarrow{\mathbf{h}}^L_x || \overleftarrow{\mathbf{h}}^L_x, \text{for all } O_x.
\label{eq5}
\end{equation}

Moreover, we concatenate $\mathbf{h}^L_x$ with the graph embedding to form the ultimate embedding for each node as

\begin{equation}
    \mathbf{h}_x = \mathbf{h}^L_x || \mathbf{h}_G, \text{for all } O_x,
\label{eq6}
\end{equation}

\noindent where $\mathbf{h}_G$ is obtained with the mean pooling of the merged node embeddings as $\mathbf{h}_G = \frac{1}{|\mathcal{O}|}\sum_{O_x \in \mathcal{O}} \mathbf{h}^L_x$.


% \vspace{-8pt}
\colorr{\subsection{Action selection}}

Given the node embeddings $\mathbf{h}_x$ and a graph embedding $\mathbf{h}_G$, we calculate a ``score'' for selecting each operation pair in the $N_5$ neighborhood structure as follows. For any pair of operations $(O_x, O_z)$ obtained from the $N_5$ neighborhood structure, we first concatenate the corresponding node embeddings $\mathbf{h}_x$ and $\mathbf{h}_z$ to obtain the joint representation $\mathbf{h}_{xz}$ for the action $(O_x, O_z)$. We then feed $\mathbf{h}_{xz}$ into the action selection network $Net_A$, i.e., a multi-layer perceptron (MLP) with $L_A$ hidden layers, to obtain a scalar score $sc_{xz}$ for the action $(O_x, O_z)$. The score $sc_{xz}$ is then normalized to obtain a probability $p_{xz}$, from which we could sample an action.


\rebuttal{
\begin{theorem}
    The TBGAT network has linear time complexity w.r.t.\ both $|\mathcal{J}|$ and $|\mathcal{M}|$.
    \label{thm2}
\end{theorem}
By following the proof of Theorem 4.1 in~\cite{zhang2024Deep}, it is not difficult to see that the FEM and BEM modules have linear computational complexity. Then, since the action selection network is an MLP, whose cost is also linear in $|\mathcal{J}|$ and $|\mathcal{M}|$, TBGAT has linear computational complexity with respect to both the number of jobs $|\mathcal{J}|$ and the number of machines $|\mathcal{M}|$. $\hfill \square$
}



% \vspace{-8pt}
\colorr{\subsection{The entropy-regularized REINFORCE algorithm}}
% \vspace{-8pt}

\colorr{To} train our policy network, we utilize a modified version of the REINFORCE algorithm proposed by Williams~\cite{williams1992simple}. Our modifications include periodic updates of the policy network parameters, as opposed to updating them only at a fixed step limit $T$. This approach has been shown to improve the generalization of the policy network to larger values of $T$ during testing~\cite{zhang2024Deep}. Additionally, to encourage exploration of the action space, we incorporate a regularization term $\mathcal{H}(\pi_\theta) = - \mathbb{E}_{a\sim\pi_\theta}\log(\pi_\theta(a))$ based on the entropy of the policy $\pi_\theta$ into the original objective of the REINFORCE algorithm. The complete learning procedure is outlined in Algorithm~\ref{algo1} \colorr{in Appendix~\ref{Entropy_regularized_REINFORCE}}.



\section{Experiment}


To comprehensively evaluate the performance of our TBGAT, we conduct a series of experiments on both synthetic and publicly available datasets.

\subsection{Experimental setup}

\colorr{The algorithm configurations can be found in Appendix~\ref{algorithm_config}.}

\subsection{Testing datasets and baselines}
\noindent\textbf{Datasets.} We evaluate the performance of our proposed method on two categories of datasets. The first category comprises synthetic datasets that we generated using the same method as the training dataset. The synthetic dataset includes five different sizes, namely $10\times10$, $15\times10$, $15\times15$, $20\times10$, and $20\times15$, each consisting of $100$ instances. The second category includes seven widely used public benchmark datasets, i.e., Taillard \cite{taillard1993benchmarks}, ABZ \cite{adams1988shifting}, FT \cite{fisher1963probabilistic}, LA \cite{lawrence1984resouce}, SWV \cite{storer1992new}, ORB \cite{applegate1991computational}, and YN \cite{yamada1992genetic}. These datasets contain instances with small and large scales, including those not seen during training, such as $100\times20$, which challenges the generalization ability of our model. It is worth noting that our model is trained with randomly generated synthetic datasets, whereas the seven open benchmark datasets are generated using distributions different from ours. Hence, the results on these classic datasets can be considered the zero-shot generalization performance of our method. We test the model trained on the closest size for each problem size, e.g., the model trained on the size of $10\times10$ is used for testing on the problem of size $10\times10$ or others close to it.




\textbf{Baselines.} In order to demonstrate the superior performance of TBGAT, we conduct a comparative analysis against nine different baseline methods of various genres, including eight state-of-the-art neural approaches spanning construction heuristics (L2D~\cite{NEURIPS2020_11958dfe}, RL-GNN~\cite{park2021learning}, ScheduleNet~\cite{park2021schedulenet}, ACL~\cite{iklassov2022learning}, JSSEnv~\cite{tassel2021reinforcement}, and DGERD~\cite{chen2022deep}), an improvement heuristic (L2S~\cite{zhang2024Deep}), and active search (EAS~\cite{NEURIPS2021_29539ed9}). We also include an exact solver, CP-SAT~\cite{ortools}, which has been shown to be robust and effective in solving JSSP~\cite{da2019industrial} when given sufficient computational time ($3600$ seconds). For each problem size, we report the performance of our method in terms of the average relative gap to the best-known solutions, which are available online (for the seven classic benchmark datasets) or computed optimally with CP-SAT (for the synthetic evaluation dataset).\footnote{Please refer to \url{http://optimizizer.com/TA.php} and \url{http://jobshop.jjvh.nl/}.} For the synthetic datasets, we compare our results against the optimal solution obtained using CP-SAT. The average relative gap is calculated by averaging the gap of each instance, which is defined as follows,

\begin{equation}
    \sigma = (C_{\max} - C^*_{\max})/C^*_{\max} \times 100\%,
\end{equation}

\noindent where $C^*_{\max}$ is the best-known solution (for the classic benchmark datasets) or the optimal solution (for the synthetic datasets).

% \vspace{-10pt}

\colorr{\subsection{Performance on public benchmarks}}

% \subsubsection{Public benchmark}
\colorr{We present the results on public datasets.} To present the evaluation results more clearly, we report the results for 500 improvement steps in Table~\ref{table:2} and the generalization results for different numbers of improvement steps in Table~\ref{table:3}. In addition to the baselines mentioned earlier, we also include RL-GNN~\cite{park2021learning} and ScheduleNet~\cite{park2021schedulenet} for comparison. The tables show that TBGAT performs well when generalized to public benchmarks. Specifically, TBGAT achieves the best performance for all problem sizes and datasets, outperforming CP-SAT with a relative gap of 69.2\% on Taillard $100\times20$ instances with a much shorter computational time of 6.7 minutes in Table~\ref{table:3}, compared to 1 hour taken by CP-SAT. Moreover, TBGAT can find optimal solutions for several benchmark datasets with different scales, such as FT $6\times6$, LA $15\times5$, LA $20\times5$, and LA $30\times10$, while L2S fails to do so. These results confirm that TBGAT achieves state-of-the-art results on the seven classic benchmarks and is relatively robust to different data distributions, as the instances in these datasets are generated using distributions substantially different from our training.

% \vspace{-5pt}
\colorr{\subsection{Comparison with other SOTA baselines}}

\colorr{Due to the page limit, we defer the comparison against other SOTA baselines, including ACL~\cite{iklassov2022learning}, JSSEnv~\cite{tassel2021reinforcement}, DGERD~\cite{chen2022deep}, and the active search method EAS~\cite{NEURIPS2021_29539ed9}, to Appendix~\ref{moreresults}.}

% \vspace{-5pt}
\colorr{\subsection{Ablation study}}

\colorr{We conducted an ablation study on the number of attention heads. We also empirically verify the linear computational complexity. Please refer to Appendix~\ref{abstudy} for details.}

\section{Conclusion and future work}
\colorr{We} present a novel solution to the job shop scheduling problem (JSSP) using the topological-aware bidirectional graph attention neural network (TBGAT). Our method learns representations of disjunctive graphs by embedding them from both forward and backward views and utilizing topological sorts to enhance topological awareness. We also propose an efficient method to calculate the topological sorts for both views and integrate the TBGAT model into a local search framework for solving JSSP. Our experiments show that TBGAT outperforms a wide range of state-of-the-art neural baselines regarding solution quality and computational overhead. Additionally, we theoretically and empirically show that TBGAT possesses linear time complexity concerning the number of jobs and machines, which is essential for practical solvers. 
% Furthermore, we suggest future research directions to expand TBGAT to other tasks, such as learning to pick different neighbourhood structures for variable neighbourhood search heuristics.


\section{Acknowledgement}

Wen Song is supported by the National Natural Science Foundation of China (Grant 62102228) and the Natural Science Foundation of Shandong Province (Grant ZR2021QF063). Zhiguang Cao is supported by the National Research Foundation, Singapore under its AI Singapore Programme (AISG Award No: AISG3-RP-2022-031), and the Singapore Ministry of Education (MOE) Academic Research Fund (AcRF) Tier 1 grant.



% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Learning Topological Representations with Bidirectional Graph Attention Network for Solving Job Shop Scheduling Problem}
\maketitle
\def\thefootnote{$\dagger$}\footnotetext{Corresponding author.}



% This Supplementary Material should be submitted together with the main paper.

\appendix

\section{Proof of Lemma 1}\label{proof_lemma1}
\emph{Proof.} Since $O_{ji}$ is a prerequisite operation of $O_{mk}$, then there exists at least one path from $O_{ji}$ to $O_{mk}$ in the forward view of the disjunctive graph, i.e., $\mathcal{P}_{(O_{ji}, O_{mk})} \neq \emptyset$, where $\mathcal{P}_{(O_{ji}, O_{mk})}$ denotes the set of all paths from $O_{ji}$ to $O_{mk}$. Then, for any path $P(O_{ji}, O_{mk}) \in \mathcal{P}_{(O_{ji}, O_{mk})}$, by the transitivity of the topological sort, we can get that $\overrightarrow{\Phi}(O_{ji}) < \overrightarrow{\Phi}(O_{mk})$. Furthermore, because $\mathcal{P}_{(O_{ji}, O_{mk})} \neq \emptyset$, we have $EST_{ji} < EST_{mk}$ due to the precedence constraints and the processing orders given by the disjunctive arcs, i.e., the operation at the tail of an arc always starts earlier than the one at its head, a relation that propagates from $O_{ji}$ to $O_{mk}$ along the path $P(O_{ji}, O_{mk})$. $\hfill \square$

\section{Proof of Corollary 1}\label{proof_corollary1}
\emph{Proof.} The proof follows the same procedure as that of
Lemma~\ref{lem1}, but with each arc reversed. $\hfill \square$

\section{Proof of Theorem 1}\label{proof_theorem1}
\emph{Proof.} We show that $m^{L_x}_x = 1$ for all nodes $O_x \in \mathcal{O}^*$. First, it is obvious that $m^{L_{\Bar{x}}}_{\Bar{x}} = 1, \forall O_{\Bar{x}} \in \Bar{\mathcal{O}}$ since $m^0_{\Bar{x}} = 1$ and $\mathcal{N}_{\Bar{x}}=\emptyset$. Second, one can prove that $m^{L_{\Tilde{x}}}_{\Tilde{x}} = 1$ by contradiction. Specifically, if $m^{L_{\Tilde{x}}}_{\Tilde{x}} \neq 1$, then there must exist a node $O_{\Bar{x}'} \in \Bar{\mathcal{O}}$ connecting to $O_{\Tilde{x}}$ via a path, which has message $m_{\Bar{x}'} \neq 1$, since $L_{\Tilde{x}}$ is the maximum path length. This contradicts $m^0_{\Bar{x}'} = 1$. Next, if $L_x < L_z$ for any pair of nodes $O_x$ and $O_z$ connected by a path, it is clear that $\Phi(O_x) < \Phi(O_z)$ by the definition of the topological sort. $\hfill \square$

\section{The Entropy Regularized REINFORCE Algorithm}\label{Entropy_regularized_REINFORCE}
\input{algo/Entropy-regularized-REINFORCE}


\colorr{\section{Algorithm Configurations}\label{algorithm_config}}
% Similar to prior research in this field, the training data for our proposed method consists of randomly generated instances of the Job Shop Scheduling Problem (JSSP). To create these instances, we consider a range of problem sizes, spanning from small to large scales. Specifically, we generate JSSP instances for eight different problem sizes, ranging from $6\times6$, $10\times10$, and $15\times10$ for small scales, to $15\times15$, $20\times10$, and $20\times15$ for medium scales, and including $30\times10$ and $30\times15$ for large scales with hundreds of operations. The processing time of each operation is generated uniformly at random from the range of integers between $1$ and $99$. The precedence constraints, which predefined the processing order of different operations for each job, are generated using random permutation.
\colorr{In our work, we generate random instances of JSSP as training data, across various sizes: small ($6\times6$, $10\times10$, $15\times10$), medium ($15\times15$, $20\times10$, $20\times15$), to large ($30\times10$, $30\times15$) scales, involving hundreds of operations. The operation processing times are assigned random values from 1 to 99, and the sequence of operations for each job is determined through random permutations.}

The hyperparameters for our proposed TBGAT model are determined through empirical tuning on the small problem size of $10\times10$. For FEM and BEM, we utilize three layers with each layer consisting of four attention heads and a hidden dimension of 128 for both the input and hidden layers. The number of attention heads serves as the main parameter for our model. Therefore, we conduct an ablation study to investigate the correlation between the number of attention heads and performance (see the study in Section~\ref{sec:head-abstudy}). Regarding the action selection network, $Net_A$ has $L_A=4$ hidden layers, each with half of the dimensions of its parent layer. To ensure the stability of our training process, we normalize all raw features by dividing them by a large number. Specifically, we divide $\overrightarrow{\Phi}_{ji}$ and $\overleftarrow{\Phi}_{ji}$ by the largest sort $\overrightarrow{\Phi}^* = \max_{ji}\overrightarrow{\Phi}_{ji}$ and $\overleftarrow{\Phi}^* = \max_{ji}\overleftarrow{\Phi}_{ji}$, respectively. The processing time $p_{ji}$ is divided by 99, while $EST_{ji}$ and $LST_{ji}$ are both divided by $1000$. Our model is trained for each problem size independently, with $128000$ random instances uniformly distributed into $2000$ batches of size $64$, which are generated on the fly.

% During training, the model is assessed on a fixed validation set of $100$ instances every ten batches, which are generated in advance using the same procedure as the training instances. The model with the best validation performance is saved. We use $EC=1e^{-5}$, $n=10$, and $T=500$ in our entropy-regularized REINFORCE algorithm with Adam optimizer and constant learning rate $lr=1e^{-5}$. Throughout our experiments, actions are sampled from the policy. To ensure a fair comparison with L2S~\cite{zhang2024Deep}, the initial solutions are computed using the same priority dispatching rule $FDD/MWKR$ (minimum ratio of flow due date to most work remaining). Our TBGAT network is implemented in the Pytorch-Geometric (PyG)\cite{fey2019fast} framework. Other parameters follow the default settings in PyTorch\cite{paszke2019pytorch}. We conduct all experiments on a workstation equipped with an AMD Ryzen Threadripper 3960X 24-Core Processor and a single Nvidia RTX A6000 GPU. All the code and data will be made publicly available online.% if the paper is accepted.

\textbf{\colorr{Training phase configurations. }}
\colorr{During training, we} use $EC=10^{-5}$, $n=10$, and $T=500$ in our entropy-regularized REINFORCE algorithm with Adam optimizer and constant learning rate $lr=10^{-5}$. Throughout our experiments, actions are sampled from the policy. To ensure a fair comparison with L2S~\cite{zhang2024Deep}, the initial solutions are computed using the same priority dispatching rule FDD/MWKR (minimum ratio of flow due date to most work remaining). Our TBGAT network is implemented in the PyTorch Geometric (PyG)~\cite{fey2019fast} framework. Other parameters follow the default settings in PyTorch~\cite{paszke2019pytorch}. We conduct all experiments on a workstation equipped with an AMD Ryzen Threadripper 3960X 24-Core Processor and a single Nvidia RTX A6000 GPU.


\textbf{\colorr{Evaluation phase configurations.} }
During testing, we employ the same hyperparameters as in the training phase and load the model with the optimal trained parameters. Furthermore, we evaluate the generalization performance of our model on larger improvement steps (up to 5000) since the effectiveness of improvement heuristics heavily relies on sufficient search depth. Specifically, we train our model with $T=500$ but evaluate its performance on 500, 1000, 2000, and 5000 improvement steps for each problem size, respectively.


\section{Results on synthetic dataset}\label{synthetic_dataset_result}

The experimental results on the synthetic datasets are summarized in Table~\ref{table:1}, where the upper and lower decks display the results for basic testing (500 steps) and step-wise generalization (1000, 2000, and 5000 steps), respectively. From the results, we observe that L2S and TBGAT with 500 improvement steps achieve superior performance compared to L2D, primarily because improvement heuristics generally outperform construction ones in terms of solution quality. Furthermore, TBGAT outperforms L2S \colorr{regarding} solution quality, with consistently better results across all step horizons. In particular, TBGAT with 1000 steps achieves a smaller optimality gap than L2S with 2000 steps across all problem sizes. Moreover, TBGAT-5000 achieves the best overall results among all the compared neural methods. These findings suggest that TBGAT is more effective in learning representations for disjunctive graphs.

\input{table/result-table-syn}


\section{Comparison with more SOTA baselines}\label{moreresults}

\subsection{Comparison with EAS}

\input{table/result-table-EAS}

In this subsection, we present a comparison between TBGAT and EAS~\cite{NEURIPS2021_29539ed9}, a state-of-the-art active search method for solving JSSP. We evaluate the performance of both methods using instances of three different scales, ensuring a fair comparison with EAS. We also compare the three reported versions of EAS, namely EAS-Emb, EAS-Lay, and EAS-Tab. The results are presented in Table~\ref{table:4}. It is evident that TBGAT outperforms EAS with only 500 improvement steps, and TBGAT also has a significant advantage in computational time. This is because EAS is an active search method that requires additional time for fine-tuning on each problem instance, while TBGAT can quickly infer high-quality solutions once trained offline.


\subsection{Comparison with ACL}

\input{table/result-table-ACL}

In this subsection, we compare TBGAT with ACL~\cite{iklassov2022learning}, a curriculum learning method for learning priority dispatching rules that can generalize to different problem sizes, specifically on the Taillard dataset. Table~\ref{table:5} presents the comparison results, which show that TBGAT-500 outperforms ACL by 36.4\% in terms of the average optimality gap. The advantage over ACL widens further when TBGAT-1000 and TBGAT-2000 are considered, demonstrating the effectiveness and robustness of TBGAT in solving JSSP instances of different scales.

\subsection{Comparison with JSSEnv}

\input{table/result-table-JSSEnv}

In this subsection, we further compare our method with JSSEnv~\cite{tassel2021reinforcement}, an online neural construction heuristic based on deep reinforcement learning that is evaluated only on Taillard $30\times20$ instances. As reported in its original paper, JSSEnv requires 600 seconds of solving time for each problem instance. In contrast, our method follows an offline training and online testing fashion, which saves significant time during evaluation. Regarding the performance in Table~\ref{table:6}, TBGAT outperforms JSSEnv on all instances except for `Tai-41'. Importantly, JSSEnv learns to solve each instance online, which may be less efficient in generalizing to new unseen instances than our method, since TBGAT can be directly applied once trained offline. %The results are summarized in Table~\ref{table:6}.

\subsection{Comparison with DGERD}

\input{table/result-table-DGERD}

In this subsection, we continue to compare our method with DGERD~\cite{chen2022deep}, another recent neural construction heuristic for solving JSSP. DGERD employs a GNN to learn the latent representations for constructing solutions from the partial solution represented with disjunctive graphs, which is similar to L2D. In the original experiment, they select several representative instances from the Taillard benchmarks for testing, where each instance is solved 50 times with DGERD, and the average makespan is reported. We evaluate the performance of our method and DGERD on the same instances, and the results are presented in Table~\ref{table:7}. In general, our method with 500 improvement steps achieves a much smaller makespan than DGERD on each instance.


\section{Ablation studies}\label{abstudy}

\begin{figure*}[!ht]
    \centering
    \subfigure[For different \# of attention heads]{\includegraphics[width=0.35\textwidth]{fig/number_of_heads.pdf}\label{fig6a}}
    \subfigure[Learning speed compared with L2S~\cite{zhang2024Deep}.]{\includegraphics[width=.35\textwidth]{fig/learning_speed.pdf}\label{fig6b}}
    \subfigure[Time complexity w.r.t.\ $\mathcal{M}$]{\includegraphics[width=.35\textwidth]{fig/complexity_analysis_j=40.pdf}\label{fig6c}}
    \subfigure[Time complexity w.r.t.\ $\mathcal{J}$]{\includegraphics[width=.35\textwidth]{fig/complexity_analysis_m=10.pdf}\label{fig6d}}
    \caption{Ablation studies of the model architecture.}
    \label{fig6}
\end{figure*}

\subsection{The number of attention heads}
\label{sec:head-abstudy}
The expressiveness of attention-based graph neural networks is known to depend on the number of attention heads. In general, each head learns problem representations independently and emphasizes a different aspect of the problem; these aspects complement one another, resulting in a more comprehensive understanding of the problem. Therefore, it is important to evaluate the impact of the number of heads in our TBGAT model. To this end, we conducted an ablation study, training TBGAT with four and eight attention heads on the $10\times10$ problem size, respectively. From the training curves in Fig.~\ref{fig6a}, we can observe that the TBGAT model with eight heads learns faster than the one with four heads, but the performance after convergence is similar. However, the eight-head TBGAT model appears to suffer from overfitting, as its performance starts to degrade after 100 batches of training.

\subsection{Sampling efficiency against L2S}
To demonstrate the superiority of TBGAT over L2S~\cite{zhang2024Deep} in terms of data efficiency during training, we present the learning curves of the two models in Fig.~\ref{fig6b}. As can be observed, TBGAT learns much faster than L2S, which can be attributed to its more appropriate graph embedding modules and more effective training algorithm (i.e., the entropy-regularized REINFORCE algorithm). This indicates that TBGAT is more sample-efficient, i.e., it requires fewer training instances to reach a good performance level.

\subsection{Verification of linear computational complexity}
\rebuttal{To verify Theorem~\ref{thm2}}, we examine the computational overhead of our method for solving instances of different scales. Fig.~\ref{fig6d} and Fig.~\ref{fig6c} show the curves of computational time against the number of jobs $|\mathcal{J}|$ and machines $|\mathcal{M}|$, respectively. We can observe that our method exhibits roughly linear time complexity with respect to the number of jobs and machines. This finding supports Theorem~\ref{thm2}, which states that the computational complexity of our method scales linearly with the problem size, making it practical for solving JSSP instances of large scales.

\end{document}

