% \begin{algorithm}[H]
% \caption{Entropy-regularized REINFORCE}
% \label{alg:algorithm}
% \textbf{Input}: Policy $\pi_\theta(a_t|s_t)$, step limit $T$, step size $n$, learning rate $\alpha$, training problem size $|\mathcal{J}|\times |\mathcal{M}|$, batch size $B$, total number of training instances $I$.
% \\
% %\textbf{Parameter}: Parameter $\theta$ of $\pi_\theta(a_t|s_t)$\\
% \textbf{Output}: Trained policy.
% \begin{algorithmic}[1] %[1] enables line numbers
% %\STATE $i\gets 0$
% \FOR{$i=0$ to $i<I$}
%     \STATE Randomly generate $B$ instances of size $|\mathcal{J}|\times |\mathcal{M}|$, and compute their initial solutions $\{s_0^1,...,s_0^B\}$ by using the dispatching rule FDD/MWKR;
%     \FOR{$t=0$ to $T$}
%         \FOR{$s_t^b\in\{s_t^1,...,s_t^B\}$}
%             \STATE Initialize a training data buffer $D^b$ with size 0;
%             \STATE Compute a local move $a^t_b\sim\pi_\theta(a_t^b|s_t^b)$;
%             \STATE Update $s_t^b$ w.r.t $a_t^b$ and receive a reward $r(a_t^b, s_t^b)$;
%             \STATE Store the data $(s_t^b,a_t^b,r(a_t^b, s_t^b))$ into $D^b$;
%             \IF{$t$ mod $n$ = 0}
%                 \FOR{$j=t, t+1, \cdots, t+n$}
%                     \STATE $\theta = \theta +lr\nabla_\theta\left(\log\pi_\theta(a_{j}^b|s_{j}^b)\cdot R_{j}^b + \colorb{\mathcal{H}(\pi_{\theta})}\right)$, where $R_{j}^b$ is the return for step $j$, and \colorb{$\mathcal{H}(\pi_{\theta})$} is the entropy of $\pi_{\theta}$;
%                 \ENDFOR
%                 \STATE Clear buffer $D^b$;
%             \ENDIF
%         \ENDFOR
%     \ENDFOR
%     \STATE $i = i + B$;
% \ENDFOR
% \STATE \textbf{return} $\theta$;
% \end{algorithmic}
% \label{algo1}
% \end{algorithm}




% Pseudocode float for the n-step REINFORCE training loop with an entropy bonus.
% Written with the uppercase command set (\STATE, \FOR, \IF, ...). Statements are
% terminated with a literal ";" because "\;" is only a thick space with these
% commands (it is the end-of-line marker of algorithm2e, not of this package).
% NOTE(review): \colorb is defined outside this excerpt -- presumably a colour
% highlight for the entropy term; confirm against the preamble.
\begin{algorithm}[H]
\caption{$n$-step REINFORCE}
\label{alg:algorithm}
\textbf{Input}: Policy $\pi_\theta(a_t|s_t)$, step limit $T$, step size $n$, learning rate $\alpha$, training problem size $|\mathcal{J}|\times |\mathcal{M}|$, batch size $B$, total number of training instances $I$
\\
%\textbf{Parameter}: Parameter $\theta$ of $\pi_\theta(a_t|s_t)$\\
\textbf{Output}: Trained policy $\pi_{\theta^*}(a_t|s_t)$
\begin{algorithmic}[1] %[1] enables line numbers
%\STATE $i\gets 0$
\FOR{$i=0$ to $i<I$}
    \STATE Randomly generate $B$ instances of size $|\mathcal{J}|\times |\mathcal{M}|$, and compute their initial solutions $\{s_0^1,\ldots,s_0^B\}$ by using the dispatching rule FDD/MWKR;
    \STATE Initialize a training data buffer $D^b$ with size 0 for each $s_0^b \in \{s_0^1,\ldots,s_0^B\}$;
    \FOR{$t=0$ to $T$}
        \FOR{$s_t^b\in\{s_t^1,\ldots,s_t^B\}$}
            \STATE Compute a local move $a^b_t\sim\pi_\theta(a_t^b|s_t^b)$;
            \STATE Update $s_t^b$ w.r.t.\ $a_t^b$ and receive a reward $r(a_t^b, s_t^b)$;
            % \STATE Store the data $(s_t^b,a_t^b,r(a_t^b, s_t^b))$ into $D^b$;
            % Update only once n transitions have been collected: testing (t+1) instead of t
            % keeps the first update from firing at t=0, where steps t-j would be negative.
            \IF{$(t+1) \bmod n = 0$}
                \STATE $\mathit{loss}^b_{\theta} = 0$;
                % j runs over the n most recent steps t-n+1, ..., t (no overlap between updates).
                \FOR{$j=n-1$ to $0$}
                    \STATE $\mathit{loss}^b_{\theta} = \mathit{loss}^b_{\theta} - \left(\log\pi_\theta(a_{t-j}^b|s_{t-j}^b)\cdot R_{t-j}^b + \colorb{\mathcal{H}(\pi_\theta(a_{t-j}^b|s_{t-j}^b))}\right)$, where $R_{t-j}^b$ is the return for step $t-j$, and \colorb{$\mathcal{H}(\cdot)$} is the entropy;
                \ENDFOR
                % The loss is the negated objective, so the parameters move against its gradient.
                \STATE $\theta = \theta -\alpha\nabla_\theta\left(\mathit{loss}^b_{\theta}\right)$;
                % \STATE Clear buffer $D^b$;
            \ENDIF
        \ENDFOR
    \ENDFOR
    \STATE $i = i + B$;
\ENDFOR
\STATE \textbf{return} $\pi_{\theta}(a_t|s_t)$;
\end{algorithmic}
\label{algo1}
\end{algorithm}