%\def\year{2022}\relax
%File: formatting-instructions-latex-2022.tex
%release 2022.1
\documentclass[accepted]{uai2022}% for initial submission
%\usepackage{aaai22}  % DO NOT CHANGE THIS
%\usepackage{times}  % DO NOT CHANGE THIS
%\usepackage{helvet}  % DO NOT CHANGE THIS
%\usepackage{courier}  % DO NOT CHANGE THIS
%\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
%\usepackage{graphicx} % DO NOT CHANGE THIS
%\urlstyle{rm} % DO NOT CHANGE THIS
%\def\UrlFont{\rm}  % DO NOT CHANGE THIS
%\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT

\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
%\usepackage{tikz} % nice language for creating drawings and diagrams
%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>: the two mandatory arguments
% in reverse order, joined by the optional separator (default "-").
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
%\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
%\frenchspacing  % DO NOT CHANGE THIS
%\setlength{\pdfpagewidth}{8.5in}  % DO NOT CHANGE THIS
%\setlength{\pdfpageheight}{11in}  % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
\usepackage{algorithm}
\usepackage{algorithmic}

\usepackage{newfile}
\usepackage{xr}
\usepackage{tcolorbox}
\usepackage{color}
\usepackage{ifthen}
\usepackage{amsfonts}
\usepackage{amsmath}
%\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{subcaption}
\usepackage{array}
\usepackage{ctable}
\usepackage{comment}


% Theorem-like environments. None of the declarations passes a shared-counter
% or "numbered within" optional argument, so each environment keeps its own
% independent counter (Theorem 1, Lemma 1, Assumption 1, ...).
\newtheorem{theorem}{Theorem}
\newtheorem{ass}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\newtheorem{condition}{Condition}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
% \bc is a colour *declaration* (not a command with an argument): it switches
% the text colour to blue until the end of the enclosing group.
\newcommand{\bc}{\color{blue}}


%\usepackage{microtype}
%\usepackage{graphicx}
%\usepackage{booktabs} % for professional tables
%\usepackage{hyperref}       % hyperlinks

 % Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}

%\usepackage{multirow}



%\usepackage{amsmath,amssymb,amsfonts}
%\usepackage{algorithm}
%\usepackage{algorithmic}
%\usepackage{xcolor}
%\usepackage{mathtools}

%\usepackage{caption}
%\usepackage{subcaption}
\usepackage{comment}
%\usepackage{amsthm}

%\usepackage{verbatim}
%\usepackage{xr}
%\usepackage{xr-hyper}
%\usepackage{xcite}
%\externaldocument{UAI2022NPH}
%\externalcitedocument{UAI2022NPH}
%\externalcitedocument{NPreferences}
%\input{counters}
%\addtocounter{equation}{0}
%\input{figcounters}
%\addtocounter{figure}{0}


% ---- Notation shorthands ---------------------------------------------------
% All of these are argument-free aliases; the starred form is used because
% none of them ever needs to accept a paragraph break.

% Angle brackets for inner products: \< x, y \>.
% (\> already exists in the kernel, hence the \renewcommand.)
\newcommand*{\<}{\langle}
\renewcommand*{\>}{\rangle}

% Font switches; each expands to the font command and takes the following
% group as its argument, e.g. \bs{\theta}, \mc{S}.
\newcommand*{\bs}{\boldsymbol}
\newcommand*{\mm}{\mathrm}
\newcommand*{\bm}{\mathbf}
\newcommand*{\mc}{\mathcal}
\newcommand*{\mcal}{\mathcal}

% Numbered display equation delimiters.
\newcommand*{\beq}{\begin{equation}}
\newcommand*{\eeq}{\end{equation}}

% Calligraphic letters.
\newcommand*{\cS}{\mathcal{S}}
\newcommand*{\cA}{\mathcal{A}}
\newcommand*{\cF}{\mathcal{F}}
\newcommand*{\cX}{\mathcal{X}}
\newcommand*{\cL}{\mathcal{L}}
\newcommand*{\cM}{\mathcal{M}}
\newcommand*{\cN}{\mathcal{N}}
\newcommand*{\cJ}{\mathcal{J}}
\newcommand*{\cG}{\mathcal{G}}

% Blackboard-bold letters.
\newcommand*{\bE}{\mathbb{E}}
\newcommand*{\bR}{\mathbb{R}}

\usepackage{color}
\usepackage{ifthen}
% ---- Draft-time annotations ------------------------------------------------
\definecolor{darkgreen}{rgb}{0,0.5,0}

% Master switch for the per-author notes below; set to false to hide them.
\newboolean{showcomments}
\setboolean{showcomments}{true}

% \annotatedby{<name>}{<text>}: red inline note "(<name> says: <text>)",
% printed only while showcomments is true; expands to nothing otherwise.
\newcommand{\annotatedby}[2]{\ifthenelse{\boolean{showcomments}}{\textcolor{red}{(#1 says: #2)}}{}}

% One wrapper per author, all sharing the helper above.
\newcommand{\desmond}[1]{\annotatedby{Desmond}{#1}}
\newcommand{\thien}[1]{\annotatedby{Thien}{#1}}
\newcommand{\shiau}[1]{\annotatedby{SH}{#1}}
\newcommand{\laura}[1]{\annotatedby{Laura}{#1}}

% Placeholders for missing material. These do not consult showcomments and
% are therefore always printed.
\newcommand{\addcite}{{\textcolor{darkgreen}{(add citation(s))}}}
\newcommand{\addcites}{{\textcolor{darkgreen}{(add citation(s))}}}
\newcommand{\addref}{{\textcolor{darkgreen}{(add ref)}}}
\newcommand{\todo}[1]{{\textcolor{blue}{(TODO: #1)}}}

% \squishlist ... \squishend: a compact bulleted list with tightened vertical
% spacing and a small left margin. Use \item between the two commands as in
% any list environment.
%
% Every line inside the definitions ends with % so that the line breaks do not
% turn into space tokens. Without this, calling \squishlist in the middle of a
% paragraph would inject several stray inter-word spaces before the list
% starts (one per \setlength line).
\newcommand{\squishlist}{%
  \begin{list}{$\bullet$}{%
    \setlength{\itemsep}{0pt}%
    \setlength{\parsep}{3pt}%
    \setlength{\topsep}{3pt}%
    \setlength{\partopsep}{0pt}%
    \setlength{\leftmargin}{1.5em}%
    \setlength{\labelwidth}{1em}%
    \setlength{\labelsep}{0.5em}%
  }%
}

\newcommand{\squishend}{%
  \end{list}%
}

\title{Neural-Progressive Hedging: Enforcing Constraints in Reinforcement Learning with Stochastic Programming (Supplementary Materials)}


\author[1]{Supriyo Ghosh}
\author[2]{Laura Wynter}
\author[2]{Shiau Hong Lim}
\author[3]{Duc Thien Nguyen}
% Add affiliations after the authors
\affil[1]{%
    Microsoft Research\\
    Bangalore, India
}
\affil[2]{%
    IBM Research AI\\
    Singapore
}
\affil[3]{%
    Singapore Management University\\
    Singapore
  }

\begin{document}

\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{comment}
\section{Theoretical Results}


\begin{ass}[Imitation Learning and Warm Start]
Let $\kappa^i\rightarrow 0$ as $i\rightarrow\infty$. Furthermore, there exists an $\hat{\imath}$ such that for  all $i\geq \hat{\imath}$, $\kappa^{i}=0$. 
\label{ass:warm}
\end{ass}

\begin{ass}[Existence and local convexity] Assume that the  solution set of  equation \eqref{eqn_step_sub_cvar} for a CVaR objective, or equation \eqref{eqn_step_sub} otherwise, is nonempty and finite, $\cG(\xi)$ is convex and compact, the gradients of $\tilde{f}$ are locally Lipschitz for each $\xi$ and that the dual penalty parameters $\nu^i$ are sufficiently large for all $i$.
\label{exist}
\end{ass}

 \begin{lemma}
 Under Assumption \ref{ass:warm}, the NP algorithm is equivalent to the progressive hedging algorithm over  an infinite number of iterations. \label{lemma1}
 \end{lemma}
{\bf Proof:}
     Assumption \ref{ass:warm} states that there exists a finite iterate $\hat{\imath}$ such that for all $i\geq\hat{\imath}$, $\kappa^i=0$. Since
      $x^{i+1}(\cdot) = \kappa^i  x^\pi(\cdot) + (1-\kappa^i)  P_{\cM}[\hat{x}^i(\cdot)]$,  for all $i'\geq\hat{\imath}$, $x^{i'}(\cdot) =  P_{\cM}[\hat{x}^{i'}(\cdot)]$, and hence the update of the primal variable  of the algorithm reduces to the progressive hedging update; the result follows.
$\blacksquare$

By Lemma \ref{lemma1} we can consider the infinite sequence of iterates from $\hat{\imath}\rightarrow\infty$ produced by the NP algorithm as an infinite sequence of iterates from a progressive hedging algorithm.

Instances of stochastic programming typically make use of discretized support $\Xi$. We thus define the problem~\eqref{eqn_problem_global} from the main text  in terms of a discrete $\Xi$ and refer to this problem for the remainder of this section.

\begin{ass}[Discrete support]
Let   $\Xi$  be a discrete support and let $1\ldots K$ index each scenario corresponding to a random variable $\xi\in\Xi$, with probability $p_k=1/K$. Then, problem~\eqref{eqn_problem_global} from the main text can  be expressed as: 
\beq
\min_{x_k\in{\cal G}_k; x_k\in {\cal M}} \frac{1}{K}\sum_{k=1\ldots K} \tilde{f}_{k}(x_k). \nonumber
\label{eq:discproblem}
\eeq
\label{ass:discrete}
\end{ass}





Note  the dimensions $x\in\Re_+^{KDw}$ and $y\in\Re_+^{K(D-1)w}$ where $D$ is the depth of the discrete scenario tree and $w$ is the dimension of each action given a scenario and a time stage on the scenario tree. %In this case, we suppose that the vector $y$ above is padded $[y \; 0]^T$ and similarly the 2x2 block matrix $[B \; 0\;  ; 0 \;  0]$ in which case $B$ is itself of full rank. On the other hand, $A_k$ will not be of full rank as  one block of $w$ rows  will be 0 since measurability is not enforced on the scenario tree leaves.


\begin{theorem}[Convergence of Alg. 1 for Convex  $\tilde{f}$ ] 
Under Assumptions 1 ,2 and 3 along with the convexity of $\tilde{f}$,  the sequence of iterates  $(x^i(\cdot),y^i(\cdot), \lambda^i (\cdot), u^i (\cdot))$   generated by the neural-progressive hedging algorithm  is such that 
\beq
\begin{aligned}
&\|x^{i+1} - x^i \|^2 + \| y^{i+1}- y^i \|^2 + (1/\nu^2) \| \lambda^{i+1} - \lambda^i \|^2 + \nonumber \\
&  (1/\nu^2) \|u^{i+1} - u^i\|^2  <  \|x^i - x^{i-1}  \|^2 + \|y^i- y^{i-1} \|^2 + \nonumber\\
&  (1/\nu^2) \| \lambda^i - \lambda^{i-1 }\|^2 + (1/\nu^2) \|u^i - u^{i-1}\|^2,
  \end{aligned}
 \eeq
 and
 \beq
\begin{aligned}
& \|x^{i+1} - x^*\|^2 + \|y^{i+1}- y^* \|^2 + (1/\nu^2) \| \lambda^{i+1} - \lambda^* \|^2 + \nonumber \\
& (1/\nu^2)  \|u^{i+1} - u^*\|^2    <  \|x^i - x^*\|^2 + \|y^i- y^*\|^2 \nonumber \\
& + (1/\nu^2) \| \lambda^i - \lambda^* \|^2 + (1/\nu^2)  \|u^i - u^*\|^2
 \end{aligned}
 \eeq
 with equality at $(x^*(\cdot), y^*)$ in the case of finite convergence, and thus converges to a  local solution  $(x^*(\cdot), y^*)$ with $(\lambda^*(\cdot), u^*(\cdot))$ as $i\rightarrow\infty$.   
 \label{conv}
\end{theorem}
{\bf Proof:}
    From Lemma 1, Algorithm 1 is equivalent to the Progressive Hedging Algorithm of \cite{Rockafellar2018ProgressiveDO} when run for an infinite number of iterations. The convergence of the Progressive Hedging Algorithm to a solution $(x^*(\cdot), y^*(\cdot))$ is thus guaranteed under  Assumptions 2 and 3 along with the convexity of $\tilde{f}$. 
$\blacksquare$

\begin{theorem}[Convergence of Alg. 1 for Nonconvex  $\tilde{f}$] 
Let Assumptions 1, 2 and 3 hold and let $(x^i(\cdot),y^i(\cdot))$ be a  locally optimal solution to each subproblem \eqref{eqn_step_sub_cvar}. If  sequences $\{x^i,y^i,\lambda^i,u^i\}$ converge to point $\{x^*,y^*,\lambda^*,u^*\}$, then $(x^*(\cdot),y^*(\cdot))$ generated by the neural-progressive hedging algorithm  is a locally optimal solution  to \eqref{eqn_problem_global}.
\end{theorem}
{\bf Proof:}        
From Lemma 1,  Algorithm 1 is equivalent to the Progressive Hedging Algorithm of \cite{Rockafellar2018ProgressiveDO} when run for an infinite number of iterations. For nonconvex $\tilde{f}$, when the Progressive Hedging Algorithm converges to a point, under Assumptions 2 and 3, it was shown in \cite{rockafellar1991scenarios} that the point is   a stationary point of the problem (3).
$\blacksquare$


\begin{proposition}
	Let $ \tilde{f}$ be Lipschitz  $\forall \xi$, i.e., $ \|\tilde{f}(x(\xi),\xi) - \tilde{f}(x'(\xi),\xi) \|\le L\|x(\xi) - x'(\xi)\| $. We have the following bound as a function of $\kappa^i$ and  Lipschitz constant $L$:
\beq
\begin{aligned}
	& \bE [ \tilde{f}(x^{i + 1} (\cdot), \cdot) ] \leq \bE[ \tilde{f}(x^\pi (\cdot), \cdot) ] + L(1-\kappa^i) \cdot \nonumber \hspace{0.2in} \\
	& \hspace{1.1in}  \|  P_{\cM}(\hat{x}^i(\cdot)) - x^\pi(\cdot)\|.\label{eq:SP_bound}
\end{aligned} 
\eeq
\end{proposition}
{\bf Proof:}  
	For each scenario $ \xi $, we have 
	\begin{flalign}
		&\tilde{f}(x^{i + 1} (\xi), \xi) - \tilde{f}(x^\pi (\xi), \xi) \le L\|x^{i + 1} (\xi) - x^\pi (\xi)\| \hspace{0.5in}\nonumber \\ 		
		 \le& L \| \kappa^i    x^\pi(\xi) + (1-\kappa^i) \cdot 
		P_{\cM}(\hat{x}^i(\xi)) - x^\pi(\xi)\| \hspace{0.2in}\nonumber \\ 
		\le& L(1-\kappa^i) \|  P_{\cM}(\hat{x}^i(\xi)) - x^\pi(\xi)\| \blacksquare \nonumber
	\end{flalign}

\end{comment}

% \section*{Experiment settings}
% We perform all the experiments on Ubuntu 18.04 virtual machines with 32-core CPU, 64 GB of RAM, and a single Nvidia Tesla P100 GPU. The distributed Ray framework and RLlib \citep{liang2017ray} were used for the DDPG method. The pure stochastic program (SP) and neural-progressive hedging (NP) methods with linear and non-linear objective function are solved using IBM ILOG CPLEX 12.9 and IPOPT \citep{wachter2006implementation}, respectively. The constrained policy optimization (CPO) \citep{CPO} and proximal policy optimization with a Lagrangian penalty (PPO-L) \citep{PPOL} methods are solved using OpenAI safe RL implementation \citep{PPOL}.

% The unconstrained RL policy used as an expert is computed at each time step $t$ using  the DDPG algorithm \citep{DDPG}.
% We use a recurrent neural network (RNN) architecture for training the DDPG method with 1 hidden layer consisting of 25 hidden predictor nodes and a tanh nonlinear activation function. In addition, a long short-term memory (LSTM)  model is used to represent the RNN architecture with LSTM cell size 256 and maximum sequence length of 20.
% Parameter values are as follows: the discounting factor $\gamma = 0.99$, minibatch size $b = 50$ and learning rate $lr=3e^{-5}$.  
% For both constrained RL methods (i.e., CPO and PPO-L), we use a neural network with 2 hidden layers, each consisting of 256 hidden nodes with \emph{tanh} nonlinear activation function.
% The source codes for the constrained benchmark algorithms can be found at: https://github.com/openai/safety-starter-agents. 

% A discretized scenario tree is used in each decision epoch to solve the pure SP and NP methods for the experiments. For the financial planning example, in each decision period $t$, we generate a two layer scenario tree where the first layer consists of a root node and  the second layer  includes 1000 nodes, giving rise to 1000 scenarios. The interest rates for each of the scenarios are sampled from a multi-dimensional log normal distribution whose mean and covariance matrix are estimated from the training data set of price movements in the S\&P500.
% For the  liquidity constraints, we sample 10 liquidity demand processes from a Gaussian distribution with $\mu=0.025$ and $\sigma=0.01$, giving rise to 10,000 scenarios in the second layer of the scenario tree. 
% For the bike sharing problem, due to its complex non-linear objective function, we generate a two-layer tree with 200 scenarios, where the first layer consists of a root node and  the second layer  includes 200 nodes. The demand values at stations for each of the scenarios are sampled from a multi-variate normal distribution whose mean and covariance matrix are learnt from 60 days of training demand data.% \citep{ghosh2019improving}. 


\section*{Additional Numerical Results}


Figure~\ref{fig:results7} compares the warm-start (called NP-WS) version with the damped-guidance, or imitation-learning-type  expert guidance (called NP). Both versions perform far better than the RL policy. 


\begin{figure}[!htb]
	\centering
		\includegraphics[width=.45\textwidth]{Figures/plot_warm_start_comparison.pdf}
	\caption{ Performance comparison of two versions of the algorithm: warm start ($\kappa^1=1$,  $\hat{\imath}=1$) vs. imitation learning ($\kappa^i=(1+i)^{-2} $, $\hat{\imath}=20$, in this example).}
	\label{fig:results7}
\end{figure}

\begin{figure*}[!htb]
	\centering
	\begin{subfigure}{0.37\textwidth}
		\includegraphics[width=\textwidth]{Figures/averageEpCost_CPO.pdf} \caption{}
	\end{subfigure}  \hskip 1cm
	\begin{subfigure}{0.37\textwidth}
		\includegraphics[width=\textwidth]{Figures/averageEpCostPPO_Lagrangian.pdf} \caption{}
	\end{subfigure} 
	\caption{ Episodic (episode length = 30) constraint violation cost during training for (a) CPO; and (b) PPO-Lagrangian.}
	\label{fig:results8}
\end{figure*}



% \begin{table*}[!htb]
% \small
% \begin{center}
% \begin{tabular}{>{}m{1.5cm}  >{\centering}m{1.0cm} >{\centering}m{1.2cm} >{\centering}m{1.2cm} >{\centering}m{1.0cm} >{\centering}m{1.0cm} >{\centering}m{1.2cm} >{\centering}m{1.2cm} c}
% \hline
% \multicolumn{1}{>{\centering}m{1.5cm}|}{} & \multicolumn{4}{>{}m{4.4cm}|}{First 30 days, annualized values} & \multicolumn{4}{>{\centering}m{4.4cm}}{Second 30 days, annualized values} \\
% \hline
% \multicolumn{1}{>{\centering}m{1.5cm}|} {Algorithms} &  {\footnotesize Returns  } & {\footnotesize Sharpe} & {\footnotesize Volatility} & \multicolumn{1}{>{\centering}m{1.0cm}|} {\footnotesize MDD} &  {\footnotesize Returns} & {\footnotesize Sharpe} & {\footnotesize Volatility} & {\footnotesize MDD} \\
% \specialrule{.1em}{.05em}{.05em} 
% {\small SP-0.0} & 11.84 & 3.58 & 27.33 & 7.63 & 2.3 & 1.16 & 17.8 & 4.8  \\
% {\small SP-0.95} &  11.83 & 3.58 & 27.37 & 7.65 & 0.87 & 0.51 & 17.32 & 4.85 \\
% {\small SP-0.99} &  0.0 & -0.54 & \textbf{0.0} & \textbf{ 0.0} & 0.0 & -3.78 & \textbf{0.0} & \textbf{0.0} \\
% {\small \textbf {NP-0.0 }} &  \textbf{22.47} & { 4.44} & 40.29 & 10.46 & \textbf{7.44} & \textbf{2.22} & 29.1 & 7.41 \\
% {\small \textbf{NP-0.95}} & \textbf{ 22.47} & {4.44} & 40.29 & 10.46 & \textbf{7.44} & \textbf{2.22} & 29.1 & 7.41 \\
% {\small NP-0.99} &  21.64 & 4.29 & 40.38 & 10.44 & 7.4 & 2.09 & 30.96 & 7.99 \\
% {\small DDPG} & 19.33 & 4.36 & 35.63 & 9.52 & 6.08 & 2.12 & 24.86 & 5.93\\
% {\small uCRP} &  12.08 & \textbf{5.68} & 17.16 & 5.26 & 1.38 & 0.97 & 12.5 & 3.77 \\
% {\small OLMAR} &  10.4 & 4.65 & 18.26 & 5.97 & -4.17 & -2.69 & 12.98 & 3.54 \\
% {\small PAMR} & 6.35 & 2.45 & 22.08 & 6.03 & -8.02 & -3.39 & 20.1 & 5.19 \\
% {\small RMR} &  10.68 & 4.72 & 18.45 & 5.97 & -4.56 & -2.82 & 13.58 & 3.83 \\
% \specialrule{.1em}{.05em}{.05em} 
% \end{tabular}
% \end{center}
% \caption{Performance metrics without liquidity constraints: stochastic programming (SP) with CVaR $\alpha=0,0.95,0.99, $  the proposed neural-progressive hedging method (NP) with CVaR $\alpha=0,0.95,0.99, $  DDPG, and  trading  baselines: uCRP (uCRP), OLMAR, PAMR, and RMR. The SP with $\alpha=0.99$ puts all funds in cash, hence Max. Daily Drawdown (MDD) and volatility are both 0.}
% \label{table:metrics2}
% \end{table*}



Figure~\ref{fig:results8} illustrates the episodic constraint violation cost for two benchmark constrained RL algorithms, CPO of \citet{CPO}, and PPO-Lagrangian of \citet{PPOL}. Each episode duration is 30 time steps and in each time step $t$, we enforce a cost of 1 if the amount available in the liquid instrument is less than the cumulative account payable up through time $t$. Observe that  CPO  fails to learn the constraints  during training. The PPO-Lagrangian method is able to bring down the episodic cost to 0 during training (the limit of the episodic cost is set to 0), but as shown in the main paper (see Figure 2(c)), the learned PPO-L policy is not able to  satisfy the constraints during execution. 

% Table \ref{table:metrics2}     compares the returns of the neural-progressive hedging algorithm with those from standard trading strategies. Specifically, we show the  best performing online portfolio selection algorithms as benchmarks: (i) A uniform constant rebalancing portfolio (uCRP) approach \cite{cover2011universal}; (ii) Online moving average reversion (OLMAR)  \cite{li2012line} (iii) Passive-aggressive mean reversion (PAMR) \cite{PAMR} and (iv) Robust median reversion (RMR)  \cite{huang2016robust}. We use a grid search to optimize the two key hyper-parameters of these universal portfolio algorithms: namely the lookback window $w$ and threshold parameter $\epsilon$. The source codes for the online portfolio selection algorithms can be found at https://github.com/Marigold/universal-portfolios. 




\section*{Implementation Details}

\paragraph{Discretization of the stochastic program scenario tree}

Consider a finite scenario tree formulation of a stochastic programming problem, such that the set of nodes in the scenario tree at time stage $t$ are denoted $N_t$. 
A node denotes a point in time when a realisation of the random process becomes known and a  decision  is taken. Each node  replicates the data of the optimization problem,  conditioned on the probability of visiting that node from its parent node. A path from the root to each leaf node is referred to as a scenario; its  probability of occurrence,  $p_s$, is the product of the conditional probabilities of visiting each of the nodes on that scenario path.
The discretized model-based   stochastic program is thus:
\beq
 \max   \sum_{s=1\ldots S} F_s(x, \xi ) := \sum_{s=1\ldots  S} p_s  \sum_{t=1\ldots T} f_t(x_s(t)).
 \label{eq:SPobj2}
 \eeq 
The \textit{non-anticipativity} constraints are critical for the implementability of the policy but they couple the scenario sub-problems by requiring that the action $x_t$ at  time $t$ is the same across scenarios (i.e., sample paths) sharing the sample path up to and including time $t$. For each $\xi\in\Xi$, these coupling constraints are expressed as:
\beq
x(\xi) = ( x_1, x_2(\xi_1), x_3 (\xi_1,\xi_2), \ldots, x_T (\xi_1, \ldots, \xi_{T-1}) ).
\label{eq:meas}
\eeq 
Using the discretized formulation of \eqref{eq:SPobj2}, and
following \citet{rosa1996augmented} we can rewrite \eqref{eq:meas}  in a manner that  facilitates  relaxation of those constraints: Define the last common stage of two scenarios $s_1$ and $s_2$ as 
\beq
t^{\max}(s_1,s_2):= \max\{ \hat{t}: s_{1}(t) = s_{2}(t), t=1,\ldots \hat{t} \},
\eeq
and then re-order the scenarios $s=1\ldots S$, so that at every $s$, the scenario $s+1$ has the largest common stage with scenario $s$ among all scenarios $s' > s$, that is $t^{\max}(s,s+1) = \max\{ t^{\max}(s,s') : s'>s\}$. Then, define the sibling of scenario $s$ at time stage $t$ as a permutation $\nu(s,t):= s+1$ if $t^{\max}(s,s+1)\geq t$ and $\nu(s,t):= \min\{s' : t^{\max}(s,s')\geq t \}$ otherwise. The inverse permutation shall be  denoted $\nu^{-1}(s,t).$
Note that the sibling of a scenario depends upon the time stage, and that a scenario with no shared decisions at a time stage has by definition itself as sibling. 
Using the above, \citet{rosa1996augmented}  re-define the constraints enforcing measurability in terms of the sibling function
as follows:
\beq
x_s(t) = x_{\nu(s,t)}(t) \;\; \forall (s,t), \; s \ne \nu(s,t).
\label{eq:measurability2}
\eeq
Equation \eqref{eq:measurability2} is convenient in the primal-dual formulation in terms of discrete scenarios, presented next.
We are interested in maintaining the separability of the subproblems which depend only on individual scenarios of the random variable to facilitate handling large problems via scenario-based decomposition. To do so, we relax the constraints using the following formulation
\beq
\cM := \{x:    M_1 x_1 (\xi) + \ldots + M_Sx_S  (\xi)= 0\},
\label{eq:measurability3}
\eeq
where the matrices in \eqref{eq:measurability3}  are defined so that each $M_s$ is a matrix of $-1$, $0$ and $1$ such that at the root node $x_{11}=x_{12},\; x_{12}=x_{13},\; \ldots,\; x_{1,S-1}=x_{1,S}$, at the stage $t=2$, there are as many such sets of equalities as children nodes emanating from the root node, and so on up to stage $T-1$.  At stage $T$, all nodes are leaves and no such linking constraints are required. The projection of a point $x^i$ onto the subspace $\cM$, $P_{\cM}[x^i(\cdot)]$  can be computed by taking the conditional expectation of $x^i$, $E_{\xi \, | \, \xi_1, \ldots \xi_{i-1}}$.
Lagrange relaxation of the measurability constraints \eqref{eq:measurability2} gives rise to the following Lagrange function, in terms of the discrete scenarios $s=1\ldots S$:
\begin{align}
\cL(x,\lambda) = \sum_{s=1\ldots S}  p_s \sum_{t=1\ldots T} f_t(x_s(t)) + \hspace{0.7in}\nonumber \\ 
\sum_{s=1\ldots S}  \sum_{t=1\ldots T-1}  \lambda_s(t) (x_s(t) - x_{\nu(s,t)}(t)).
\label{eq:primal_dual}
\end{align}
%As before we  gather the remaining two sets of constraints \eqref{eq:withinstage} and \eqref{eq:dynamics} in  constraint subset $G'$.
The scenario subproblems are re-defined as a function of the inverse permutation of the sibling function:
\begin{align}
\min_{x_s \in G'_s} \cL_s(x_s,\lambda_s) = p_s \sum_{t=1\ldots T} f_t(x_s(t)) + \hspace{0.5in} \nonumber\\
\sum_{t=1\ldots T-1} ( \lambda_s(t) - \lambda_{\nu^{-1}(s,t)}(t) ) x_s(t)
\label{eq:lag1}
\end{align}
for each $s=1\ldots S$.
The dual problem is given by
\beq
\max_{\lambda} D(\lambda) := \min_{x \in  G' } \cL (x,\lambda).
\label{eq:dual1}
\eeq

%\paragraph{Penalty parameters in the Neural-progressive hedging algorithm}
It is possible to further speed up convergence of our NP algorithm in practice using the approach of \citet{zehtabian2016penalty}. This approach monitors the primal and dual gap terms in convergence criteria separately to update the penalty parameters so as to reduce the convergence gap quickly.% Specifically,  the idea involves  not increasing  the penalty parameter when the second term is large. When the second term is small but the first term is large,     the penalty parameter is increased slightly, and when both terms are relatively small,  the penalty parameter is increased more significantly to force faster dual convergence.



\begin{comment}
\paragraph{Experiment settings}
We perform all the experiments on Ubuntu 16.04 virtual machines with 32-core CPU, 64 GB of RAM, and a single Nvidia Tesla P100 GPU. The distributed Ray framework and RLlib \cite{liang2017ray} were used for the DDPG method. The pure SP and NP methods with linear and non-linear objective function are solved using IBM ILOG CPLEX 12.9 and IPOPT \cite{wachter2006implementation}, respectively. The CPO and PPO-L methods are solved using OpenAI safe RL implementation \cite{PPOL}.

A discretized scenario tree as described above is used in the experiments. In each decision period $t$ of the neural-progressive hedging algorithm implementation on the financial planning example, we generate a two layer scenario tree where the first layer consists of a root node and  the second layer  includes 1000 nodes, giving rise to 1000 scenarios. The interest rates for each of the scenarios are sampled from a multi-dimensional log normal distribution whose mean and covariance matrix are estimated from the training data set of price movements in the S\&P500.
For the  liquidity constraints, we sample 10 liquidity demand processes from a Gaussian distribution with  $\mu=0.025$ and $\sigma=0.01$, giving  rise to 10,000 scenarios in the second layer of the scenario tree. 
For the bike sharing problem, due to its complex non-linear objective function, we generate a two-layer tree with 200 scenarios, where the first layer consists of a root node and  the second layer  includes 200 nodes. The demand values at stations for each of the scenarios are sampled from a multi-variate normal distribution whose mean and covariance matrix are learnt from 60 days of training demand data.

The unconstrained RL policy used as an expert is computed at each time step $t$ using  the DDPG algorithm \cite{DDPG}.
We use a recurrent neural network (RNN) architecture for training the DDPG method with 1 hidden layer consisting of 25 hidden predictor nodes and a tanh nonlinear activation function. In addition, a long short-term memory (LSTM)  model is used to represent the RNN architecture with LSTM cell size 256 and maximum sequence length of 20.
Parameter values are as follows: the discounting factor $\gamma = 0.99$, minibatch size $b = 50$ and learning rate $lr=3e^{-5}$.  
Two state-of-the-art methods are used to compare with the constrained policy: (i) Constrained policy optimization \cite{CPO}; and (ii) Proximal policy optimization with a Lagrangian penalty \cite{PPOL}. For both we use a neural network with 2 hidden layers, each consisting of 256 hidden nodes with tanh nonlinear activation function.
The source codes for the constrained benchmark algorithms can be found at https://github.com/openai/safety-starter-agents. 
\end{comment}

\bibliography{NPreferences}
%\bibliographystyle{aaai22}
\end{document}

