%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%packages are manually added
\usepackage{times}
\usepackage{soul}
\usepackage{url}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{wrapfig}
\usepackage{amssymb, amsfonts}
\usepackage{subcaption}
\usepackage{multirow}
\usepackage{xcolor}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning to Reason about Contextual Knowledge \\
for Planning under Uncertainty}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Cheng Cui}
\author[1]{Saeid Amiri}
\author[1]{Yan Ding}
\author[1]{Xingyue Zhan}
\author[1]{Shiqi Zhang}

% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    SUNY Binghamton\\
    Binghamton, New York, USA
}
\affil[2]{%
    Cognex Corporation\\
    Natick, Massachusetts, USA
}
\affil[ ]{\texttt{\{ccui7, samiri1, yding25, xzhan215,zhangs\}@binghamton.edu}}

  \begin{document}
\maketitle

\begin{abstract}
Sequential decision-making (SDM) methods enable AI agents to compute an action policy toward achieving long-term goals under uncertainty. 
Existing research has shown that contextual knowledge in declarative forms can be used for improving the performance of SDM methods. 
However, the contextual knowledge from people tends to be incomplete and sometimes inaccurate, which greatly limits the applicability of knowledge-based SDM methods. 
In this paper, we develop a novel algorithm for knowledge-based SDM, called PERIL, that learns from interaction experience to reason about contextual knowledge, as applied to urban driving scenarios. 
Experiments have been conducted using CARLA, a widely used autonomous driving simulator. 
Results demonstrate PERIL's superiority in comparison to existing knowledge-based SDM baselines.
\end{abstract}

\section{Introduction}\label{sec:intro}
% Intelligent agents are increasingly present in our everyday life, including those embodied agents (robots) that interact with the real world. 
Artificial intelligence agents need to estimate the current world state while determining what to do based on the current state estimation, resulting in the problem of sequential decision-making (SDM) under partial observability~\citep{kaelbling1998planning,hausknecht2015deep,jaakkola1994reinforcement}. 
Existing research has demonstrated that an agent's SDM capability can be improved by reasoning with contextual knowledge to estimate the current world state~\citep{zhang2015corpp,chitnis2018integrating}. 
However, the contextual knowledge provided by domain experts can hardly be comprehensive, and sometimes includes inaccurate information. 
Motivated by the observation that AIs need significant efforts to recover from inaccurate knowledge in SDM tasks~\citep{amiri2020learning}, we aim to develop an approach to help the SDM robots learn to reason about contextual knowledge. 

Consider a lane changing scenario in urban driving. 
On the one hand, the vehicle needs to perceive the environment, e.g., using Lidar range sensors, to detect whether there is sufficient room in the desired lane. 
The perception output, together with other contextual information (say weather and traffic), is then processed in a reasoning system to estimate the world state, including the intentions of other drivers (humans or not). 
On the other hand, the vehicle can plan actions to actively facilitate lane changing, such as using turn signals to request space, and slowing down to find room for the lane change. 
Existing methods have enabled robots to logically reason about the world state, and use the reasoning results to facilitate decision making~\citep{zhang2022survey}. 
However, how to learn from a robot's decision making experience to improve the reasoning capability for SDM tasks is still an open problem. 

In this paper, we develop a learning algorithm for knowledge-based SDM, called \emph{perceptual reasoning and interactive learning} (\textbf{PERIL}), as shown in Figure~\ref{fig:overview}. 
We use a perceptual reasoner that consists of a deep supervised learning classifier and a knowledge base of logic rules for perceiving and reasoning about the current world state. 
The perceptual reasoner takes as input streaming data from on-board sensors and observable facts, such as current time and weather. 
The contextual information is processed together to compute a distribution representing the current world state estimation. 
The distribution is then provided to the interaction component as an informative prior to guide its action selections toward achieving long-term goals.

\begin{figure*}[t]
  \begin{center}
    % \vspace{-1em}
    \includegraphics[width=0.95\textwidth]{images/PERIL_overview_3.pdf}
    % \vspace{-.2em}
    \caption{An overview of PERIL.
    The perceptual reasoner consists of a classifier for passive perception and a knowledge base of rules and weights for automated reasoning. It receives streaming data of feature vectors and facts from the environment. Based on the perception state estimation of the classifier and observed facts, the perceptual reasoner uses the knowledge base to infer an informative prior to compute the initial belief for the interaction. In the interaction, world dynamics refers to the way in which the environment and the state evolve over time (in POMDPs, world dynamics is represented by the transition function). Then a policy suggests the best action for information collection based on the belief at each time step.  Finally, the loop is closed by providing feedback containing labels and ground truth to the perceptual reasoner, training the classifier, and learning the new weights of rules. 
    }
    \label{fig:overview}
  \end{center}
% \vspace{-1em}
%https://docs.google.com/drawings/d/1rV1HhdBRgQazUT4qRY5DBcLZ6GeftsG-mpFUMsKn5l4/edit?usp=sharing
\end{figure*}

The \textbf{main contributions} of this paper include:
\begin{itemize}
    \item A formal statement of the knowledge-based SDM problem we are concerned with, where we specify the algorithm input and output, as well as the assumptions; 
    \item The PERIL algorithm that enables AIs to learn from both contextual knowledge and data gathered at runtime to close the perceive-reason-act loop; 
    \item Extensive experiments and illustrative trials in urban driving scenarios using CARLA-based~\citep{dosovitskiy2017carla} simulation for demonstrating the superiority of our approach.
\end{itemize}

In comparison to competitive baselines~\citep{amiri2020learning,6728533}, we found that PERIL improves the autonomous vehicle's overall performance in sequential decision-making by increasing cumulative rewards and reducing interaction costs.


\section{Related Work}
\label{sec:related}

This paper is about incorporating perceptual reasoning and interactive learning into sequential decision-making under uncertainty. 
We discuss research topics that are relevant to this work. 

Researchers have developed methods that incorporate human knowledge in declarative forms into planning under uncertainty frameworks~\citep{gobelbecker2011switching,zhang2015corpp,hanheide2017robot,chitnis2018integrating,amiri2020learning,amiri2022reasoning}. 
There are other works that studied how human knowledge can be used to improve the performance of reinforcement learning (RL) agents~\citep{zhang2022efficient,leonetti2016synthesis,yang2018peorl,icarte2022reward,jiang2019task,hayamizu2021guiding}. 
A survey paper summarized research on knowledge-based sequential decision making~\citep{zhang2022survey}. 
Those methods use a knowledge base that cannot be updated as the agent becomes more experienced.
Recently, LLM-based planning methods have been proposed, such as SayCan~\citep{ahn2022can}, and Inner Monologue~\citep{huang2022inner}. 
Nevertheless, these methods lack the ability to reason about human knowledge (COWP and LLM+P are exceptions~\citep{liu2023llm+,ding2022robot}), whereas our approach explicitly addresses quantitative uncertainty.
In comparison, PERIL learns to reason about contextual knowledge, producing agent behaviors that are robust to imperfect knowledge. 

AIs, including autonomous vehicles, that operate in the real world require the simultaneous capabilities of perception for estimating the current world state, and planning to achieve long-term goals~\citep{nilsson1984shakey,thrun2005probabilistic}. 
It is a common practice that the perception component outputs the current world state in a symbolic form to the planning component~\citep{khandelwal2017bwibots,veloso2018increasingly,shuai2019kejia}. 
There is recent research from the literature that tightly integrates the perception and planning components~\citep{hausknecht2015deep,lee2020making,wangswingbot,pmlr-v80-srinivas18b,ding2022glad}. 
There is a survey paper on interactive perception that summarized relevant research~\citep{bohg2017interactive}. 
They used machine learning techniques, e.g., a deep neural network, to estimate the current world state. 
What is passed to the planning component includes not only the current state in symbolic forms, but also the (un)reliability information. 
Also, recent research developed an approximate algorithm to help the agent choose a subset of exogenous state variables to reason about when
planning, and planning in such a reduced state space can often be significantly more efficient than planning in the full model~\citep{chitnis2020learning}.
There exists research that uses universal planning networks to learn underlying representations through visual perceptions so as to optimize planning~\citep{pmlr-v80-srinivas18b}. The learned representation can be leveraged to transfer task-related semantics to other agents for more challenging tasks.
PERIL shares the same spirit with the above-mentioned methods by learning complex representations for estimating the current world state. 
Beyond that, PERIL leverages contextual knowledge from domain experts to refine the output from neural networks (CNNs in our case) before passing it along to the planning component. 

Autonomous vehicles, as a type of robots, need to plan their behaviors under partial observability~\citep{bai2015intention}. 
More specifically, the on-board sensors cannot provide a global view of the environment, and the vehicles need to estimate the current world state based on the streaming data collected over time. 
POMDPs are well suited for planning behaviors under partial observability~\citep{kaelbling1998planning}, and have been used in planning for autonomous vehicles~\citep{6728533,ijcai2017-664,suchan2019out,wray2021pomdps,ha2020probabilistic,zhanghd2022dynamic}. For instance, \citeauthor{wray2021pomdps} used POMDPs to reason at the times when the perception data is limited, but their approach does not leverage any contextual knowledge for reasoning. 

Work closest to this research is an algorithm called LCORPP~\citep{amiri2020learning} that learns from data and reasons about human knowledge to estimate the world state. 
They used LSTM~\citep{hochreiter1997long} for sequence classification, and P-log~\citep{baral2009probabilistic} for representing and reasoning about contextual knowledge. 
Other than a different application domain, the main difference from that work is that PERIL is able to learn from the interaction experience to improve its reasoning capability, using Markov logic networks~\citep{richardson2006markov,domingos2019unifying}. 
To the best of our knowledge, PERIL is the first work that learns to use human knowledge for sequential decision-making under uncertainty. 
\section{Background}
In this section, we summarize three key techniques used in this paper: convolutional neural networks, Markov logic networks, and partially observable Markov decision processes.
\subsection{Convolutional Neural Networks}
A convolutional neural network (CNN) is comprised of convolutional layers followed by fully connected layers as in a standard multilayer neural network~\citep{lecun1998convolutional}. 
The basic building blocks of CNN consist of convolutional, pooling, activation, and fully-connected layers. In a convolutional layer, a filter is passed over the image, viewing a few pixels at a time. The convolution operation is a dot product of the original pixel values with weights defined in the filter. Pooling layers are used for downsampling, and fully-connected layers output  a list of probabilities for different possible labels. The activation layers introduce non-linearity.
The architecture of a CNN is designed to take advantage of the 2D structure of an input image (or other 2D input such as a speech signal). 
We use CNNs for the perception of road conditions in this work. 
\subsection{Markov Logic Networks}
Markov networks are undirected cyclic probabilistic graphical models where each edge has a potential function~\citep{richardson2006markov,domingos2019unifying}. 
Markov logic networks (MLNs) are a template for building Markov networks. 
They are first-order knowledge bases with a weight associated with each rule. 
A first-order logic knowledge base is a set of hard constraints on the set of possible worlds: if a world violates even one formula, it has zero probability. The basic idea in MLNs is to soften these constraints: \emph{When a world violates one formula in the knowledge base it is less probable, but not impossible.} 
An MLN program is a set of pairs $(F_i, w_i)$, where  $F_i$ is a formula in first-order logic and $w_i$ is a real number that specifies the weight of the formula.  Learning in MLNs can be done using the following equation: 
\[
  \frac{\partial \log P_w(X = x)}{\partial w_i} = n_i(x) - \sum_{x'} P_w(X = x')\, n_i(x'),
\]
where the sum is over all possible databases $x'$, $P_w(X = x')$ is $P(X = x')$
computed using the current weight vector $w = (w_1, \dots, w_i, \dots)$, and $n_i(x)$ is the number of true groundings of the $i$-th formula in data $x$.
In this paper, we use MLNs to enable an SDM agent to not only reason with human knowledge, but also learn to improve its reasoning capability from experience. 


\subsection{Partially Observable MDPs}
Markov decision processes (MDPs) can be used for SDM. 
When the environment is not fully observable, we can use POMDPs that generalize MDPs by assuming partial observability of the current state~\citep{kaelbling1998planning}. 
A partially observable MDP (POMDP) is a tuple $(S, A, T, R, Z, O, \gamma)$ where $S$ is the state space, $A$ is the action set, $T$ is the state-transition function, $R$ is the reward function, $O$ is the observation function, $Z$ is the observation set, and $\gamma$ is a discount factor that determines the planning horizon. 

A POMDP agent maintains a belief state distribution $b$ with observations ($z \in Z$) using the Bayes update rule: 
$$
	b'(s') = \frac{O(s', a, z)\sum_{s \in S} T (s,a,s')b(s)}{Pr(z|a,b)}  
$$
where $s$ is the state, $a$ is the action, $Pr(z|a,b)$ is a normalizer, and $z$ is an observation. 
Solving a POMDP produces a policy that maps the current belief state distribution to an action toward maximizing long-term utilities.

\section{Problem Statement}
\label{sec:problem}
In this section, we formally present the knowledge-based sequential decision making problem to pave the way for the learning algorithm developed in this paper. 
We first define the problem domain by the tuple below: 
$$
    \langle \Theta, \lefteqn{\underbrace{\phantom{E, F, H, Q}}_{\mathbf{V}^R}}E,F,H,\overbrace{Q,V}^{\mathbf{V}^I}, A, T, Z, O \rangle.
    % link : https://tex.stackexchange.com/questions/297/how-can-i-get-an-underbrace-and-an-overbrace-to-partially-overlap-in-an-equation
$$ 

The agent is provided with contextual knowledge $\Theta$, a finite set of first-order logical statements (rules). 
The logical rules of $\Theta$ are over a finite set of variables $\mathbf{V}^R = F \cup E \cup H \cup Q$, where $F$, $E$, $H$, and $Q$ are sets of fact, evidence, hidden, and query variables respectively. 
$V$ is the set of latent variables for interaction. 
% , and their values are initialized as a uniform distribution.
Interaction variables $\mathbf{V}^I$ consist of query and latent variables ($\mathbf{V}^I = Q \cup V$).
The agent is provided with a finite set of actions $A$ that the agent can perform. 
$T$ is a transition function: $T(s, a, s') = Pr(s'|s,a)$, where $s, s' \in S$, and $S$ is the 
factored state space specified by $\mathbf{V}^I$.
$Z$ is an observation set, and $O$ is an observation function: $O(s, a, z) = Pr(z|s, a)$.

Figure~\ref{fig:variables_diagram} depicts the two sets of variables $\mathbf{V}^R$ and $\mathbf{V}^I$ for reasoning and interaction respectively, and their overlap on $Q$.
Variable sets $E$, $F$, $H$, and $Q$ are pairwise
disjoint. 
Logical reasoning with $\Theta$ produces the combinatorial possible settings of $\mathbf{V}^R$ that are consistent with the logical statements.
The query variables are shared by both interaction and reasoning variables ($Q = \mathbf{V}^R \cap \mathbf{V}^I$, and $Q \neq \emptyset$). Some properties of the variables and their values:
\begin{itemize}
    \item The agent cannot directly observe the variables of $H \cup Q \cup V$. 
    \item Values of fact variables $F$ can be directly collected from the world and no perception is needed. 
    \item Variables $E$ are estimated via streaming data $\lambda$ from sensory readings (e.g., Lidar sensors and cameras). 
\end{itemize}

In episode $i$ and at execution time $t$, the agent receives an observation $z_t\in Z$ and sensory readings $\lambda^i_t$, where $\lambda^i_t$ is used to perceive the values of $E$ in episode $i$. 
After each episode $i$ (i.e., when a terminal state is reached), values of $\mathbf{V}^R \cup \mathbf{V}^I$ are provided by a human expert, and the collected data can be used for learning purposes. 

The robot's task is specified by a reward function $R: S \times A \rightarrow \mathbb{R}$. 
The objective is to compute a policy $\pi$ for the robot to choose actions at each time step toward maximizing its expected future discounted reward, $\mathbf{E}\left[\sum_{t=0}^{\infty} \gamma^{t} r_{t}\right]$, where $\gamma$ is a discount factor, and $r_t$ is the reward received at time $t$. 

\begin{wrapfigure}{r}{0.4\columnwidth}
  \vspace{-2em}
  \begin{center}
    \includegraphics[width=0.39\columnwidth]{images/variable_diagram_0831.pdf}    
  \end{center}
  \vspace{-1em}
  \caption{Domain variables and their dependencies. }
  \vspace{-.5em}
  \label{fig:variables_diagram}
  %https://docs.google.com/drawings/d/1IrugQZBaj8pBRcJs6qIwViUjUd7Kqmu1P_4P7PjRYPM/edit
\end{wrapfigure}
% Given the stated domain and variables, PERIL is able to solve it as learning through the perceptual reasoning and interaction with the environment. 

%\vspace{.5em}
%\noindent
\textbf{Remarks:}
The diagram in Figure~\ref{fig:variables_diagram} can be viewed as an integration of two subproblems. 
The ``reasoning'' box points to a logical-probabilistic reasoning subproblem~\citep{richardson2006markov,baral2009probabilistic,wang2019bridging}, whose input includes logical facts ($F$), e.g., current time, and evidence ($E$), e.g., obtained using computer vision techniques. 
The values of $E$ are estimated using streaming data $\lambda$. 
One can use provided logical-probabilistic rules to infer the values of $Q$ (\textbf{Subproblem I}). 
The ``interaction'' box and the two variables of $A$ and $Z$ together capture the dependencies of a POMDP \emph{at one specific step}, which is the second subproblem of planning under uncertainty (\textbf{Subproblem II}). 
Variables $Q$ and $V$ form the state space of a POMDP, and values of $Z$ are used for state estimation. 
One can compute a policy $\pi$ for the POMDP for sequentially selecting $a \in A$. 
When the two subproblems overlap on some variables ($Q$), one can leverage the reasoning results to guide a robot's sequential decision-making. 
To the best of our knowledge, it is the first time that this integrated reasoning and planning problem is formulated using a pictorial diagram. 

While existing research has investigated reasoning for planning under uncertainty~\citep{zhang2015corpp,chitnis2018integrating,amiri2020learning}, those robots cannot improve their reasoning capabilities as the robots become more experienced. 
Next, we present a learning algorithm that helps a robot improve its skills of leveraging domain knowledge for decision making under uncertainty. 

\begin{algorithm}[t]
\caption{PERIL}

\label{alg:APR}
\footnotesize
\begin{algorithmic}[1]
\ENSURE{Domain $\langle \Theta, E, F, H, Q, V, A, T, Z, O \rangle$, reward function $R$, and parameter $N$}
% \REQUIRE Reasoner $\Omega$, POMDP model $\Psi$ as interaction planner, logic rules $\Theta$, weight set $\mathcal{W}$, dataset $\Phi$, Classifier $C$, parameters $\mathcal{N}$ and $n$
\REQUIRE MLN system $Sol^R$, POMDP system $Sol^P$, relational learning system $Lrn^R$, and supervised learning system $Lrn^S$
\STATE Initialize dataset $\Phi \leftarrow \emptyset$; dataset $\Psi \leftarrow \emptyset$; $\pi \leftarrow \textnormal{random}$; 
classifier $C\leftarrow \textnormal{random}$ \label{A1}
\STATE Initialize weights $W$: $w\leftarrow 1.0$ for each $w \in W$, where each $w$ corresponds to a rule $\theta \in \Theta$ \label{A2}
\STATE Compute $\pi$ using $Sol^P$ for POMDP: $(Q\cup V, A, T, Z, O, R)$ \label{A3}
\WHILE [No termination condition -- lifelong learning] {true}  \label{A4}
    % \STATE Initialize $Counter^{episode} \leftarrow < N $ \label{A5}
    \FOR{$i \in [0,N-1]$} \label{A5}
    \STATE Get $\lambda$, and $f$ from the world, where $f$ is a vector of $F$ 
    \label{A6}
    \STATE $e \leftarrow C(\lambda)$; $e$ is a vector and includes the values of $E$  \label{A7}
    \STATE $Pr(Q) \leftarrow Sol^R(\Theta, W, f, e)$  \label{A8}
    \STATE Compute distribution $b$ over state set $S=Q \cup V$ using $Pr(Q)$ and uniform distributions over variables $V$ \label{A9}
    \WHILE{$s$ is not a terminal state} \label{A10}
        \STATE Select action $ a \leftarrow \pi (b) $ and execute $a$ \label{A11}
        \STATE Make an observation $z$ \label{A12}
        \STATE Update $b$ based on $a$ and $z$ \label{A13}
    \ENDWHILE \label{A14}
    \STATE Collect ground truth values $\mathbf{v}^R=\{\hat{e}, \hat{f}, \hat{h}, \hat{q} \}$ \label{A15}
    \STATE Augment dataset: $\Phi \leftarrow \Phi \cup \{\lambda: \hat{e}\}$   \label{A16}
    \STATE Augment dataset: $\Psi \leftarrow \Psi \cup \{ \mathbf{v}^R \}$\label{A17}
    \ENDFOR \label{A18}
\STATE $C \leftarrow Lrn^{S}(\Phi)$ \COMMENT{Supervised learning}  \label{A19}
\STATE $W \leftarrow Lrn^{R}(\Theta, \Psi, W) $  \COMMENT{Relational learning} \label{A20}
\ENDWHILE \label{A21}
\end{algorithmic}
\end{algorithm}

\section{Algorithm}
\label{sec:framework}
In this section, we present PERIL, short for ``\emph{perceptual reasoning and interactive learning},'' a novel algorithm that addresses the knowledge-based sequential decision-making problem described in Section~\ref{sec:problem}. 
A PERIL agent perceives the environments using supervised learning, reasons over domain variables using contextual knowledge, and generates interaction behaviors using a decision-theoretic planning approach. 
PERIL's reasoning capability is enhanced via relational learning as the agent is more experienced over time. 

Algorithm~\ref{alg:APR} describes PERIL, the key contribution of this research.
The input includes a domain description, a problem description specified by reward function $R$, and parameter $N$ for batch-based learning. 
Implementing PERIL systems requires software tools for relational learning ($Lrn^R$) and supervised learning ($Lrn^S$), as well as MLN and POMDP systems ($Sol^R$ and $Sol^P$). 
Lines~\ref{A1}-\ref{A2} are for initialization, where $\Phi$ and $\Psi$ are for storing data for supervised learning and relational learning respectively. 

There are three loops in PERIL. 
%% Outer while loop
 Each iteration of the \textbf{outer while loop} (Lines~\ref{A4}-\ref{A21}) corresponds to one batch where supervised learning and relational learning are activated once (Lines~\ref{A19}-\ref{A20}). 
%%5 For loop
 The nested \textbf{for-loop} (Lines \ref{A5}-\ref{A18}) includes $N$ iterations -- each corresponding to a sequence of perceptual reasoning (Lines \ref{A6}-\ref{A9}), interaction (Lines \ref{A10}-\ref{A14}), and data augmentation (Lines \ref{A15}-\ref{A17}).  
  In perceptual reasoning, the agent infers the query variables $Q$, using the logical weighted rules ($\Theta$, $W$), and the direct observations ($f$) or the estimated observations ($e$) from the world (Lines \ref{A6}-\ref{A8}). 
Using the union of inferred $Q$ and $V$, PERIL builds the initial prior belief $b$ (Line~\ref{A9}), where the posterior is calculated in the inner interaction while-loop (Lines \ref{A10}-\ref{A14}). Once the interaction loop is done, two datasets are augmented with the newly collected data instances.    
%receives raw sensory data $\lambda$, and collects facts $f$(Line~\ref{A6}). 
%It estimates the evidence variable $e$ using the streaming data $\lambda$ and infers $Q$ based on $\Theta$, $W$, $f$, $e$ variables. B  
%%% inner while loop
 The \textbf{inner while loop} (Lines \ref{A10}-\ref{A14}) corresponds to one episode; in each iteration, the agent takes one action $a$, 
%in each iteration
makes an observation $z$, and updates belief $b$. The actions in this loop are suggested by policy~$\pi$ calculated by $Sol^{P}$. 
PERIL is a lifelong learning algorithm for SDM, and does not have a termination condition. 

PERIL agents learn from interaction experience to improve their capabilities of reasoning with contextual knowledge from people, and planning under uncertainty. 
To the best of our knowledge, no existing algorithm supports this ``learning to reason and plan'' capability. 
%PERIL is the key contribution of this paper. 
Next, we describe a full instantiation of PERIL, as applied to an urban driving domain. 

\begin{figure}[t]
% \vspace{.5em}
    \begin{subfigure}[t]{0.4\linewidth}
         \centering
         \includegraphics[scale=0.09]{images/cop.png}
         \caption{Cooperative}
         \label{fig:1}
     \end{subfigure}
     \quad
     \begin{subfigure}[t]{0.4\linewidth}
         \centering
         \includegraphics[scale=0.09]{images/non_cop.png}
         \caption{Not cooperative}
         \label{fig:2}
     \end{subfigure}
    %  \vspace{-.3em}
        \caption{Two lane merging situations in CARLA-based simulation. (a) The vehicle on the left is cooperative and yields the right of way. (b) The vehicle on the left is not cooperative.}
        \vspace{-.7em}
        \label{fig:carla}
\end{figure}

\begin{figure*}[t]
% \vspace{-.5em}
    \centering
    \includegraphics[width=\textwidth]{images/perception_process.pdf}
    \vspace{-.5em}
    \caption{An overview of the perception component where the vehicle receives raw data from the Lidar sensor. 
    The sensory readings are projected to 2D space, and converted into an image. Finally, a CNN outputs whether the desired lane is sensed as crowded.} 
    %https://docs.google.com/drawings/d/1D-ypAacfSsiH-4zSl-82UAU4j0fiTd2Eb1JoojfEmBQ/edit?usp=sharing
    \label{fig:architecture}
\end{figure*}

\section{Instantiation}
\label{sec:inst}
We use CARLA, an open-source autonomous driving simulation platform, to illustrate a realization of PERIL~\citep{dosovitskiy2017carla}. 
A CARLA environment consists of 3D models of vehicles, traffic signs, buildings, and pedestrians. 
Figure~\ref{fig:carla} shows two example lane-merging situations. 
Next, we provide technical details of each component of our PERIL framework.

\paragraph{CNNs for Perception}
$C$ is our classifier that takes as input raw sensory data (3D Lidar sensory readings in our case), and outputs the road condition. 
We use CNN to build classifier $C$, and to process streaming data $\lambda$ from Lidar sensors. 
Figure~\ref{fig:architecture} shows how $C$ is constructed in our instantiation. 
The 3D sensory readings are first projected to 2D space. 
Then the road area is cropped out to generate a 2D image, which is fed into CNNs for classification. 
The output of classifier $C$ is saved in variable $CarsDetected$ ($true$ or $false$). 
In our domain, $E$ includes only one element: $E=\{CarsDetected\}$. 

\paragraph{MLNs for Logical Probabilistic Reasoning}
We use MLN for logical probabilistic reasoning, and relational learning. 
Our MLN-based reasoner includes five variables: $Weather$, $Time$, $Crowded$, $CarsDetected$, and $Cooperative$. 
Among them, $Weather$ and $Time$ are fact variables: $F=\{Weather, Time\}$. 
The weather can be $Sunny$ or $Rainy$, and the time is either $Busy$ or $Normal$, which is used for reasoning about traffic condition. 
$Crowded$ and $Cooperative$ are query variables: $Q=\{Crowded, Cooperative\}$. $H=\emptyset$. 
There is one evidence variable $E=\{CarsDetected\}$. 
Other drivers' behaviors are simplified to a binary variable of $Cooperative$ with a domain of $true$ or $false$. 
An MLN program includes a set of first-order logical statements, where each is associated with a weight. 
We use MLN to build our logical probabilistic reasoner $Sol^{R}$ and relational learning system $Lrn^{R}$. 
First-order logic rules $\Theta$ form the declarative domain knowledge base. For instance, the following rule
\begin{align*}
    \texttt{\small Time(+t,s)$\rightarrow$ Crowded(+c,s)}
\end{align*} 
indicates that the time implies the crowdedness of the road. If it is at busy time, it is likely the road is crowded. If it is at normal time, it is more likely that the road is not crowded.
The second rule 
\begin{align*}
    \texttt{\small Crowded(+c,s)$\rightarrow$ CarsDetected(+d,s)}
\end{align*}
states that, when the road is crowded, it is more likely that the ego vehicle can detect surrounding cars.
The third rule 
\begin{align*}
\texttt{\small Weather(+w,s) $\wedge$ Crowded(+c,s)$\rightarrow$ *Cooperative(s)}
\end{align*} 
states that the weather condition and the road crowdedness affect whether the surrounding vehicles (drivers) are cooperative. For example, rainy weather (e.g., affecting drivers' visibility) and crowded roads might cause the drivers to be less cooperative.
All rules $\Theta$ are associated with weights. During weight (relational) learning, each rule is converted to conjunctive normal form, and a weight is learned for each of its clauses.
It should be noted that those are ``commonsense'' rules that are normally correct but not always. 
MLNs are well suited for learning to reason with those rules. We then use the values of $H$, $E$, and $F$ as input to infer the values of the query variables $Q$ with the MLN.

\paragraph{POMDPs for Planning under Uncertainty}
We use POMDPs to construct a probabilistic planner for active information gathering, and goal achievement. 
$S : Q \times V \cup \{term \}$ is the state space, where $term$ is a terminal state that identifies the end of an episode. 
$V=\{RoomAvailable\}$. 
$RoomAvailable=true$  means that there is room available in the desired lane for the ego vehicle's lane merging behavior. 
We consider three behaviors in our action space: 
$A=\{signal,~move,~merge\}$, where we assume the vehicle can only merge to one side of the road (say left). 
$signal$ means that the vehicle uses turn signal to indicate its intention to merge. 
$move$ means that the vehicle adjusts its position to get prepared for lane changing, which is also useful for communicating its intention to the other drivers. 
Intuitively, after the vehicle is confident that there is room in the desired lane, and the other drivers are cooperative, the vehicle should take the $merge$ action. 

We use the transition function $T(s, a, s')=Pr(s'|s,a)$ to model how action $a$ leads to the transition from $s$ to $s'$. 
Actions except for $merge$ have different costs (a small negative value). 
Action $merge$ causes either a big reward or a big penalty (a big negative value), depending on the road condition (values of $Cooperative$, and $RoomAvailable$). 
For instance, if $Cooperative=false$ or $RoomAvailable=false$, action $merge$ will result in a big penalty. 
Action costs, success reward, and failure penalty are modeled in reward function $R(s,a)$. 

The observation set is $Z:\{true, false, na\}$. 
We use the observation function $O(s,a,z)=Pr(z|s,a)$ to describe the perception model of the vehicle. 
For instance, when $Cooperative=true$, there is a $0.7$ probability that the vehicle observes $true$ (the other drivers are cooperative).

\begin{figure}[t]
    \includegraphics[width=\columnwidth]{images/illustrative_example_4.pdf}
    \vspace{-.8em}
    \caption{An illustrative example trial of PERIL.}
    %https://docs.google.com/drawings/d/1KFUJhiZaEF2IbnFUtmYjluQcpwAzZr367uH62D0PFLs/edit?usp=sharing
    \label{fig:example}
\end{figure}


\begin{figure*}[t]
\vspace{-.5em}
    \begin{center}
    \includegraphics[width=.95\textwidth]{images/sequence_of_actions_4.pdf}
    %\vspace{-2.2em}
    %\vspace{-1.5em}
    \caption{The ego vehicle took a sequence of actions in the interaction process to successfully merge left. (a) The ego vehicle intended to merge left. It turned on the left signal. (b) The surrounding vehicle on the left was not cooperative at first. The ego vehicle kept left blinking. (c) The surrounding vehicle on the left became cooperative, and the ego vehicle started to move left. (d) The ego vehicle kept moving left and found room in the left lane. (e) The ego vehicle successfully merged left.  }
    %https://docs.google.com/drawings/d/1GL0_uysdbbyl5oSuaV85UnltmYlGn5rp0us4gyjGzu0/edit?usp=sharing
    \label{fig:sequence}
    \end{center}
\end{figure*}


\section{Illustrative Example}
Figure~\ref{fig:example} shows an example trial. 
The vehicle first collected a ``fact'' that it was a rainy day at a busy time. 
The vehicle received streaming data, and the CNN classifier output ${CarsDetected=true}$, meaning that the left lane was occupied by at least one vehicle. 
% and directly observes the facts that \textit{Weather} is \textit{Rainy} and \textit{Time} is \textit{Busy}. 
Reasoning with contextual knowledge about weather and time, our vehicle believed that it was likely the road was crowded and the other drivers were less cooperative. 
The ego vehicle then used our MLN-based reasoner to perform probabilistic inference, and found that $Pr(Crowded=true)=0.995$, and  $Pr(Cooperative=false)=0.970$. 
% driver is not cooperative with the probability of 0.97. 
Those probabilities were used to initialize the POMDP belief $b$.
With the initial belief of the current state and sequential observations, the ego vehicle repeatedly selected actions as shown in Figure~\ref{fig:sequence}.
After two $signal$ and two $move$ actions, the ego vehicle successfully completed the lane-merging task. 
%\footnote{A demo video is uploaded as part of this submission. }

\section{Experiments}
We have conducted experiments using the CARLA simulator to evaluate the key hypothesis that learning to reason about domain knowledge improves the agent's performance within the sequential decision-making context. 
We have compared PERIL with the following baselines. 
\textbf{LCORPP} is a baseline method that uses supervised learning for perception, and automated reasoning to guide a probabilistic planner~\citep{amiri2020learning}. 
LCORPP's knowledge base is hardcoded, so it cannot learn to reason about knowledge. 
\textbf{PERIL w/o POMDP} is the same as PERIL except that the action policy is manually crafted: the vehicle takes up to two $signal$ actions (depending on the confidence on state estimation), then a $move$ action, and $merge$. 
\textbf{POMDP-LC} is a classic POMDP-based approach for planning lane changing behaviors~\citep{6728533}, which includes neither supervised learning nor relational learning. 

\paragraph{Experiment Setup}
In each trial, we first spawn our ego vehicle, which is tasked to merge to the left lane. 
We set the range of the Lidar sensor to $20$\,m. 
We sequentially spawn  $M$ vehicles on the left lane ($0\le M \le 8$ in our case) within an area of $radius=20m$ around the ego vehicle. 
If a vehicle has any contact with an existing one, then this vehicle is moved and re-spawned.
We annotated the Lidar sensory data: if there exist two vehicles in the left lane that are at most $10m$ away from each other, then a Lidar instance is labeled $true$, i.e., $CarsDetected=true$. Otherwise, the label is $false$. 
% where an instance is labeled $False$ if there exists less than two vehicles in the left lane or the distance of the closest vehicle is greater than 10 m, otherwise the label is $True$.
Fact variables $Time$ and $Weather$ were sampled uniformly. 
$Crowded$ and $Cooperative$ were sampled using the Markov network of our MLN program. 
For instance, if $Time=Busy$, then there is probability $0.7$ that $Crowded=true$. 
We have added perception noise into the observation model. 
For instance, the vehicle's observation is correct with probability $0.7$. 
%\saeid{this is the definition of positive and negative, we probably have defined it earlier }We set that the other driver is cooperative and there is available space on the left lane as "positive" observation, and that the other driver is not cooperative and there is no room on the left lane as "negative" observation. 
The costs of $signal$ and $move$ actions are $10s$ and $15s$ respectively. 
Successful and unsuccessful trials receive $100$ and $-100$ reward respectively.

We used Alchemy for MLN-based relational learning and logical probabilistic reasoning.\footnote{\url{https://alchemy.cs.washington.edu/}}
POMDPs were solved using an off-the-shelf solver~\citep{kurniawati2008sarsop}. 
We used PyTorch~\citep{paszke2019pytorch} for training the CNNs. 
%\saeid{what is the value of N and other parameters mentioned in the algorithm}

\begin{figure}[t]
     \centering
     \begin{subfigure}{0.48\columnwidth}
         \centering
         \includegraphics[width=\columnwidth]{images/reward_comparison.pdf}
         %\caption{}
         \label{fig:reward_comparison}
     \end{subfigure}
     \hfill
     \begin{subfigure}{0.48\columnwidth}
         \centering
         \includegraphics[width=\columnwidth]{images/cost_comparison.pdf}
         %\caption{}
         \label{fig:cost_comparison}
     \end{subfigure}
     \vspace{-.9em}
        \caption{PERIL performed better than the baselines in both overall reward, and interaction cost. }
        \label{fig:comparison}
\end{figure}

\paragraph{Experimental Results}
Every data point in our figures is an average of 4,000 trials, evenly distributed into 5 runs. 
We evaluated the mean values of the 5 runs for each data point, and used the 5 mean values to generate the standard errors. 

Figure~\ref{fig:comparison} shows the results of comparing PERIL with three baseline methods. 
We see that PERIL achieved the highest cumulative reward on average, and required the lowest interaction cost on average. 
The LCORPP baseline produced the second best performance in both reward and cost, which indicates the usefulness of perceptual reasoning. In a stochastic world, LCORPP cannot learn how likely its handcrafted rules are to be correct, whereas PERIL, with perceptual reasoning, can learn the weights associated with such rules for better reasoning. 
Specifically, PERIL uses MLN to learn to reason about contextual knowledge, which contributes to the best performance among the four methods. 
% The results in Figure~\ref{fig:comparison}  supports our hypothesis where in comparison with three baselines, PERIL achieves the highest reward and lowest action cost.
All methods produced an average success rate between $0.87$ and $0.89$, and we did not observe statistically significant differences among the methods. 
A successful merge is when the ego vehicle merges left in the presence of enough room and vehicle cooperation. 
An unsuccessful merge in our setup functions like a risky situation in practice, and does not indicate a collision, because autonomous vehicles (or human drivers) have collision-avoidance mechanisms, which are not considered in our experiments. 
% Each data point is an average 20,000 trials, split into 5 batches of size 4000.
Results here support our key hypothesis that PERIL outperforms baseline methods with higher rewards and lower costs. 

% Table \ref{table:tab1} shows the performances of PERIL and baselines under different perception capabilities e.g., $reliability=0.5$ indicates that the observation is random.
% In reality, the $reliability$ of observation of the ego vehicle should be much higher than a random, so we demonstrate the results starting from  $reliability=0.7$.
% The results suggest that PERIL outperformed the baselines as long as the vehicle's perception capability is reasonably good ($\geq 0.7$).
Table~\ref{table:tab1} shows the performances of PERIL and baselines under low and high perception capabilities. 
Low (high) perception quality corresponds to a POMDP observation function where the vehicle can correctly perceive ``crowdedness'' with probability 0.7 (0.9). 
Our hypothesis is that PERIL's superiority over the other methods is not affected by the vehicle's perception system. 
The results suggest that PERIL significantly outperformed the baselines at the \textbf{0.05 significance level}. 

\paragraph{Ablation Study}

We did an ablation study to evaluate the importance of the two learning components in PERIL (supervised learning and relational learning). 
%to compare the approach of learning from two dimensions with learning from only one of the dimensions. 
% \textbf{PERIL (w/o supervised learning)} where only the weights of logical rules are learned and \textbf{PERIL (w/o relational learning)} where only the classifier is learned.
The results are shown in Figure~\ref{fig:curve_band}. 
% Each data point is the average of 20,000 trials. 
% In the early stage of the experiment, we set batch number $N=100$, but we increase it to larger value $N=400$ in the later stage of the experiment. 
% As PERIL's dataset is augmented, it learns from more perceptual and reasoning data and therefore, having higher reward and taking less action costs. 
% While, the performance of ``PERIL w/o relational learning" or ``PERIL w/o supervised learning" also improved, they could not beat PERIL due to not learning from perceptual data or reasoning data.   
Our first observation is that PERIL performed better than its two ablations in both overall reward, and interaction cost, except for the very early learning phase. 
Another observation is that relational learning plays an important role in the PERIL system. 
When relational learning was disabled, there was a significant increase in interaction cost, in comparison to the ablation with supervised learning removed. 
This is potentially because the MLN-based reasoner can learn to ``compensate'' for the missing perception component. 

\begin{table}[t]
\scriptsize
\begin{center}
    \caption{The performances of PERIL and baselines in reward and cost under different perception qualities. 
    PERIL performed the best in both reward and cost with statistically significant improvement, as indicated using bold italic font. 
    }
    \begin{tabular}{|c|c|c|c|c|}
    \hline
    \multirow{3}{*}{Algorithm}  & \multicolumn{4}{|c|}{Perception quality}\\
    \cline{2-5}
    & \multicolumn{2}{|c|}{Low} & \multicolumn{2}{|c|}{High}  \\
    \cline{2-5}
    & Reward & Cost & Reward & Cost \\
    \hline
    PERIL & \textbf{\emph{46.5}} (0.5) & \textbf{\emph{28.7}} (0.3) &  \textbf{\emph{64.1}} (0.7) & \textbf{\emph{27.1}} (0.3)  \\
    \hline
    LCORPP & 43.5 (1.0) & 34.1 (0.3) & 62.4 (0.4) & 31.0 (0.2) \\
    \hline 
    PERIL w/o POMDP & 20.9 (0.9) & 45.0 (0.0) & 20.2 (1.0) & 45.0 (0.0) \\
    \hline 
    POMDP-LC & 40.2 (0.8) & 39.7 (0.1) & 62.4 (0.4) & 32.3 (0.2) \\
    \hline
    \end{tabular}
    \label{table:tab1}
\end{center}
\end{table}

\begin{figure}[t]
\begin{center}
\hspace{-.5em}
     \includegraphics[width=\columnwidth]{images/curveband.pdf}
     \caption{PERIL performed better than its two ablative versions as more data instances were provided for training. }
     \label{fig:curve_band}
\end{center}     
\end{figure}

\section{Conclusion and Future Work}
In this work, we develop an algorithm called PERIL that learns to reason with contextual knowledge for sequential decision-making. 
PERIL uses convolutional neural networks  for perception, Markov logic networks for reasoning, and partially observable Markov decision processes for planning under uncertainty. 
We have extensively evaluated PERIL in urban driving scenarios. 
Results suggest that PERIL outperformed competitive baselines, as well as its own ablations, in both overall reward and interaction cost. 

Currently, the vehicle learns to perceive the environment (road condition) from data, and learns to improve its reasoning capability using MLN. 
One direction of future work is to replace the POMDP-based planner with a reinforcement learning component. 
By doing that, the vehicle will be able to learn to select actions from its task-completion experience. 
Another direction is to actively acquire knowledge from people~\citep{amiri2019augmenting}, commonsense knowledge bases~\citep{speer2017conceptnet}, or pre-trained models~\citep{brown2020language} to avoid hand-coding rules.

The experimental setting focuses only on a small subset of the autonomous driving problem, and certain set-ups are simplified due to limitations of time and resources. Further verifying the scalability of PERIL on complex, real-world autonomous driving problems would require data collection in autonomous driving (or other multiagent, interactive) domains, which could be expensive, time-consuming, and sometimes risky, and is far beyond the scope of this paper. Nevertheless, this is something that PERIL practitioners should consider. In the future, we will consider applying PERIL to other non-driving domains and incorporating robot control into the loop.

%\begin{contributions} % will be removed in pdf for %initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    %Briefly list author contributions. 
    %This is a nice way of making clear who did what %and to give proper credit.
    %This section is optional.

    %H.~Q.~Bovik conceived the idea and wrote the paper.
    %Coauthor One created the code.
    %Coauthor Two created the figures.
%\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    %Briefly acknowledge people and organizations here.
This work has taken place at the Autonomous Intelligent
Robotics (AIR) Group, SUNY Binghamton. AIR research is
supported in part by grants from the National Science Foundation (NRI-1925044), Ford Motor Company (URP Award 2019-2022), OPPO (Faculty Research Award 2020), and SUNY Research Foundation.
    %\emph{All} acknowledgements go in this section.
\end{acknowledgements}

% References
\bibliography{PERIL_281}
\end{document}
