%\documentclass{uai2022} % for initial submission
 \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
% \usepackage[american]{babel}
\usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools, amsmath, amssymb, colonequals} 
\usepackage{amsthm}
\usepackage{xcolor}
\usepackage{paralist}

\theoremstyle{definition}
\newtheorem{example}{Example}
\newtheorem{definition}{Definition}




% amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usetikzlibrary{calc,positioning,arrows,shapes,fit,backgrounds,scopes}
\usepackage{multirow}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Inductive Synthesis of Finite-State Controllers for POMDPs
% goes at the end of the paper
%\thanks{This work has been partially supported by the Czech Science Foundation grant \mbox{GJ20-02328Y} and the ERC AdG Grant \mbox{787914} (FRAPPANT).}
}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%

\usepackage{multirow}

\newcommand{\sinit}{s_{0}}
\newcommand{\Act}{Act}
\newcommand{\Obs}{Z}
\newcommand{\obs}{z}
\newcommand{\obsFun}{O}
\newcommand{\node}{n}
\newcommand{\nodes}{N}
\newcommand{\mpm}{P}
\newcommand{\mdpT}{(S,\sinit,\Act,\mpm)}
\newcommand{\mdp}{M}
\newcommand{\fscT}{(\nodes,\node_0,\gamma,\delta)}
\newcommand{\fsc}{F}
\newcommand{\ffsc}{\mathcal{F}}
\newcommand{\abst}[1]{\mathcal{A}^{#1}}
\newcommand{\ffscT}{(N,n_0,K_{\Act},K_N)}
\newcommand{\pomdpT}{(S,\sinit,\Act,\mpm,\Obs,\obsFun)}
\newcommand{\pomdp}{\mathcal{M}}
\newcommand{\imc}{\pomdp^\fsc}
\newcommand{\ifmc}{\pomdp^\ffsc}
\newcommand{\ifmck}{\pomdp^{\ffsc_k}}
\newcommand{\pathset}{\mathsf{Paths}}
\newcommand{\fcsset}{\mathsf{FCS}}
\newcommand{\last}[1]{\mathrm{last}(#1)}
\newcommand{\iverson}[1]{\ensuremath{[#1]}}

\renewcommand{\path}{\tau}
\newcommand{\policy}{\pi}
\newcommand{\policymin}{\policy_{\min}}
\newcommand{\policymax}{\policy_{\max}}
\newcommand{\given}{~\vert~}
{}
\newcommand{\pimin}{\pi_{\min}}
\newcommand{\pimax}{\pi_{\max}}


%\newcommand{\pomdp}{\mathcal{M}}

\newcommand{\FSCinit}{\langle  \rangle }
\newcommand{\unitinterval}{[0,1]}
\newcommand{\supp}{\mathrm{supp}}
\newcommand{\prob}{P}
\newcommand{\rew}{R}
\newcommand{\F}[1]{\Diamond#1}

\newcommand{\sj}[1]{\textcolor{red}{SJ: #1}}
\newcommand{\mc}[1]{\textcolor{blue}{#1}}
\newcommand{\ra}[1]{\textcolor{green}{#1}}
\newcommand{\jp}[1]{\textcolor{orange}{JPK: #1}}


% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Roman Andriushchenko}
\author[1]{\href{mailto:<ceskam@fit.vutbr.cz>?Subject=Your UAI 2022 paper}{Milan \v{C}e\v{s}ka}{}}
\author[2]{Sebastian Junges}
\author[3]{Joost-Pieter Katoen}
% Add affiliations after the authors
\affil[1]{%
    Brno University of Technology, Brno, Czech Republic
}
\affil[2]{%
   Radboud University, Nijmegen, The Netherlands
}
\affil[3]{%
   RWTH Aachen University, Aachen, Germany
  }
  
  \begin{document}
\maketitle

\begin{abstract}

We present a novel learning framework to obtain finite-state controllers (FSCs) for partially observable Markov decision processes and illustrate its applicability for indefinite-horizon specifications.
Our framework builds on oracle-guided inductive synthesis to explore a design space compactly representing available FSCs. 
The inductive synthesis approach consists of two stages: The outer stage determines the design space, i.e., the set of FSC candidates, while the inner stage efficiently explores the design space. 
This framework is easily generalisable and shows promising results when compared to existing approaches.
Experiments indicate that our technique (i) is competitive with state-of-the-art belief-based approaches for indefinite-horizon properties, (ii) yields smaller FSCs than existing methods for several POMDP models, and (iii) naturally treats multi-objective specifications.
\end{abstract}

%\vspace{-1em}
\section{Introduction}\label{sec:intro}
 Partially observable MDPs (POMDPs) model sequential decision making processes in which the agent only observes limited information about the current state of the system~\citep{smallwood1973optimal,kaelbling1998planning}. The key challenge in the analysis of POMDPs is to compute a policy satisfying some constraints, captured as a threshold on (discounted) reward or as a task description given in, e.g., a temporal logic. In full generality, policies need arbitrary memory to reflect the belief state of the agent. Point-based~\citep{DBLP:journals/jair/PineauGT06,DBLP:journals/jair/SpaanV05} and Monte Carlo methods~\citep{silver2010monte} excel in finding such policies. Solving the generally undecidable policy learning problem profits from having complementary approaches in the portfolio. A natural alternative is to search for (small) finite-state controllers (FSCs)~\citep{hansen1998solving}. Such controllers provide benefits in terms of explainability~\citep{bonet2010automatic,wang2019state}, resource-consumption~\citep{grzes2013isomorph}, and generalisability~\citep{DBLP:conf/iclr/InalaBTS20}. Recently, an automata learning framework has been proposed for synthesising permissive FSCs~\citep{Bo2021Supervised}. In this paper, we propose a novel approach---\emph{inductive synthesis}---to find FSCs for POMDPs.
 
Inductive synthesis is a technique developed in the context of program synthesis---the task, originally proposed by Church in the 1950s, of constructing a program that provably satisfies a given formal specification. As developing a program (or in this context, a controller) from scratch is mostly infeasible, variants emerged, most notably syntax-guided synthesis~\citep{DBLP:series/natosec/AlurBDF0JKMMRSSSSTU15-short,DBLP:journals/cacm/AlurSFS18} and variations such as \emph{sketching}~\citep{DBLP:conf/asplos/Solar-LezamaTBSS06}.
In sketching, the user provides a sketch that outlines a controller implementation, and a specification that constrains the controller's behaviour. 
The principal engine behind many instances of sketching is (oracle-guided) \emph{inductive synthesis}~\citep{DBLP:journals/acta/JhaS17}, which falls into the more general class of learner-teacher frameworks. 
In a nutshell, this methodology suggests to heuristically \emph{guess} candidate solutions, to \emph{validate} them, and in case the solution is not satisfactory, \emph{learn} in order to refine the search heuristic. 
The successful application of inductive synthesis has inspired numerous applications beyond classical programming, including recent works on sketching of probabilistic programs~\citep{DBLP:conf/pldi/NoriORV15,DBLP:journals/fac/0002HJK21,DBLP:conf/cav/AndriushchenkoC21} and (variations of) programmatic reinforcement learning~\citep{DBLP:conf/icml/VermaMSKC18,DBLP:conf/iclr/InalaBTS20}. 
\emph{This paper proposes inductive synthesis to search for FSCs in POMDPs.}

% \begin{figure*}
% \includegraphics[scale=0.12]{figs/architecture.jpeg}
% \end{figure*}

% Partially observable Markov decision processes (POMDPs) provide an elegant formal model for automated decision making under uncertainty in partially observable environments~\citep{smallwood1973optimal,kaelbling1998planning}.
% They extend MDPs with observation labels, and restrict policies to be observation-based: paths with the same observation traces are indistinguishable and yield the same decisions.
% POMDPs have been successfully applied in many real-word systems ranging from robotic applications~\citep{pajarinen2017robotic}, aircraft collision avoidance systems~\citep{bai2012unmanned}, self-driving cars~\cite{brechtel2014probabilistic} to coordinating planetary rovers~\citep{becker2004solving}.  

\begin{figure}
    \centering
    \begin{tikzpicture}
        \node[inner sep=4pt,draw] (lout) {Learner};
        \node[inner sep=4pt,right=2cm of lout,draw,fill=black!5] (tout) {Teacher};
        
        \node[below=1.2cm of lout,inner sep=4pt, draw] (lin) {Searcher};
        \node[below=1.2cm of tout,inner sep=4pt, draw] (tin) {Eval};
        \node[left=2.6cm of lout,yshift=-2.8em,draw, minimum height=3cm] (abstr) {\rotatebox{90}{Abstr Oracle}};
        
          \draw[->] (lin) edge[bend left=10] node[above] (f) {\footnotesize{FSC}} (tin);
        \draw[->] (tin) edge[bend left=10] node[below] (p) {\footnotesize{value $\&$ conflicts}} (lin);
        
        \node[fit=(lin)(tin)(p)(f), inner sep=2pt, dashed, draw] (teacherdet) {};
        {[on background layer]
        \draw[dotted] (tout.north west) -- (teacherdet.north west);
        \draw[dotted] (tout.south east) -- (teacherdet.south east);
        
        \draw[dotted] (tout.south west) -- (teacherdet.south west);
        \draw[dotted] (tout.north east) -- (teacherdet.north east);
          
        \node[fit=(lin)(tin)(p)(f), inner sep=2pt, dashed, draw, fill=black!5] (teacherdet) {};
        }
        
        
        \draw[->] (lout) edge[bend left=10] node[above,fill=white] {\footnotesize{design space}} (tout);
        \draw[->] (tout) edge[bend left=10] node[below,fill=white] {\footnotesize{best FSC}} (lout);
        
        \draw[->] (lout) edge[bend right=10] node[above] {\footnotesize{design space}} (abstr.76);
        
        \draw[<-] (lout) edge[bend left=10] node[below] {\footnotesize{value bounds}} (abstr.71);
        
        \draw[->] (lin) edge[bend right=10] node[above] {\footnotesize{(sub)design space}} (abstr.-64);
        
        \draw[<-] (lin) edge[bend left=10] node[below] {\footnotesize{value bounds}}  (abstr.-72);
        
        
        
    \end{tikzpicture}
    \caption{Nested inductive synthesis framework with an abstraction oracle. The framework takes a POMDP and a specification and finds an FSC that satisfies the specification.}
   % \vspace{-1.1em}
    \label{fig:overview}
\end{figure}
Our inductive synthesis framework works in two stages, see Fig.~\ref{fig:overview}. 
Let us first discuss the outer stage.
Here, a learner constructs a \emph{design space} containing (finitely many) FSCs. A teacher provides the `best' FSC within this design space, and potentially additional diagnostic information. The learner either accepts the FSC provided by the teacher as the final result, or adapts the design space. Naturally, teachers will provide much better FSCs much faster whenever the design space for these FSCs is small. The key ingredient for the outer stage is thus to start with a small design space and to strategically modify this design space based on the obtained feedback from the teacher. A similar idea was proposed by~\citet{kumar2015history}, where the entropy of the observations is used as a criterion for adding memory to the FSC. We use the FSC returned by the teacher together with the state-values induced by this FSC. Additionally, we use an abstraction oracle, see below.

%\sj{Can we add one sentence how this compares to the max entropy approach here?}

The inner stage describes the internals of the teacher that determines the `best' FSC within the design space. 
The teacher may naively use enumeration, but can also be realised using  branch-and-bound~\citep{grzes2013isomorph} or mixed-integer linear programming (MILP)~\citep{DBLP:conf/aaai/AmatoBZ10,kumar2015history}. 
We realise the teacher by (another) inductive synthesis loop.
We search for an FSC by symbolically representing the design space as a propositional logic formula.
The policy evaluation analyses a fixed policy w.r.t.\ the given specification (e.g., a reward function and a threshold). 
If the policy refutes the specification, the evaluation engine indicates the distance to satisfaction (e.g., the achieved value) as well as conflicts---critical parts of the FSC that suffice to violate the specification---that are used to prune the design space~\citep{DBLP:journals/fac/0002HJK21}. 

Both learning stages have access to an additional oracle that, inspired by~\citet{andriushchenko2021inductive}, \emph{over-approximates} the design space. 
This larger abstract design space can efficiently be analysed as the underlying problem solved by the abstraction oracle resembles the analysis of fully observable policies. 
The oracle yields constraints on what the best FSC within the original design space can possibly achieve.
This information is an essential ingredient to guide the search in both stages.

% The memory analyzer takes the POMDP and generates a search space for FSC candidates and passes this to a classical inductive synthesis loop inspired by \citep{andriushchenko2021inductive},  in which the learner selects an FSC and passes this to the teacher. The teacher checks whether that FSC satisfies all constraints and the search terminates if all constraints are satisfied. Otherwise, the teacher rejects the FSC and returns diagnostic information that the synthesiser can use to focus its search. We additionally consider an alternative teacher that answers queries about fully observable variants of the POMDP. Once the learner gives up, the memory analyser may improve its suggested search space. The efficacy of this framework critically relies on providing and utilising the diagnostic information.
 

The separate policy evaluation---a natural component in an inductive synthesis framework---brings some advantages.
The policy evaluation (i.e., solving systems of linear equations) via dedicated algorithms is faster than letting an (MI)LP solver solve these equations~\citep{DBLP:conf/atva/DehnertJWAK14}. 
This improves upon the performance of MILP-based approaches (either primal \citep{winterer2020strengthening} or dual \citep{kumar2015history}) for FSC synthesis.
Furthermore, as the policy is fixed, our framework provides an elegant alternative to existing approaches for constrained POMDPs~\citep{poupart2015approximate,khonji2019approximability} and multi-objective POMDPs~\citep{soh2011evolving,roijers2013survey,wray2015multi}.
It additionally paves the way to learn robust FSCs for POMDPs with imprecise probabilities, similar to~\citep{DBLP:conf/aaai/Cubuktepe0JMST21}. 

We instantiate our framework to learn \emph{deterministic} FSCs, i.e., FSCs that do not use randomisation.
Finding optimal deterministic FSCs is NP-complete whereas finding optimal randomised FSCs is ETR-complete\footnote{The class ETR lies between NP and PSPACE.}~\citep{junges2018finite,DBLP:journals/jcss/JungesK0W21}. 
Algorithmically, finding randomised FSCs requires solving non-convex optimisation problems with thousands of variables.
This often limits the guarantees on global (almost-)optimality that are practically feasible~\citep{kumar2015history}. 
Deterministic FSCs are additionally beneficial in terms of reproducibility of their behaviour, which is useful for debugging.
We use an evaluation framework that supports indefinite-horizon queries, e.g., queries with a discount factor one. These queries generalise infinite-horizon properties with a discount factor ${<}1$ and finite-horizon settings as used in Goal-POMDPs~\citep{DBLP:conf/ijcai/BonetG09,DBLP:conf/aips/KolobovMWG11}.
These queries naturally occur when using  temporal logic specifications and are particularly adequate for safety-critical aspects.


%\mc{I didn't find a suitable reference. I am saying somewhere later that our approach could be extended to randomised FSCs -- recall the branch of PAYNT supporting parameters in transition parameters} 




% The monolithic MILP optimisation (both the structure of the controller and the variables representing its quality has to optimised) is replaced by the oracle-guided exploration loop that separates the search for controller and its evaluation. This gives us a significantly better control over the exploration process. 

% \sj{Work in progress}
% In particular, we use two orthogonal oracles: \emph{abstraction refinement}) that reasons about a single controller by considering an aggregation of the controllers and \emph{counterexample analysis} that infer statements about a set of controllers by examining a single controller.


% In this paper, we propose a novel inductive approach for the synthesis of FCSs. As depicted in Fig~\ref{},  the synthesis loop includes two interleaving steps: i) \emph{inductive synthesis} explores the given family of FCSs and search for optimum solution with respect to the specification and ii) \emph{memory injection} incrementally adds memory to selected observation classes and thus enlarges the family of candidate controllers. The key novelty of our approach is the oracle-guided exploration strategy inspired by recent advances in the inductive synthesis of probabilistic programs~\citep{andriushchenko2021inductive} and the memory injection strategy levering topology of the sub-optimal controllers explored in the previous iteration. 


The experimental evaluation shows the applicability of our approach on a wide range of benchmarks with promising results. 
Particularly, it significantly outperforms approaches based on MILP optimisation. 
We further compare it with the state-of-the-art belief-based approaches, namely, with recent works in formal verification on under-approximation for indefinite-horizon specifications~\citep{norman2017verification,bork2022under}. 
Our inductive synthesis approach is highly competitive and for several POMDPs (having a moderate number of observations/actions and large/infinite belief-space), it is able to find small FSCs improving lower bounds of existing solutions.

%\paragraph{Contributions}
%Our contribution can be summarised as follows: 
%\begin{itemize}
%\item We design a memory injection strategy that incrementally enlarges the design-space of the FCSs by adding memory based on the topological properties of the controllers found in the previous iterations.  
%\item We frame the search for optimal FSC as an inductive synthesis problem and tailor the existing inductive synthesis algorithms to the structural characteristics of FSCs. 
%\item Using the experimental evaluation, we demonstrate the key benefits of the proposed approach: capability of finding small strategies for complex POMDPs, handling complex specification including infinite-horizon properties and additional constraints, and simple integration with alternative approaches.
%\end{itemize}





%\paragraph{Motivate the FCS synthesis against belief-based approaches}  
%\begin{itemize}
%    \item interpretable and executable polices
%    \item possible synergy between these approaches (bounds can be used in both the directions)
%    \item can handle more complicated specifications including multiple properties  
%\end{itemize}

%\paragraph{Deterministic vs randomised FCSs}
%\begin{itemize}
%  \item principally our approach can handle stochastic FCSs (recall that we have a branch of PAYNT that can handle moth discrete and continuous wholes)
%  \item in this paper, we will focus on deterministic FCSs -- add the motivation 
%\end{itemize}

%\paragraph{Our approach vs MILP formulation}
%\begin{itemize}
%  \item Try to argue (with the experimental support) that our approach combining efficient DTMC/MDP analysis with abstraction refinement and inductive reasoning has a better potential to scale to complex POMDPs comparing to methods based on a monolithic MILP optimisation (it optimises both the decision of the FCS and the variables representing the solution of the underlying MDP).
%\end{itemize}

%\paragraph{Adding memory}
%\begin{itemize}
%    \item Compare our memory-injection strategy with the existing history-based FCSs and entropy-based node splitting strategy
%\end{itemize}

%\mc{I think we can skip this Consider to add this reference~\citep{warnquist2013exploiting}:  using a notation of fully observable states that is probably loosely connected to inconsistent observation that are resolved by adding memory} 


\section{Problem Statement}
A \emph{(discrete) distribution} over a finite set $X$ is a~function $\mu \colon X \rightarrow \unitinterval$~s.t.~$\sum_{x \in X} \mu(x) = 1$. The set $\mathit{Distr}(X)$ contains all distributions over $X$. 
%% The~\emph{support} of $\mu \in Distr(X)$ is $\supp(\mu) = \{ x \in X \mid \mu(x) > 0\}$. \jpk{this notion is not used in the paper}

%\begin{definition}
  A \emph{Markov decision process (MDP)} is a tuple $\mdp=\mdpT$ with a finite set $S$ of \emph{states}, an initial state $\sinit \in S$, a finite set $\Act$ of \emph{actions}, and a \emph{transition probability function} $\mpm(s' \given s,a)$ that gives the probability of evolving to $s'$ after taking action $a$ in $s$.
 A \emph{Markov chain} (MC) is an MDP with $|\Act| = 1$; its transition function is written as $\mpm(s' \given s)$.
%\end{definition}
MDPs can additionally be equipped with a reward function $r(s,a)$. We do not use discount factors, see the paragraph on specifications below.

%A \emph{deterministic policy} $\policy$ for MDP $M$ is a function $\policy\colon \pathset^M_{fin} \rightarrow \Act$. 
%such that $\policy(\path) \in \Act(\last{\path})$ for all $\path \in \pathset^M_{fin}$.
%$\policy$ is \emph{memoryless} if $\last{\path} = \last{\path'}$ implies $\policy(\path) = \policy(\path')$ for all $\path,\path' \in \pathset^M_{fin} $.

  %We call a pair $(s,a)\in S\times\Act$ of states and actions a \emph{transition}. 
  %For $s\in S$, $\Act(s)= \{ a\in\Act \mid \exists s'\in S: \mpm(s' \given s, a) > 0   \}$ denotes the \emph{available actions}.
%  \sj{Cant we just assume all actions are available? It is more common in this community} \mc{I assume we can}

%Deterministic observation functions are well-suited for distributed systems where there is no information about a remote environment. It is general in the sense that the widespread stochastic observation functions $\obsFun \colon S \to Distr(\Obs)$ can be reduced to deterministic observations via a (polynomial) reduction~\citep{ChatterjeeCGK16} that puts the uncertainty in the transition relation.\textcolor{orange}{JPK: Shorten this to: 
%\begin{definition}
A \emph{Partially Observable MDP (POMDP)} $\pomdp = \pomdpT$ extends MDP $\mdp$ with a finite set $\Obs$ of \emph{observations}, and a (deterministic) \emph{observation function}\footnote{Observation functions resulting in a distribution over observations can be encoded by deterministic observation functions at the expense of a polynomial blow-up~\citep{ChatterjeeCGK16}.}
$\obsFun$ that returns for every state $s$ an observation $\obsFun(s) = \obs \in \Obs$. An observation $\obs \in \Obs$ is said to be \emph{trivial} if there is only one state $s\in S$ with $\obsFun(s) = \obs$.
%\end{definition}

\textbf{Finite State Controllers (FSCs)}
are automata that compactly represent policies. We call their states (memory) nodes to distinguish them from POMDP states. We also refer to an FSC with $k$ nodes as having $k$ memory.
FSCs in the literature come in various styles, in particular either as Moore machines, with the output---the action it selects---determined by the node, or as Mealy machines, with the output determined by the taken transition~\citep{DBLP:conf/aaai/AmatoBZ10}.
%\footnote{Other variations include whether the memory update depends on the action, \mc{finish this}}.
In the context of sketching FSCs and their inductive exploration, it is convenient to describe FSCs as Mealy machines. 
Furthermore, we restrict ourselves to deterministic FSCs. 

%however, the proposed methodology could be extended to support randomised decisions.  
%\textcolor{orange}{I would mention this in the conclusions; not here as it raises the question why did we not do that?}

%\begin{definition}
Formally, a \emph{finite-state controller} (FSC) for a POMDP $\pomdp$ is a tuple $\fsc = \fscT$, where $N$ is a finite set of \emph{nodes}, $\node_0 \in N$ is the \emph{initial node}, $\gamma(\node,\obs)$ determines the action when the agent is in node $\node$ and observes $\obs$, while $\delta$ updates the memory node to $\delta(\node,\obs)$, when being in $\node$ and observing $\obs$. 
For $|\nodes|=k$, we call an FSC a $k$-FSC. 
%The set $\fcsset^{\pomdp}_k$ denotes the set of $k$-FSCs.
%\end{definition}






\begin{figure}
    \centering
    \includegraphics[width=0.24\textwidth]{figs/maze_2.pdf} 
    \includegraphics[width=0.2\textwidth]{figs/fsc+imc.pdf}
    \caption{A simple maze problem (left), a part of a 2-FSC (right, top) and a part of the induced MC (right, bottom).}
    \label{fig:maze_1}

\end{figure}


%\begin{definition}
Imposing $k$-FSC $\fsc$ onto POMDP $\mathcal{M}$ yields the \emph{induced} Markov chain $\imc = (S^{\fsc} , (s_0,n_0), \mpm^{\fsc})$ with ${S^{\fsc} = S \times N}$ and using\footnote{Iverson brackets: $\iverson{x} = 1$ if predicate $x$ is true, $0$ otherwise.} $z = \obsFun(s)$:
\[\mpm^{\fsc}((s',n') \given (s,n)) = \mpm (s' \given s, \gamma(n,z)) \cdot \iverson{\delta(n,z) = n'}.\]
%\end{definition}

\begin{example}
\label{ex:maze}
As a running example, we use a simple variant of the maze problem~\citep{hauskrecht1997incremental}, where an agent tries to reach the state $s_T$, modelled by the POMDP $\pomdp$ with $S=\{s_0, \ldots, s_9, s_T\}$, $\Act=\{u,d,l,r\}$, and $\Obs=\{z_0,\ldots,z_5\}$. The initial state is given by a uniform distribution over~$S$. Fig.~\ref{fig:maze_1} (left) depicts $P$ and $\obsFun$ where state $s_x$ is labelled by $x/y$ with $x$ the state index and $y$ its observation, i.e., $O(s_x) = z_y$. The arrow direction from $x/y$ to $x'/y'$ represents the action; e.g., $\rightarrow$ corresponds to action $r$. The maze is slippery. An action is successful with probability $0.9$; with probability $0.1$, the agent does not move. Actions without effect are omitted from the figure. Fig.~\ref{fig:maze_1} (right, top) illustrates a fragment of a 2-FSC where $\gamma(n_0,z_0) = \gamma(n_0,z_1) = r$ (for memory node $n_0$ and observations $z_0$ and $z_1$, action $r$ is chosen), $\gamma(n_0,z_3) = \gamma(n_1,z_1) = l$, $\delta(n_0,z_0) = \delta(n_0,z_1) = n_0$ (memory node $n_0$ is not changed for $z_0$ and $z_1$) and $\delta(n_0,z_3) = \delta(n_1,z_1) = n_1$. This FSC tries to resolve the inconsistency (formalised later) in the observation $z_1$, i.e., in $s_1$ the action $r$ is optimal but in $s_3$ action $l$ is optimal (w.r.t.\ reaching $s_T$). Fig.~\ref{fig:maze_1} (right, bottom) illustrates a fragment of the induced MC containing two copies of $s_2$.
\end{example}
%%


%\jp{add initial POMDP and FSC states to Fig.~\ref{fig:maze_1}}
%\mc{the initial state of the POMDP and induce MC is given as a ditribution -- see Ex.1. Added for the FSC}

\textbf{Specifications} 
contain two parts: a set of \emph{constraints} given by quantitative properties and a single optimisation \emph{objective}.
%%
Constraints are defined as indefinite-horizon reachability and expected reward properties, but our approach also supports more general probabilistic temporal logic properties~\citep{BK08}.\footnote{These properties can describe the setting of goal-POMDPs, finite horizon reachability and rewards, and discounted rewards.}
%% Formally, the constraints are defined on the induced MC of the given POMDP and FSC. 
Let $T\subseteq S$ be a target set, let $\lambda_1 \in \unitinterval$ and $\lambda_2 \in \mathbb{R}^+$ be \emph{thresholds}, %\textcolor{orange}{JPK: or $\infty$?}, 
and let $\bowtie \, \in \{ \leq, \geq\}$. 
The POMDP $\pomdp$ under FSC $\fsc$ \emph{satisfies} the constraint $\prob_{\bowtie \lambda_1}$ if the probability $\Pr^{\fsc}$ of reaching $T$ in the induced MC $\imc$ meets $\bowtie \lambda_1$. 
Similarly, the constraint $\rew_{\bowtie \lambda_2}$ is satisfied if the expected reward $R^{\fsc}$ accumulated in MC $\imc$ until reaching $T$ meets $\bowtie \lambda_2$. We call an FSC $\fsc$ \emph{admissible} (for $\pomdp$), if $\pomdp$ under $\fsc$ satisfies the given (set of) constraint(s).
%%
Objectives either minimise or maximise reachability probabilities (as in goal-POMDPs) or (un)discounted expected reward properties, denoted as $\prob_*$ and $\rew_*$ respectively for $* \in \{\min,\max\}$. The probability or reward obtained by FSC $F$ on $\pomdp$ is called the \emph{value} of $F$.
For conciseness, we assume throughout the paper that the specification contains a maximisation objective. Minimisation is analogously supported (but may require flipping bounds and inequalities). 


\paragraph{Problem statement.}
%\sj{We can also move this back to the approach}
We aim to construct an algorithm that: 
i) quickly finds a (small) admissible FSC $F$ and ii) incrementally improves $F$ w.r.t.\ the optimisation objective. 
We can view the algorithm as solving a sequence of decision problems, where the first decision problem is to find some admissible FSC $F_0$ and decision problem $i{+}1$ is to find an admissible FSC $F_{i+1}$ whose value improves upon the value of the previous FSC $F_i$.


%and respects the optimisation objective. 
%We aim at an incremental search strategy that iteratively adds memory to candidate FSCs so as to find an admissible FSC and to improve the value of the optimisation objective that is achieved. 
%\sj{Two point problem statement}


%\textcolor{orange}{as this is part of the specification, it needs to be satisfied, right?} that is part of the specification and prefer to find small FSCs.
%\sj{Maybe we should make this slightly more prominent/formal} \textcolor{orange}{JPK: I agree; I prefer a formal problem statement.}

%\sj{The optimization objective makes this all a bit more vague in terms of guarantees. Shall we formulate specs without optimization problem and then explain at the end of the problem statement that we can extend this into an optimization problem blabla}
 

% In this section, we describe the key ideas underlying the proposed approach for synthesis of FSCs. For a given POMDP $\pomdp$, our ultimate goal is to find the smallest FSC that ensures the specification given by a set of constrains and an optimisation objective. 
% Since this goal is in general intractable, we relax it and and want to find a small FSC that satisfies the constraints and achieves a value that is close to the optimum. 


 
%\paragraph{Paths and Policies}
%\sj{Move these defs to where we actually need this. It is rather notation heavy and nonstandard for UAI readers}
%A  \emph{path} $\path=s_0\xrightarrow{a_0} s_1\xrightarrow{a_1}\cdots$ of an MDP $M$ is an (in)finite sequence of states and actions where
%$\mpm(s_i,a_i,s_{i+1}) > 0$ for all $i\in \mathbb{N}$. 
%For finite $\path$,  $\last{\path}$ denotes the last state of $\path$. 
%The set of all (in) finite paths of $M$ is $\pathset^M_{fin}$ ($\pathset^M$).

%A \emph{deterministic policy} $\policy$ for MDP $M$ is a function $\policy\colon \pathset^M_{fin} \rightarrow \Act$. 
%such that $\policy(\path) \in \Act(\last{\path})$ for all $\path \in \pathset^M_{fin}$.
%$\policy$ is \emph{memoryless} if $\last{\path} = \last{\path'}$ implies $\policy(\path) = \policy(\path')$ for all $\path,\path' \in \pathset^M_{fin} $.
%We may lift the observation function to paths and define \emph{observation sequence} $\obsFun(\pi) = \obsFun(s_0)\xrightarrow{a_0} \obsFun(s_1)\xrightarrow{a_1}\cdots$. An \emph{observation-based strategy} for POMDP $\pomdp$ is a strategy for the underlying MDP $M$ such that $\pi(\pi) = \pi(\pi')$ for all $\pi,\pi' \in \pathset^M_{fin}$ with $\obsFun(\pi) = \obsFun(\pi') $.  \sj{Where do we use this?} \mc{So far we don't explicitly use it.} 


% \section{Overall Approach}
% We phrase learning admissible FSCs as a search problem. Let us first give a bird's eye view.
% In an outer loop, we decide \emph{where} to search. In particular, that means that we select the subset of FSCs that we consider. 
% We implement an optimistic approach that starts searching within a restrictive class of FSCs and extends this class based on feedback we get from the search itself. The class of FSCs is represented as a family of FSCs. 
% In the inner loop, we realize the search itself. We use an oracle-guided search in which we analyze either abstract families of FSCs or where we analyze an individual FSC to learn facts about more than just that FSC. 

% We now first consider the inner loop. 

% \sj{Parked some text}
% \color{orange}
% The diagnostic information that the teacher provides can be of either of two types:
% A \emph{conflict} which determines that a substructure of the FSC cannot 

% Safe bounds on the quality of sets of FCSs and decisions that are crucial with respect to the observability in the POMPD. They are used prune the search space and drive the search towards the optimal controller. They also allow us to intelligently restrict the search space by sacrificing the completeness of the search and focusing only FCSs that are topologically close to the optimal controller in the fully observable POMDP. Finally, the information are essential for finding observation where additionally memory can improve the controllers.  
% \color{black}

% \sj{Is there something novel in the inner loop? This reads as if we use the exact same techniques.}\mc{I was trying to emphasise that we tailor the existing techniques to this problem -- see below. Note that we are moving to significantly larger families and we can somehow handle them thanks to the changes we made that leverage the topology of the families -- I agree this needs to be stated more explicitly} 
% We tailor the existing approaches for inductive synthesis of probabilistic programs to explore the family of FSCs. In particular, we use a MDP abstraction of family of induced Markov chains that provides safe bounds on the given quantitative properties~\citep{ceska2019shepherding}. We leverage the specific topology of the family and design several variants of the MDP refinement strategies that leads to different exploration strategies ranging from a complete exploration to an aggressive pruning of the family potentially discarding  some optimal solutions. Additionally, we prune the family using a generalisation of counterexamples constructed for candidate FSCs violating the specification~\citep{andriushchenko2021inductive}. We propose several generalisation methods affecting the pruning level. 


% Based on the inner loop, we design the outer loop as follows:
% We avoid the construction of full FSCs where memory is added uniformly to all observations~\citep{junges2018finite} and rather incrementally add memory nodes to the selected observations.
% %memory to the observation with the highest potential to improve the quality of the solution. 
% %In every iteration of the loop, we consider a family of FSCs given by the memory model that determines for every FSC the number of nodes and size of the action memory mappings.  
% The inductive exploration of the family in the inner loop provides  important diagnostic information about the topology of the optimal FSC found within the family. This information is used by the memory injection strategy 
% to construct a larger family by adding memory to the most promising observation. The strategy compares  the fully observable controller and the  optimal FSC obtained by removing the decisions that are inconsistent with respect the observation classes. It estimates  how the optimal FSC can be improved by resolving the particular inconsistent decisions by adding memory. 
% %How is not important here.
% %The estimation is weighted by the significance of the decision (i.e. how often the decision is taken).

% \sj{Parked some more text}
% \color{orange}
% The separation and the diagnostic information allow us to easily examine the topology of the selected candidate controllers that is essential for adding memory. 
% We propose a novel memory injection strategy that compares the decisions made by the running candidate and by the optimal controller for the fully observable POMDP. 
% It estimates how the different decisions in fully observable POMDP can improve the candidate controller and selects the most influential  decision. The memory is thus added to the observation where this decision is made.
% \sj{Do we have an intuitive example for this?} \mc{We can either refer to and example that will be used in section describing the memory injection or somehow describe e.g. the Hallway problem}.
% Comparing to the entropy driven strategy~\citep{kumar2015history}, we leverage more precise information\sj{This is vague} \mc{is it better} including the decisions of the fully observable controller that help to better estimate the impact of adding memory to particular observations.




% Finally, our exploration strategy allows us to integrate additional knowledge about the candidate FCSs. Imagine another oracle (e.g. analysis of the belief-state MDP) that gives us lower bounds on the optimal state values~\citep{bork2020verification}. These bounds can be used to prune the design space and speedup the exploration. In this way, we can use our approach to search for the smallest FSC implementing the belief-based control strategy.
% \sj{Are we doing this? Otherwise, probably leave this out of the intro} \mc{So far we can manually add a single bound on the optimal solution, but we can easily automate this. I think we will not have time/space to explain/demonstrate this.}  
% \color{black}


\section{Inductive exploration of FSCs}
\vspace{-0.5em}
\label{sec:innerloop}
This section presents the inner loop (see Fig.~\ref{fig:overview}) in which we search among a given set of $k$-FSCs. Before we describe the ingredients, we formalise the representation of the set of $k$-FSCs. We then outline the two oracles that our search can use to prune the search space. A \emph{hybrid strategy}~\citep{andriushchenko2021inductive} combines the two oracles by switching based on perceived performance while communication between the oracles takes place.
\vspace{-0.5em}
\subsection{Families of FSCs}
\label{sec:red_fsc}

A POMDP and a single FSC yield a single induced MC. A POMDP and a set of FSCs thus induce a set of MCs. The set of FSCs has additional structure which enables concisely describing the set of MCs. We first consider \emph{full} FSCs where for each observation the same amount of memory is used and where there are no restrictions on the memory updates. We generalise this to a class of \emph{reduced} FSCs that are more memory efficient.

%\sj{FYI: I added a discussion regarding memory nodes vs memory vs states in Sec 2} great

%The precise structure of the family depends on the set of FSCs. 

%\begin{definition}
%A \emph{family} of full $k$-FSCs is a tuple $\ffsc_k = \ffscT$, where $N$ is the set of $k$ nodes, $n_0$ is the initial node, $K_{\Act} = N\times \Obs$ and $K_{N} = N\times \Obs $ are finite sets of parameters with domains $V_{(n,z)}^{\Act}  \subseteq \Act$ for each $(n,z) \in K_{\Act}$ and $V_{(n,z)}^{N}  \subseteq N$ for each $(n,z) \in K_{N}$. 
%\end{definition}


\begin{definition}
A \emph{family} of full $k$-FSCs is a tuple $\ffsc_k = (N, n_0, K)$, 
%\textcolor{orange}{JPK: it seems $K_{Act}$ and $K_N$ became superfluous} \mc{I merged then as it simplifies the notation and we do not need to distinguish them in the paper.}
where $N$ is a set consisting of $k$ nodes, $n_0 \in N$ is the initial node, $K = N\times \Obs$ is a finite set of parameters each with domain $V_{(n,\obs)} \subseteq \Act\times N$.
\end{definition}

From a family, one may obtain a $k$-FSC by choosing values for each parameter, effectively determining the action $\gamma(n,\obs)$ and the next node $\delta(n,\obs)$. 
Thus, each family describes a set of FSCs by varying the substitutions of the parameters. We often use $\ffsc_k$ to denote such a set of $k$-FSCs. 
We remark that this set contains $\mathcal{O}((|\Act||N|)^{(|N||\Obs|)})$ many FSCs. 
%i.e. $\ffsc_k \subseteq \fcsset^{\pomdp}_k$. 
A POMDP $\pomdp$ and a family $\ffsc_k$ naturally induce the family of MCs $\pomdp^{\ffsc_k} = \{\imc \mid \fsc \in \ffsc_k  \}$.

%\begin{example}
%The family $\ffsc_2$ describing all 2-FSCs for the maze problem from Example 1 is given by $N = \{n_0,n_1\} $, $K_{\Act} = K_{N} = \{ (n_i,z_j) \mid i \in \{0,1\} \wedge  j \in \{u,d,l,r\} \}$, $V_{(n,z)}^{\Act} = \{u,d,l,r\}$ for all  $(n,z) \in K_{\Act}$ and $V_{(n,z)}^{N} = \{n_0,n_1\}$ for all  $(n,z) \in K_{N}$
%\end{example}

\begin{example}
The family $\ffsc_2$ of all 2-FSCs for our maze problem is given by $N = \{n_0,n_1\} $, $K = \{ (n_i,z_j) \mid i \in \{0,1\} \wedge  j \in \{0,\ldots ,5\} \}$, and $V_{(n,z)} = \{u,d,l,r\} \times \{n_0,n_1\}$ for all  $(n,z) \in K$.
\end{example}


%\mc{I think we don't need this}
%A \emph{realization} of a~family $\ffsc_k$ are two functions $r_{\Act}: K_{\Act} \rightarrow \Act$ and 
%$r_{N}: K_{N} \rightarrow N$ such that $r_{\Act}(k) \in V_k$, for %all $k \in K_{\Act}$ and $r_{N}(k) \in V_k$, for all $k \in K_{N}$. Realization~$(r_{\Act},r_N)$ naturally induces FSC $\fsc_{r_{\Act}}^{r_N}$, where $\forall n\in N, z\in \obs: \gamma(n,z) = r_{\Act}((n,z))$ and $\delta(n,z) = r_{N}((n,z))$. The set of all FSCs of $\ffsc_k$ is denoted as $\mathcal{R}^{\ffsc_k}$. 

While FSCs have $k$ available memory nodes in conjunction with every observation, memory may only be required in some observations (see, e.g., the running example). Therefore, we consider reduced FSCs given by a \emph{memory restriction} $\mu: \Obs \rightarrow \mathbb{N}$, where $\mu(\obs)$ determines the number of memory nodes used in the observation $\obs$. 

\begin{definition}
 A \emph{reduced family} $\ffsc_{\mu}$ given by the memory restriction $\mu$ is a sub-family of $\ffsc_k$ for $k = \max_{\obs \in\Obs}\{\mu(\obs)\}$ where $(n,z)\in K$ implies $n\leq \mu(z)$, and the domains $V_{(n,z)}$ are as in $\ffsc_{k}$. If a memory update $\delta(n,z)=n'$ is invalid in the resulting observation $z'$ (i.e., $n'>\mu(z')$), then  $\delta(n,z)=n_0$. 
\end{definition}

%\sj{The set of FSCs that are represented by this family is only defined via a footnote... not sure how to fix this.}

The reduced family for $k$-FSCs yields a significantly reduced number of parameters $\sum_{\obs \in\Obs}\{\mu(\obs)\}$. This is less than $k \cdot |\Obs|$ if $\mu(\obs) < k$ for most observations $\obs$. Indeed, in many experiments, we can use $\mu(\obs) = 1$ for all but a few observations. % $\sum_{\obs \in\Obs}\{\mu(\obs)\}$ parameters, but in many benchmarks, memory is required only in a few observations dropping the number of parameters from $\mathcal{O}(|N|{\cdot}|\Obs|)$ to $\mathcal{O}(|N|{+}|\Obs|)$.}
%\sj{I found the big-O notation here more confusing but if you want we can add it again}
Such reduction has several key benefits: Foremost, the family of reduced FSCs induces a smaller design space, but it also yields FSCs where less memory is needed, which can be beneficial for their interpretability.
%(the number of parameters is given by the size of the mappings).

\vspace{-0.5em}
\subsection{MDP-abstraction teacher}
\vspace{-0.5em}
\label{sec:abst}

%\begin{definition}
%An MDP abstraction of $\ifmck$ is MDP $\overline{\ifmck} = (S\times N,(s_0,n_0), \Act^{\ffsc}, \mpm^{\ffsc})$ where 
%$\Act^{\ffsc} = \Act \times N$ and $\mpm^{\ffsc}((s,n),(a,n'),(s',n')) = \mpm(s,a,s') $ if $a\in V_{(n,\obsFun(s))}^{\Act}$ for the parameter  $(n,\obsFun(s)) \in K_{\Act}$ and $n'\in V_{(n,\obsFun(s))}^{N}$ for the parameter  $(n,\obsFun(s)) \in K_{N}$, 0 otherwise
%\end{definition}
We introduce a teacher which, for a POMDP and a FSC-family, provides safe upper and lower bounds on the value of the FSCs in this family. 
Instead of considering the individual FSCs, the oracle considers an abstraction (represented as a single MDP) of the set of induced MCs. 

\begin{definition}
MDP $\abst{\ffsc} = (S{\times}N,(s_0,n_0), \Act^{\ffsc}, \mpm^{\ffsc})$ is an \emph{abstraction} of MC family $\ifmc$ with $\Act^{\ffsc} = \Act \times N$ and $\mpm^{\ffsc}((s',n') \given (s,n),(a,n')) = \mpm(s' \given s,a) $ if $(a,n')\in V_{(n,\obsFun(s))}$, and 0 otherwise.
\end{definition}

For MDPs and our specifications, it suffices to consider deterministic memoryless policies, i.e., policy $\policy$ for MDP~$\abst{\ffsc}$ is a function $\policy\colon S{\times}N \rightarrow \Act^{\ffsc}$.
%\footnote{This should not be confused with FSC policies for POMDPs.}
It is \emph{consistent} (w.r.t.\ the observations)  if ${\obsFun(s) = \obsFun(s')}$ implies $\pi((s,n)) = \pi((s',n))$ for all ${s,s' \in S}, n\in N$. 
The set of consistent policies in $\abst{\ffsc}$ corresponds to the policies for the family $\ffsc$.
The policy $\pi$ is \emph{inconsistent} in a FSC-family parameter $(n,z) \in K$ if $\exists {s,s'\in S}: \obsFun(s) = \obsFun(s') = z \wedge \pi((s,n)) \neq \pi((s',n))$. 
It is inconsistent in observation $\obs \in \Obs$, if it is inconsistent in the parameter $(n,z) \in K$ for some $n \in N$.  

%\sj{Right now observations are inconsistent in a policy, but I think it s more natural to flip this to a policy being inconsistent in an observation} Done

%Assume that $\exists {s_1,s_2\in S}$, $n\in N$ such that  $\obsFun(s) = \obsFun(s') = z \wedge \pi((s_1,n)) = (a_1,n_1) \  \wedge \ \pi((s_2,n)) = (a_2,n_2)$. We say that parameter $(n,z) \in K_{\Act}$ is inconsistent in $\pi$ if $a_1 \neq a_2$.
%Similarly, $(n,z) \in K_{N}$ is inconsistent if $n_1 \neq n_2$.
%Finally, observation $\obs \in \Obs$  is inconsistent if there is an inconsistent parameter $(n,z) \in K_{\Act} \cup K_{N}$ for some $n \in N$. Recall that the set of consistent policys in $\overline{\ifmck}$ matches the family of FSCs~$\ffsc_k$.  

\begin{example}
Assume we want to maximise the probability to reach $s_T$. 
The stars in Fig.~\ref{fig:maze_1} represent the optimal policy~$\pi^{*}$ in MDP $\abst{\ffsc}$ where $\ffsc$ is the set of all \mbox{1-FSCs} for the maze problem. $\pi^*$ is inconsistent in the observations $z_1$ and $z_4$.  
\end{example} 

The analysis of MDP $\abst{\ffsc}$ provides useful information about the family $\ffsc$. For conciseness, consider the constraint $\prob_{\geq \lambda}$ bounding the reachability probability to states in $T$.
%(Reasoning for constraints of the form $\prob_{\leq \lambda}$ is dual and expected reward constraints are handled the same).
We can compute the policy $\pi^{*}$ in $\abst{\ffsc}$ that maximises this reachability probability. 
In particular, this policy achieves probability $\Pr^{\pi^{*}}$. 
If $\Pr^{\pi^{*}} < \lambda$, then all $F \in \ffsc$ violate the constraint $\prob_{\geq \lambda}$ and $\ffsc$ can be safely discarded. 
Otherwise, we check the consistency of policy $\pi^{*}$.  
If $\pi^{*}$ is consistent, it represents an FSC satisfying $\prob_{\geq \lambda}$.
Similarly, a minimising policy may witness that the entire family $\ffsc$ satisfies $\prob_{\geq \lambda}$.
If analysing $\abst{\ffsc}$ is inconclusive, we refine $\ffsc$ by splitting, see below.

The optimisation objective is handled by iteratively updating a new (initially trivial) constraint representing the running value of the optimum so far. 
Once an admissible policy $\pi$  is found, we update the new constraint according to the objective value that $\pi$ achieves. 
Reasoning about multiple constraints works as follows.
% If, in addition to optimisation objective, the specification involves additional constraints, the reasoning goes as follows.
If the entire family $\ffsc$ violates some constraint, $\ffsc$ is discarded.
If $\ffsc$ satisfies the constraint, this constraint will neither be checked again for $\ffsc$ nor for its subfamilies.
Otherwise, if the analysis of $\ffsc$ was inconclusive with respect to some constraint, $\ffsc$ is refined.
% As soon as a family satisfies all the constraints, the analysis for this family transposes to the case where only optimisation objective is considered.

%If the entire family satisfies all the constraints, we 
%check the consistency of policy for the the optimising constraint. If it is consistent, we update the constraint, store the solution and prune the family. Otherwise we pick a FSC, update the constraint and analyse the family again. 
% Otherwise, we investigate the consistency of the policies found for the constraints with the aim to find an FSC improving the optimum and eventually to prune $\ffsc$. \sj{I dont get this sentence}
% If $\ffsc$ cannot be pruned, it is \emph{refined}.     

Beyond pruning families, analysing $\abst{\ffsc}$ provides state-vectors $ub$ 
and $lb$ such that  $\forall s \in S$, $lb(s)$ and $ub(s)$ bound the probability to reach $T$ from $s$. These bounds are used in the inner and outer synthesis loop, as we will see below.\footnote{Furthermore, the state-vectors $ub$ and $lb$  allow \emph{bootstrapping} the analysis of MDP $\abst{\ffsc_i}$ where $\ffsc_i$ is a subfamily of $\ffsc$: 
This exploits the fact that $\ffsc_i$ shares the structure of $\ffsc$ while some actions for some states are removed.}


\paragraph{General refinement strategy}
The refinement strategy is a key component in driving the exploration of the family $\ffsc$. 
It decomposes $\ffsc$ into sub-families by splitting the domain of selected parameters from $K$. 
In contrast to the general strategy used in program synthesis~\citep{ceska2019shepherding}, we leverage the specific topology of the FSC families. 

The key idea is to examine the inconsistencies of the policy $\pi^*$ obtained for  a given constraint. 
%for a fixed constraint\footnote{We focus on the constraint derived from improving the objective value as it usually is the most restrictive.}. 
Assume $\pi^*$ is a maximising policy inconsistent in parameter $(n,z) \in K$.
% \jp{to what extent is minimising relevant for the remainder of this paragraph, and if so, what about maximising policies?}
We estimate the \emph{significance} of this inconsistency as
% each parameter $(n,z) \in K$ in which $\pi^*$ is inconsistent. In particular, the significance of parameter $(n,z)$ is estimated as
% an impact of changing $(n,z)$ in $\pi^*$ computed as
the average variance (with respect to inconsistent actions) of $ub((s,n))$ with $\obsFun(s) = z$, where $ub((s,n))$ is weighted by the expected number of visits of the state $(s,n)$ in the MC induced by $\pi^*$ (if $\pi^*$ is minimising, we use $lb$). 
%of the decisions corresponding to~$p$. \sj{Does this mean that we compute the expected times we visit $(n, \obs)$ under policy $\pi^*$ and multiply that with...??? I dont understand how we examine the impact of changing $p$}
We refine $\ffsc$ using the most significant inconsistent parameter $(n,z)$.
% We then select the most significant parameter~$(n,z)$.
Assume it has domain $V_{(n,z)}$ and $\pi^*$ selected options $v_1,\dots,v_i$.
% \sj{What if we select more than 2 options?}
We partition $V_{(n,z)}$ into $\{v_1\}, \dots, \{v_i\}$ and  $V_{(n,z)} {\setminus} \{v_1,\dots,v_i\}$, and create $i{+}1$ corresponding subfamilies.
% We partition $V_p$ into $V_p^1 = \{v_i\}$, $V_p^2 = \{v_j\}$ and $V_p^3 = V_p \setminus \{v_i, v_j\}$ and create the three corresponding subfamilies.
This removes the inconsistency of $(n,z)$ by considering the selected options $v_1,\dots,v_i$ within different sub-families.

\paragraph{Incomplete refinement strategy} 
We suggest the following incomplete refinement strategy to focus the search (at the cost of completeness). In particular, we restrict the exploration to FSCs that are structurally close to $\pi^*$ as follows:
we i) fix the options selected by $\pi^*$ in perfectly observable states, ii) fix the options in the consistent parameters, and iii) remove options in the inconsistent parameters that were not selected by $\pi^*$, i.e., the set $V_{(n,z)} {\setminus} \{v_1,\dots,v_i\}$.   

%\mc{if we there is a space, add an example of complete and incomplete refinement strategy}


\subsection{Counterexample-based Teacher}
\label{sec:counterexpruning}

The MDP abstraction employs deductive reasoning: It talks about a set of FSCs at once to deduce conclusions about the individual members of this set. In this subsection, we discuss the orthogonal, inductive, approach. We suggest a (smart) enumeration over individual FSCs inspired by~\cite{DBLP:journals/fac/0002HJK21}. If the FSC is satisfactory, i.e., it is admissible and has good value, this helps the teacher. Otherwise, if the FSC is not satisfactory, we learn facts, called \emph{counterexamples}, that help us to avoid considering other FSCs.

To realise this teacher, we represent the FSCs that have not been pruned as a propositional formula\footnote{For each parameter $k$, there is a corresponding integer variable $x_k$ whose domain $V_k$ corresponds to all possible realisations of $k$. Initially, the design space is encoded as a conjunction $\bigwedge_{k \in K} x_k \in V_k$. Pruning a subfamily corresponds to adding to this formula a negated clause describing all the pruned realisations. Details can be found in~\citep{DBLP:journals/fac/0002HJK21}.}. 
We use the SMT solver CVC5~\citep{cvc5} (over quantifier-free bounded integers) to effectively manipulate the propositional formula and to find FSCs that have not been pruned.
%We prioritise the selection of FSCs that are close $\pi^*$ and thus harder to prune by the MDP abstraction.  \mc{can be removed/shorten}\sj{I dont understand the last sentence. Is it important?}

We assume  a constraint  $\prob_{\geq \lambda}(\lozenge T)$, a family $\ffsc$, the state-vector $ub$ obtained from the maximising policy $\pi^*$ in MDP $\abst{\ffsc}$ as discussed in Sec.~\ref{sec:abst}, and an FSC $\fsc \in \ffsc$. 
%\begin{definition}
%\begin{align*}
%P'(s) =
%\begin{cases}
%P^{\fsc}(s) & \text{if } s\in S', \\
%[s_{\top} \mapsto ub(s),s_{\bot} \mapsto 1{-}ub(s)]  & \text{if } s \in S^{\fsc}\setminus S', \\
%[s \mapsto 1] & \text{if } s \in \{s_{\top} ,s_{\bot} \} \\
%\end{cases}
%\end{align*}
%\textcolor{orange}{JPK: this definition is stuck. Why is $P'$ defined for states that do not belong to the state space of $C$, as in the second clause?}
%that the probability to reach a state in $T\cup \{s_{\top}\}$ in $C$ is $< \lambda$.
%\end{definition}
\begin{definition}
A~\emph{counterexample} (CE) for FSC $\fsc$ and $\prob_{\geq \lambda}$  is a subset $C\subseteq S^{\fsc}$ that induces the sub-MC of $\imc$ given as $\imc_C = (C \cup \mathrm{succ}(C) \cup \{s_{\bot}, s_{\top}\}, (s_0,n_0), P')$ with $P'(s) =$
\begin{align*}
\begin{cases}
P^{\fsc}(s) & \text{if } s \in C, \\
[s_{\top} \mapsto ub(s),s_{\bot} \mapsto 1{-}ub(s)]  & \text{if } s \in \mathrm{succ}(C) \setminus C, \\
[s \mapsto 1] & \text{if } s \in \{s_{\top}, s_{\bot} \}, \\
\end{cases}
\end{align*}
where $\mathrm{succ}(C)$ is the set of direct successors of $C$, and the probability to reach $T\cup \{s_{\top}\}$ in $\imc_C$ is $< \lambda$.  %\sj{What is the probability to reach a target from a set of states? Shouldnt this be from the initial state? or in the subMC (which is not named)}
\end{definition}


 Intuitively, in the sub-MC, states $s$ outside the CE $C$ evolve to $s_{\top}$ with probability $ub(s)$, the maximal probability to reach $T$ from $s$ in the family $\ffsc$ (i.e., the worst-case possible in $\ffsc$).
 They evolve to $s_{\bot}$ with probability $1{-}ub(s)$, the minimal probability to avoid $T$ in $\ffsc$. 
 For $(s,n) \in C$, the parameter $(n,\obsFun(s)) \in K$ is called \emph{relevant}. The CE for the constraint $\prob_{\leq \lambda}$ is defined  similarly using $lb$ rather than $ub$.
 
For each $\fsc' \in \ffsc$ that for each relevant parameter in a CE $C$ uses the same values as $\fsc$, it holds that $P^{\fsc'} < \lambda$. 
Therefore, we can safely remove $\fsc'$ from the design space. 
We say that $C$ \emph{generalises} to the set of all such $\fsc'$. 

Smaller CEs lead to generalisation to larger families of FSCs.
As computing minimal CEs is NP-complete~\citep{funke2020farkas}, we adopt the greedy approach from~\citep{andriushchenko2021inductive}. 
Handling multiple constraints is straightforward as we can compute the CE for each constraint violated by the FSC $\fsc$. This can potentially improve the pruning.

%\paragraph{Incomplete generalisation}
Similarly to the incomplete refinement strategy in Sec.~\ref{sec:abst}, we consider an incomplete generalisation of the CEs. 
In particular, we redefine the notion of relevant parameters. 
The parameter $(n,\obsFun(s))$ for $(s,n) \in C$ is relevant only if the observation $\obsFun(s)$ is inconsistent in $\abst{\ffsc}$ or the option selected by $\fsc$ is different from the options selected by $\pi^*$. 
This leads to more aggressive pruning and restricts the exploration to the FSCs that are topologically close to $\pi^*$.
%\textcolor{orange}{JPK: I would just present the above as a possible improvement; not as incomplete generalisation.}

\begin{example}
Consider a variant of our maze problem with initial state $s_0$, family $\ffsc$ of all 1-FSCs, where the available set of actions in the observation $z_3$ is restricted s.t.\ $V_{(n_0,z_3)} = \{u,d,r\} \times \{n_0\}$. Let FSC $\fsc$ be as in Fig.~\ref{fig:CE}~(left). The right part illustrates the induced MC and the middle part shows the CE $C$ for the constraint $\prob_{\geq 1}$.
Note $P((s_4,n_0),s_{\bot}) = 1$ as $ub(s_4)=0$.
Thus, the relevant parameters are $(n_0,z_i)$ for $i\in \{0,1,2\}$. The generalisation of $C$ enables pruning a significant part of $\ffsc$. 
Under incomplete generalisation, the parameter $(n_0,z_0)$ is not relevant as it is consistent in $\abst{\ffsc}$ and $\fsc$ picks the same option as $\pi^*$.  
\end{example}


\begin{figure}
    \centering
   % \includegraphics[width=0.5\textwidth]{figs/CE2-2.pdf} 
     \includegraphics[width=0.47\textwidth]{figs/CE-3.pdf} 
    \caption{A CE for the given FSC in the maze problem.}
    \vspace{-0.5em}
    \label{fig:CE}
\end{figure}

%\vspace{-0.5em}
\section{Memory injection strategy}
% We phrase learning admissible FSCs as a search problem. Let us first give a bird's eye view.
%% Recall from Fig.~\ref{fig:overview} that the search of FSCs from a fixed (small) set of candidates is embedded in a larger outer stage. 
This section discusses the outer stage of our approach, cf.\ Fig.~\ref{fig:overview}, in which the learner decides \emph{where} to search. 
In particular, a subset of FSCs is selected and passed onto the teacher, as outlined in Sec.~\ref{sec:innerloop}.
% \sj{What is the contract? We give a timeout and get the best? What is the best if there are multiple objectives? etc} \mc{It depends on the exploration strategy: either the teacher performs the complete search and returns the optimum, or it runs the incomplete search. We also implemented an early termination, but we will not use it in the experimental part.  We consider only the optimising constrain (if there is) as it is usually the most restrictive.}
We assume access to an abstraction oracle that yields bounds on the value for every state based on an abstraction scheme outlined in Sec.~\ref{sec:abst}.
The learner processes this information and derives a new design space. 
It does so by combining three ingredients:
\begin{inparaenum}
    \item \emph{Adding memory}: By allowing FSCs to store more information, we (drastically) increase the design space. We allow memory to be increased locally to keep the growth manageable.
    %\footnote{This explains the term \emph{memory injection strategy}.}. 
    \item \emph{Removing symmetries}: Similar to \citep{grzes2013isomorph}, it is unnecessary to include symmetric FSCs in the design space.
    \item \emph{Analysing abstractions}: Use the results from the abstraction to guide the search.
\end{inparaenum}



% probably not necessary[Add an example of a reduce FSC for the maze]

\vspace{-0.5em}
\subsection{Adding Memory}
\label{subsec:memory}

The key idea of the memory injection strategy is to use the diagnostic information obtained from the preceding inner loop exploring the design space represented by a family $\ffsc$ to construct a new family $\ffsc'$, say. 
These families are based on two fixed memory structures (either as a full or reduced FSC). 
On constructing $\ffsc'$, memory can be added that corresponds to one of the observations, see Sec.~\ref{sec:red_fsc}. This section outlines where to add the memory.

To decide how to extend the family $\ffsc$, we use the following information: 
\begin{inparaenum}
    \item The maximising policy  $\pi^*$ in MDP $\abst{\ffsc}$ together with its corresponding bounds $ub$\footnote{
The state space of MDP $\abst{\ffsc}$ includes copies of states where memory has been added.  Without symmetry reduction, an optimal policy (mostly) takes the same action in these copies, thus making the copies redundant. However, in combination with the symmetry reduction, the outgoing transitions of these copies differ and the copies are then no longer redundant.}. 
    \item The FSC $\fsc^*$ in $\ffsc$ obtained from the teacher. 
    %\sj{Do we not use the bounds/performanceof this FSC?} \mc{It was typo -- fixed} 
   That FSC is not available if the FSC is inadmissible or if the teacher is aborted.
\end{inparaenum}
%% For both $F^*$ and $\pi^{*}$, we may additionally obtain the expected number of visits of the states in the MC induced by $F^*$ or $\pi^*$.

If only $\pi^{*}$ is available, the memory injection strategy employs an idea similar to the refinement strategy described in Sec.~\ref{sec:abst}. 
In particular, it evaluates the significance of the inconsistent observations w.r.t.\ $\pi^{*}$
by aggregating the significance of their inconsistent parameters as described above.
By adding the memory to the most significant inconsistent observation, we try to resolve the inconsistency and drive the search towards an FSC that mimics $\pi^*$ as often as possible, i.e., along most paths. 

If additionally $\fsc^*$ is available, the idea is similar: to obtain a better FSC in the next iteration, we identify the observation in which the choices of $\fsc^*$ differ from the observation-free policy $\pi^*$ the most.
That is, the inconsistency measure $\chi(s)$ for state $s$ is now the absolute difference in $ub(s)$ (for the maximising property) w.r.t.\ two actions: the one provided by $\pi^*$ and the one from $\fsc^*$.
The inconsistency measure of the observation $z$ is now the weighted average of $\chi(s)$ across all states $s$ with $\obsFun(s) = z$, where the weight for state $s$ is the expected number of visits of $s$ in the FSC $\fsc^*$.

Note that in both cases the proposed inconsistency measure is just a heuristic and neither guarantees that adding memory to the selected observation will improve the value of the FSC, nor that the memory injection strategy will eventually find the optimal $\fsc^*$. In fact, without additional modifications to the strategy, the heuristic will keep adding memory to a single observation because adding memory does not \emph{resolve} the inconsistencies. To mitigate this problem, it is crucial to employ the symmetry reduction.

% If additionally $\fsc^*$ is available, the injection strategy tries to improve the latter by adding memory to the observations where 
% $\fsc^*$ selects a different action than $\pi^*$. Similarly as before, we evaluate the significance of the observations by evaluating the significance of the corresponding parameters. (This is possible as the expected number of visits of states in the MC induced by $F^*$ or $\pi^*$ can be used).
% We estimate how changing an action in $\fsc^*$ can improve its value by looking at the state values in the upper bound $ub$. We use the expected visits in $\fsc^*$\sj{in the FSC or in the induced MC?} to estimate the impact of these changes.\sj{isnt that the same as two lines above?} Here, we also consider consistent (but non-trivial) observations in $\pi^*$. %Adding memory to such a observation may allow $\fsc^*$ to behave as $\pi^*$ (i.e. more optimally) only in some states with this observation.
% \sj{Does this always converge? Couldnt it happen that we keep adding memory to a single observation?}

\vspace{-0.5em}
\subsection{Symmetry reduction} 
Adding memory to a selected observation typically creates a family $\ffsc$ that includes FSCs having the same value due to certain symmetries in their topology. Removing these symmetries from $\ffsc$ reduces: i) the size of $\ffsc$ and ii) the number of inconsistencies of the policy $\pi^*$ obtained from $\abst{\ffsc}$, see below. As a result of resolving these inconsistencies, the memory injection strategy is capable of better recognising where \emph{else} in the POMDP memory is needed.
The importance of symmetry breaking has been recognised by~\cite{grzes2013isomorph}, who proposed a generation strategy for isomorphism-free Moore automata.
We propose a different approach based on restricting the family of candidate FSCs.
We illustrate where the symmetries are introduced as well as how to deal with them in the example below.
% We first describe where the symmetries are introduced, and then how we avoid this.
%\sj{In the paper that we cite, they consider the symmetry of nodes within an FSC, right? And we consider two FSCs and say that they are symmetric?} \mc{Sorry do not understand.}

\begin{example}
Consider again the maze problem from Example~\ref{ex:maze} and the specification to minimise the expected number of steps to reach $T$.
%% minimisation objective $\mathbb{E}^{\mathrm{steps}}_{\min} \left[ \mathrm{F} \, \{s_T\} \right] $
Note that this includes an implicit constraint to reach $T$.
Let $\ffsc_1$ be the family of all 1-FSCs (no memory was added).
The inner loop detects that there is no admissible 1-FSC satisfying the constraint: in observation $z_1$ we need to be able to pick both actions $r$ and $l$, and similarly for observation $z_4$. Assume that the minimising  policy $\pi^*_1$ in $\abst{\ffsc_1}$ reveals that the most significant inconsistency is the one in observation $z_1$.
Adding memory to $z_1$ has a twofold effect on the resulting design space $\ffsc_2$.  \smallskip\\
First, a new parameter $(n_1,z_1)$ is introduced that encodes action selection in newly created copies of states with observation~$z_1$. Second, each state having successor $s$ with observation~$z_1$ must be able to choose whether to go to $(s,n_0)$ or its copy~$(s,n_1)$. 
In our case, the domains of parameters $(n_0,z_0), (n_0,z_2), (n_0,z_3)$ as well as of parameters $(n_0,z_1)$ and $(n_1,z_1)$ (remember the self-loops) will now be $\{u,r,d,l\} \times \{n_0,n_1\}$. \smallskip\\
Before proceeding with the inner loop, we first remove symmetric assignments from the family $\ffsc_2$. That is, consider FSC $(\gamma,\delta)$ with $\gamma(n_0,z_1) = l$ and $\gamma(n_1,z_1) = r$. Clearly, this FSC achieves the same value as the symmetric FSC $(\gamma',\delta')$ with $\gamma'(n_0,z_1) = r$, $\gamma'(n_1,z_1) = l$ and $\delta'(\cdot,z) = n_1$ if $\delta(\cdot,z) = n_0$ and vice versa, for each predecessor observation $z$ of $z_1$. The family $\ffsc_2'$ should  include only one of the two FSCs, so we modify the domains of parameters $(\cdot,z_1)$ as follows: $V_{(n_0,z_1)} = \{u,d,r\} \times \{n_0,n_1\}$ and $V_{(n_1,z_1)} = \{u,d,l\} \times \{n_0,n_1\}$. \smallskip\\
The inner loop again detects that no admissible solution exists. Minimising policy $\pi^*_2$ now contains a single inconsistency in observation $z_4$, which we amend by adding memory to $z_4$. In the resulting family $\ffsc_3$, we introduce parameter $(n_1,z_4)$ for action/memory selection in the new copy, and modify parameters $(n_0,z_5), (n_0,z_4)$ and $(n_1,z_4)$ to enable the transition to the newly created copies (other successors of observation $z_4$ already account for both possible memory updates). As in the previous case, we break the symmetry in action selection in $(\cdot,z_4)$: $\ V_{(n_0,z_4)} = \{u,r,l\} \times \{n_0,n_1\}$ and $V_{(n_1,z_4)} = \{d,r,l\} \times \{n_0,n_1\}$. The third iteration of the inner loop finally yields an optimal FSC with value $7.1\overline{6}$. No additional memory injection can improve upon this.
\end{example}

More generally, for each observation $z$, we keep a list $\mathcal{I}^z_{Act}$
% of inconsistent actions.
of actions in which parameters $(\cdot,z)$ were inconsistent.
This list is updated each time we update memory for observation~$z$.
% (based on inconsistencies for~$z$).
Upon adding memory to $z$, we apply the symmetry reduction to the corresponding domains of parameters $(n_i,z)$. For simplicity, let $\left| \mathcal{I}^z_{Act} \right| = \mu(z)$, as in Example~5.
In such a case, $V_{(n_i,z)}$ is set to $\left( Act(z) {\setminus} \mathcal{I}^z_{Act} \cup \{\mathcal{I}^z_{Act}[i]\} \ \right) \times N$, where $\mathcal{I}^z_{Act}[i]$ is the $i$-th action in the list $\mathcal{I}^z_{Act}$. This ensures that for each action $a \in \mathcal{I}^z_{Act}$ there is exactly one parameter $(\cdot,z)$ whose domain $V_{(\cdot,z)}$ contains $a$.
If $\left| \mathcal{I}^z_{Act} \right| \neq \mu(z)$, the construction of domains $V_{(\cdot,z)}$ becomes more involved: one must take possible inconsistencies in memory updates into account. For a comprehensive description of the symmetry reduction, we refer to the implementation (see Sec.~\ref{sec:experiments}).
In general, there are two extreme ways one can proceed with the symmetry reduction.
Either one can enable all of the actions/memory updates in $V_{(\cdot,z)}$, which will undermine the observation selection discussed in Sec.~\ref{subsec:memory}.
% render the memory injection\sj{ symmetry reduction?} heuristic discussed in \ref{subsec:memory} irrelevant.\sj{That it memory injection does not work without symmetry reduction is irrelevant at this point?}
The other extreme is to disable choices in $V_{(\cdot,z)}$ arbitrarily. 
This may lead to incompleteness, as the following example shows.




% \sj{After the example: I would still expect a more general rule that says which actions we exclude}
% For each observation, consider the inconsistencies in a list
% Assume there is only one inconsistency per observation, then we modify the family in the exemplified (Example 5). 
% When there are more inconsistencies this is tricky (one must remove multiple actions from a node): 
% There are two extremal options to proceed: dont do symmetry reduction or make an arbitrary choice in removing actions from nodes. This choice becomes even more involved as we must also handle the inconsistencies in memory updates\footnote{We refer to the implementation for the choices}. 
% We remark that making arbitrary choices leads to incompleteness, however  even the simple case from Example 5 is incomplete, as shown by the following example. 
 

% It is worth noting that, in general, policy $\pi^*$ can be also inconsistent in memory updates: memory injection and symmetry breaking are then executed analogously as in the case of action inconsistencies. 
% \sj{I think this sentence can better be moved upwards}
% Reducing the symmetries leads to smaller families of controllers that need to be considered within the inner loop, although the primary goal of modifying parameters $(\cdot,z)$ is to eliminate existing inconsistencies in observation $z$ in the quotient $\abst{\ffsc'}$.  \sj{repetitive from before example;}
% As a result, the memory injection strategy is capable of better recognising where \emph{else} in the POMDP a memory is needed.
% \sj{The analysis in section 4.1? lets be specific?}
% Finally, in the spirit of the incomplete refinement strategy that trades completeness for speed, symmetry breaking may also remove optimal solutions from $\ffsc$, as discussed in the example below.

\begin{example}
Assume the following modification of Example~\ref{ex:maze}, where an agent now experiences a slight drift to the west: upon choosing the direction, the agent will move to the selected direction with probability 0.9 and will otherwise move one cell to the left (if available). For instance, when moving down from the state~2, the agent might instead end up in the state~1. FSC synthesis proceeds similarly as in Example~5, so assume that after two memory injections and symmetry reductions we end up with the same family $\ffsc_2'$ having $V_{(n_0,z_1)} = \{u,d,r\} \times \{n_0,n_1\}, \ V_{(n_1,z_1)} = \{u,d,l\} \times \{n_0,n_1\}$ and $V_{(n_0,z_4)} = \{u,r,l\} \times \{n_0,n_1\}, \ V_{(n_1,z_4)} = \{d,r,l\} \times \{n_0,n_1\}$. The optimal assignment for the state~2 is $\gamma(n_0,z_2) = d$ (moving down) and $\delta(n_0,z_2) = n_1$, since $(n_1,z_4)$ is the only parameter that enables movement down from the state~6. However, if due to drift the agent ends up in state $(s_1,n_1)$, it cannot move right ($r \not \in V_{(n_1,z_1)}$) and is forced to move sub-optimally.
% We could of course aid this by switching the logic of either of symmetry reductions, but, in general, this might lead to sub-optimal choices in other parts of the FSC.
In our case, we could have preserved the optimal solution if we had switched the order of, e.g., the second symmetry reduction, although predicting a non-conflicting reduction order would require additional analysis.

\end{example}







\vspace{-1em}
\section{Experimental evaluation}
\label{sec:experiments}
\vspace{-0.5em}
Our evaluation focuses on the following questions:


\noindent
\emph{Q1: How does our approach compare with state-of-the-art belief-based approaches?} 
Belief-based approaches are the widespread approach to solving POMDPs. They implicitly or explicitly approximate the large or infinite belief-MDP instead of searching for policies with a particular structure.
%differ in some key aspects affecting the performance such as the type of the specification (finite vs.~infinite-horizon) and the type of observations (deterministic vs.~stochastic). 
We compare with the approaches by~\cite{norman2017verification} (implemented in PRISM~\citep{DBLP:conf/cav/KwiatkowskaNP11}) and by~\cite{bork2022under} (implemented in Storm~\citep{STORM}).
These methods provide state-of-the-art techniques for finding policies in belief MDPs for indefinite-horizon specifications, i.e., without discounting.  %These algorithms also assume deterministic observations.
%and thus the lower bounds on the policy value they provide are directly comparable with the value of our FSCs.  

\noindent
\emph{Q2: How does our approach compare to state-of-the-art approaches to synthesise deterministic FSCs?} 
To this end, we qualitatively compare with the state-of-the-art dual MILP formulation from~\citep{kumar2015history} which uses a max-entropy strategy for adding memory nodes. 
%To the best of our knowledge, this method provides the best performance on a large set of benchmarks. 
We also consider a recent alternative formulation of a primal MILP in~\citep{winterer2020strengthening} for multi-objective specifications.


\noindent
\emph{Q3: What is the effect of our heuristics on the run-time and the value of the resulting FSCs?}  
We report additional insights from an ablation study that examines which settings yield the best performance.

\paragraph{Selected benchmarks and setup}
The framework outlined above has been implemented in PAYNT~\cite{DBLP:conf/cav/AndriushchenkoC21}, a tool for inductive synthesis of probabilistic programs\footnote{See  \url{https://github.com/randriu/synthesis}}. Unless mentioned otherwise, we used benchmarks from \citep{bork2020verification,bork2022under} extended by a few more involved variants. Table~\ref{tab:main} lists the statistics of the models including the number of states, the overall number of actions, and the number of observations.
 Our experiments  run on a single core of a machine equipped with an Intel i5-12600KF @4.9GHz CPU and 32~GB of RAM. An artefact allowing one to reproduce our experimental evaluation is available at \url{https://doi.org/10.5281/zenodo.6637489}.

\paragraph{Threats to validity}
This  evaluation focuses on showing the \emph{potential} of our approach using benchmarks from the verification literature. While the comparison with implementations in Storm and PRISM is thorough, other algorithms were not available and thus we resort to comparing with the performance reported in those papers. There is a dire need for a more structural comparison over different algorithms.   
Furthermore, the experiments suffice to provide insights into the performance of the algorithm, but not to draw conclusions about the relative relevance of individual heuristics. 
Finally, while our approach is general and could be applied to queries such as expected discounted rewards, it can only be competitive if it is  tailored to~that~setting. 

%  \begin{table*}[t]
%   \centering
%  \renewcommand{\arraystretch}{0.8}
%  \setlength{\tabcolsep}{3pt}
%  \scalebox{0.95}{
% \begin{tabular}{|c|rrr|r||c|rrr|r||c|rrr|r|}
% \hline
% Model & $S$ & $\Act$ & $\Obs$ & Spec. & Model & $S$ & $\Act$ & $\Obs$ & Spec. & Model & $S$ & $\Act$ & $\Obs$ & Spec. \\ \hline %\hline
 
%  Grid-av 4-0 & 17 & 59 & 4 & $P_{\max}$ &  Grid-av 4-0.1 & 17 & 59 & 4 & $P_{\max}$ &  Grid 30-sl & 900 & 3587 & 37 & $P_{\max}$\\ %\hline
 
 
%  Maze sl & 15 & 54 & 3 & $R_{\min}$ & Crypt 4 & 1972 & 4612 & 510 & $P_{\max}$ &   Nrp 8 & 125 & 161 & 41 & $P_{\max}$
%  \\ %\hline
  
%     Hallway & 61 & 301 & 23 & $P_{\max}$ & Drone 4-1 & 1226 & 3026 & 384 & $P_{\max}$ & Drone 4-2 & 1226 & 3026 & 761 & $P_{\max}$ \\% \hline
   
   
%     Refuel 6 & 208 & 565 & 50 & $P_{\max}$ & Netw-p  2-8-20 & $3\!\cdot\!10^4$ & $3\!\cdot\!10^4$ & 4909 &  $R_{\max}$ &    Rocks 12 & 6553 & $3\!\cdot\!10^4$ & 1645 & $R_\mathrm{min}$ \\ \hline
 
 

% \end{tabular}
% }
%  \caption{The  statistics of the twelve POMDP models and the optimisation objective.}
%  \vspace{-0.5em}
%  \label{tab:stats}
%   \end{table*}


\begin{table*}[!h]
 \centering
 \renewcommand{\arraystretch}{0.95}
 \scalebox{0.9}{
 \begin{tabular}{|cc||rr||r||r|r||r|r||r|}
\hline
\multicolumn{2}{|c||}{Benchmark} & 
\multicolumn{2}{c||}{Size} & \multicolumn{1}{c||}{PRISM} &  \multicolumn{2}{c||}{Storm}&   \multicolumn{2}{c||}{Inductive synthesis} &
\multicolumn{1}{c|}{Upper-}\\
\multicolumn{1}{|c}{Model} & \multicolumn{1}{c||}{Spec.} & \multicolumn{1}{c}{$S$/$\Act$} & \multicolumn{1}{c||}{$\Obs$} & 
\multicolumn{1}{c||}{}& \multicolumn{1}{c|}{first} & \multicolumn{1}{c||}{best} & \multicolumn{1}{c|}{fastest} & \multicolumn{1}{c||}{best} &
\multicolumn{1}{c|}{bounds}\\ \hline \hline

Grid-av & \multirow{2}{*}{$P_\mathrm{max}$} & $17$ & \multirow{2}{*}{$4$} & [$0.21$,1] & $0.86$ & $\mathbf{0.93}$ & \textbf{0.93 (3)} & 0.93 (4)$^\dagger$ & \multirow{2}{*}{$\leq 0.98$} \\ 
4-0 &  & $59$ &  &  <1s & <1s & \textbf{<1s}   & \textbf{<1s} & <1s & \\ \hline



Grid-av & \multirow{2}{*}{$P_\mathrm{max}$} & $17$ & \multirow{2}{*}{$4$} & [$0.21$,1] & $0.82$ & $0.85$ & 0.92 (4) & \textbf{0.93 (5f)} & \multirow{2}{*}{$\leq 0.99$} \\ 
4-0.1 &  & $59$ &  &  <1s & <1s & 124s   & <1s  & \textbf{53s*} & \\ \hline



Grid & \multirow{2}{*}{$R_\mathrm{min}$} & $900$ & \multirow{2}{*}{$37$} &  \multirow{2}{*}{TO/MO} & $ 121$ & - &  \textbf{119 (6)} &  - & \multirow{2}{*}{$\geq 116.1$} \\ 
30-sl &  & $3587$ &  &   & 1s & -   &   \textbf{150s$^*$} & - & \\ \hline


Maze& \multirow{2}{*}{$R_\mathrm{min}$} & $15$ & \multirow{2}{*}{$8$} & [$\mathbf{7.09}$,$7.09$] & $7.67$ & - & \textbf{7.14 (3)} &\textbf{7.09 (3f)} & \multirow{2}{*}{$\geq 7.08$} \\ 
sl &  & $54$ &  & \textbf{2s} & <1s & -   & \textbf{<1s}  & \textbf{1s$^*$} & \\ \hline

Crypt & \multirow{2}{*}{$P_\mathrm{max}$} & $1972$ & \multirow{2}{*}{$510$} & [$0.33$,$0.79$] & \textbf{0.33} & - & \textbf{0.33 (0)} & - & \multirow{2}{*}{$\leq 0.33$} \\ 
4 &  & 4612 &  & 6s & \textbf{<1s} & -  & \textbf{<1s} & - & \\ \hline 

Nrp & \multirow{2}{*}{$P_\mathrm{max}$} & $125$ & \multirow{2}{*}{$41$} & [$0.13$,0.24] & \textbf{0.13} & - & \textbf{0.13 (0)}& - & \multirow{2}{*}{$\leq 0.13$} \\ 
8 &  & $161$ &  &  3s &  \textbf{<1s} & - & \textbf{<1s} & -  & \\ \hline 

Hallway & \multirow{2}{*}{$R_\mathrm{min}$} & $61$ & \multirow{2}{*}{$23$} & \multirow{2}{*}{TO/MO} & $19.3$ & $ 19.2$ & \textbf{16.3 (1)} & \textbf{14.9 (4f)} & \multirow{2}{*}{$\geq 12.4$} \\ 
 &  & $301$ &  &  & <1s & <1s & \textbf{<1s}   & \textbf{218s$^*$}  & \\ \hline

Drone & \multirow{2}{*}{$P_\mathrm{max}$} & $1226$ & \multirow{2}{*}{$384$} & \multirow{2}{*}{TO/MO} & $ 0.79$ & - & 0.71 (0) & \textbf{0.87 (2)} & \multirow{2}{*}{$\leq 0.94$} \\ 
4-1 &  & $3026$ &  &  & <1s & -  &  1s & \textbf{915s}  & \\ \hline

Drone & \multirow{2}{*}{$P_\mathrm{max}$} & $1226$ & \multirow{2}{*}{$761$} & \multirow{2}{*}{TO/MO} & $ 0.86$ & $ 0.91$ & 0.94 (0) & \textbf{0.97 (2)} & \multirow{2}{*}{$\leq 0.97$} \\ 
4-2 &  & $3026$ &  &  & <1s & 138s  & <1s & \textbf{326s}  & \\ \hline

Refuel& \multirow{2}{*}{$P_\mathrm{max}$} & $208$ & \multirow{2}{*}{$50$} & [$0.67$,$0.72$] & $\mathbf{0.67}$ & - & 0.44 (2) & 0.67 (2f)
& \multirow{2}{*}{$\leq 0.69$} \\ 
6 &  & $565$ &  &  136s & \textbf{<1s} & - & <1s & 45s$^*$ & \\ \hline

Netw-p & \multirow{2}{*}{$R_\mathrm{max}$} & $2\!\cdot\!10^4$ & \multirow{2}{*}{$4909$} & [$\mathbf{557}$,557] & $ 537$ & - & 540 (0) & - & \multirow{2}{*}{$\leq 558$} \\ 
2-8-20 &  & $3\!\cdot\!10^4$ &  &  \textbf{1099s} & <1s & -   & 105s  & - & \\ \hline

Rocks & \multirow{2}{*}{$R_\mathrm{min}$} & $6553$ & \multirow{2}{*}{$1645$} & \multirow{2}{*}{TO/MO} & $ 38$ & $\mathbf{ 20}$ & 42 (0) & - & \multirow{2}{*}{$\geq 20$} \\ 
12 &  & $3\!\cdot\!10^4$ &  &  & <1s & \textbf{47s}   & 1s  & - & \\ \hline  












\end{tabular}
}
\caption{Results for \textbf{Q1}. Bold entries denote the best solutions, -- indicates that no better solution was found within 30 minutes, * indicates that non-default settings were used, TO/MO  denotes timeout (30 minutes) or out of memory.}
\label{tab:main}
  \end{table*}



\subsubsection*{Q1: Comparison to belief-based methods}


Table~\ref{tab:main} summarises key experimental results related to \textbf{Q1}. 
The columns list the following information (from left to right): the model and its variant, the model statistics,
the bounds provided by~\citep{norman2017verification}  and its run-time, the lower bounds provided by~\citep{bork2022under} and its run-time (for two settings: the fastest synthesis and the best bound), the results provided by our approach (including the number of added memory nodes) and its run-time (the first interesting solution and the best solution found), and the upper bounds provided by~\cite{bork2020verification} allowing us to judge the quality of the lower bounds.

To simplify the presentation, this table shows results achieved by our approach using the default setting (different settings are used for the entries denoted by $^*$): the inner loop is instantiated by the pure MDP abstraction oracle with the incomplete refinement strategy, and the outer loop uses the memory injection strategy with symmetry reduction. 
The impact of our optimisation heuristics as well as the results for multi-objective specifications are discussed under \textbf{Q3}.

The results demonstrate that \emph{our inductive approach is  competitive with the belief-state space approximation for indefinite-horizon specifications}. 
For models with a moderate number of observations/actions, we provide better trade-offs between the run-time and the values of the found policies.
Moreover, we found small FSCs that improve the lower bounds in~\citep{bork2022under}.  
For models with a large number of observations/actions, we found small high-value FSCs in comparable run-time. 
For the \emph{Rocks} model and a larger \emph{Netw} model, we failed to find a good solution.
We highlight two interesting results: 
For \emph{Grid-av 4-0}, our strategy injected four memory nodes (see $\dagger$) and achieved the bound provided by the observation-free MDP abstraction which guarantees the global optimum.
For \emph{Drone 4-2}, we found a very small FSC that achieves the known upper bound on the solution value~\citep{bork2020verification}. 
  

\subsubsection*{Q2: Comparison to MILP-based FSC synthesis} 
A direct comparison with MILP-based approaches is complicated due to limited availability of standardised implementations. Qualitative comparisons are furthermore complicated by differences in benchmarks. 

However, based on the \emph{Hallway} model from~\citep{kumar2015history}, which we manually translated to an almost equivalent model that mimics the effect of discounting and stochastic rewards\footnote{The values of the resulting FSCs are comparable.}, we make the following preliminary observation. 
The dual MILP optimisation for the fixed-size reactive FSC (equivalent to our 1-FSC) achieved the value 0.32 in 15 minutes.
Given the discount factor 0.95, this corresponds to an FSC where the expected number of steps to reach the target is equal to $\log_{0.95}(0.32) = 22.2$.
% \sj{I dont understand this correspondence?}. 
Using the memory injection strategy, they found an FSC with 14 additional memory nodes in $\sim$1 hour achieving value 0.46, i.e., 15.1 expected steps. 
Our complete strategy explored all 1-FSCs in less than 1s and found a solution achieving 18.5 expected steps. 
The restricted exploration of full 4-FSCs found a solution achieving 14.9 expected steps in 218s. 
The default strategy used in Table~\ref{tab:main} (see above) added one memory node and found a solution achieving 16.1 expected steps in less than 1s.
%\footnote{Within 30 min. timeout, our approach did not find a better solution.}. 
Despite all the limitations of this experimental setup, \emph{these results indicate that our approach can be at least as successful as MILP-based synthesis methods.} 

Similarly, we compare with~\citep{winterer2020strengthening} on the \emph{Grid-av 4-0} model with a constraint on the reachability probability and the minimisation of a reward. The best solution of the MILP optimisation with a restricted randomisation and memory injection has value 3.43 (found within seconds). 
This solution is obtained by our default strategy within 1s by adding one memory node. 
In 21s, it added seven memory nodes and found a better solution having value 3.29.
This shows that \emph{our inductive approach outperforms the MILP optimisation also on multi-objective specifications}.
  
\vspace{-0.3em}
\subsubsection*{Q3: The effect of optimisation heuristics}

\emph{Efficacious heuristics:}
We generally remark that the design spaces in this paper are several orders of magnitude bigger than the design spaces supported by the more general-purpose inductive synthesis framework in~\citep{DBLP:conf/cav/AndriushchenkoC21}. The superior performance can mostly be explained by the tailored representation of the design space and novel search heuristics. 

\emph{Incomplete search:} To find a good policy, it is not necessary to be complete. The (default) incomplete refinement strategy and CE generalisation are beneficial for handling a large number of observations/actions. The complete exploration fails to find a good solution for, e.g., the \emph{Drone} models. In our experiments, we did not observe that the incomplete exploration discards important solutions except the \emph{Grid-av \mbox{4-0.1}} model. For that model, the incomplete strategy performs 10 memory injections and finds in 840s a solution with value 0.92. The complete strategy performs 8 memory injections and finds in 1189s a solution with value 0.93. For this benchmark, the best synthesis result reported in Table~\ref{tab:main} (column \emph{best}) relied on complete exploration of full 5-FSCs.

\emph{Memory injection:} Memory injection prevents the blow-up caused by simply increasing the number of memory nodes. Without symmetry reduction, the abstraction-based framework has trouble guiding the search. Symmetry reduction thus not only reduces the design space, but it also guides the memory injection strategy to correctly select the most promising observation. 
For example, in the \emph{Maze sl} model, the memory injection without symmetry reduction repeatedly adds memory to a single observation and the optimal solution is not found. 
On the other hand, symmetry reduction can discard an optimal solution as demonstrated on the  \emph{Grid-av {4-0.1}, Maze sl, Hallway} and \emph{Refuel 6} models. 
For these models, the column \emph{best} of Table~\ref{tab:main} lists the results of the exploration of full $k$-FSCs denoted as $k$f for $k\in\{2,3,4,5\}$.


\emph{Hybrid teacher:} For the models in Table~\ref{tab:main}, the use of CEs or a hybrid teacher (combining MDP abstraction and CE pruning) does not improve the synthesis process. We believe that future work towards CE-guided inductive synthesis may change this balance. 
As of now, only for models where the MDP abstraction is significantly larger than the induced MCs corresponding to the candidate FSCs, the hybrid approach is superior: e.g., for the \emph{Grid 30-sl} model, a larger variant of the grid-like model, the MDP abstraction is 15x larger than the individual induced MCs. With default settings no admissible FSC is found within 30 minutes. A hybrid teacher helps to find an FSC within 150s that improves the solution found by the belief-based method. 

\emph{Multi-objective (MO) specifications:} Apart from the MO variant of the \emph{Grid-av 4-0} model discussed in \textbf{Q2}, we also considered an MO variant of the \emph{Maze sl} model including a more complicated specification with an additional reach-avoid constraint. The constraint restricts the optimal FSC, but the run-time of the synthesis remains < 1s.  

%\mc{if there is time/space, demonstrate the impact of adding external bounds on the quality of the solution -- probably better to leave this for a journal version.} 

\vspace{-0.5em}
\section{Conclusion}
\vspace{-0.5em}
This paper presents a first inductive-synthesis based framework for finding finite-state controllers (FSCs) in POMDPs. Key ingredients are the novel heuristics to incrementally construct the memory structure of the FSC as well as two oracles for searching and evaluating families of FSCs. The experimental results are promising and indicate that this framework is competitive with the state-of-the-art alternatives. Future work includes the integration of belief-based approaches as an additional oracle.

%% we should ack Alexander Bork for his assistance in running the experiments with Storm.

% \mc{Two ideas that can be mentioned in future work (if there is a space): 1) using CE for MDPs and 2) using belief-based techniques as another oracle for driving the search.}


%\section{Related Work} 
%\sj{Just parked some text from intro here} 
 
% In contrast to MPDs, optimal controllers for POMDPs requires memory and/or randomisation even for simple objectives~\citep{} and the optimal control synthesis for POMDPs is generally undecidable as an infinite memory can be required~\cite{}. The synthesis of more complicated controllers is indeed computationally more demanding.
%In the past decade, we have seen significant improvement in the scalability of the POMDP solvers. The underlying techniques can be divided into three categories:

%\emph{Approximation of the belief-state space} that represents the %likelihood of the system being in a particular state. Point-base approximation~\citep{pineau2003point} implemented in the state-of-the-art solvers HSVI~\citep{smith2005point} and SARSOP~\citep{kurniawati2008sarsop} provides the best trade-offs between the quality of the controller and run-time. Iterative refinement of the the belief-state abstraction can be used to obtain safe bounds on the quality of the controllers including  infinite-horizon properties~\citep{bork2020verification}.

% \emph{Synthesis of finite-state controllers (FCSs)}~\citep{hansen1998solving} that represents compact and easy-to-execute representation of the control strategies that does not required belief updates. Additionally, FCS provide significantly more semantic information about the strategy than alpha vectors used by belief-state space approaches -- this is namely crucial in safe-critical applications. Efficient synthesis of FCSs typically leverages a mixed-integer linear program (MILP) formulation of the problem~\citep{amato2010optimizing,kumar2015history,winterer2020strengthening}. A notable exception is synthesis of stochastic FCSs via parameter synthesis in parametric Markov chains~\citep{junges2018finite}. 

% \emph{Monte-Carlo planning}~\citep{silver2010monte} that sacrifices formal guarantees but can handle very large POMDPs and scenarios where explicit probability distributions of the POMDP is not known. Alternatively, evolutionary driven strategies can be used to search for optimal controllers which turned to be beneficial namely for multi-objective specifications~\citep{soh2011evolving}.



%\begin{contributions} % will be removed in pdf for initial submission,
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
%    Briefly list author contributions.
%    This is a nice way of making clear who did what and to give proper credit.
%\end{contributions}

\vspace*{-0.3cm}
\begin{acknowledgements} 
This work has been supported by the Czech Science Foundation grant \mbox{GJ20-02328Y} and the ERC AdG Grant \mbox{787914} (FRAPPANT).
The authors thank Alexander Bork and Filip Mac\'{a}k for their support in running the experiments.
% will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
%    Briefly acknowledge people and organizations here.
%   \emph{All} acknowledgements go in this section.
\end{acknowledgements}

\clearpage
\pagebreak
\bibliography{andriushchenko_479}


% \appendix

% \section{Symmetry reduction}\label{app:symmetry}

% Consider state $s$ (having observation $z$) with two successors: $s'$ and $s''$ (having observations $z'$ and $z''$, respectively). Let $\pi^*_1$ be the scheduler associated with the family $\mathcal{F}_1$ of 1-FSCs. Assume that $\pi^*_1$ is inconsistent in the following observations: for observation $z'$, $\pi^*_1$ selects actions $a_i$ and $a_j$; for observation $z''$, $\pi^*_1$ selects actions $a_k$ and $a_l$.

% Analysis of $\pi^*_1$ indicates that inconsistency in $z'$ is more significant, suggesting adding memory to this observation. This decision induces two changes in the design space. First, a new parameter $(z',1)$ is introduced that encodes action selection in newly created copies of states having observation~$z'$. Second, each state having successor $s^*$ with observation~$z'$ must be able to choose whether to go to $(s^*,0)$ or its copy~$(s^*,1)$. In particular, domain of parameter $(z,0)$ will now be $\Act \times \{0,1\}$.

% To break newly created symmetry in parameters $(z',0)$ and $(z',1)$, we remove action $a_i$ from $V(z',0)$ and action $a_j$ from $V(z',1)$. This way, if in state $s'$ the optimal action is~$a_i$, where it is available only in $(s,1)$, then predecessors of~$s'$ must pick the corresponding memory update: $\delta(z,0) = 1$.

% Assume that $\pi^*_2$ is the scheduler associated with the resulting design space $\mathcal{F}_2$. $\pi^*_2$ can no longer be inconsistent in actions~$a_i$ and $a_j$ for parameters $(z',0)$ or $(z',1)$, although it can be inconsistent wrt.~other combination of actions. However, similarly to $\pi^*_1$, scheduler $\pi^*_2$ is still inconsistent in observation $z''$, so let us add memory to this observation as well. This creates copies $(s^*,1)$ of states $s^*$ with $O(s^*)=z''$, introduces new parameter $(z'',1)$ for such states, and modifies parameters for all predecessors of $z''$ to account for new copies. In our case, parameter $(z,0)$ already considers that successors might have two copies, so $V(z,0)$ remains unchanged.

% Similar to the previous iteration, we break symmetry between new parameters by removing $a_k$ from $V(z'',0)$ and removing $a_l$ from $V(z'',1)$. Let $\mathcal{F}_3$ denote the resulting design space and let $F_3 \in \mathcal{F}_3$ denote the corresponding optimal FSC.

% To see why $F_3$ might be worse than the best 2-FSC, let us assume that in state $s'$ the optimal action is $a_i$ and in state~$s''$ the optimal action is $a_l$. Thus, to be able to pick the best action in $s'$, from state $s$ we must go to $(s',1)$, where this action is available, so $\delta(z,0)$ must be 1. However, once we perform action in state $s$, setting memory to 1, we might end up in state $(s'',1)$. Here, the optimal action is $a_l$ and, unfortunately, this action is no longer available in~$V(z'',1)$, so the sub-optimal action must be chosen. This example illustrates a dependency between subsequent symmetry reductions. We could of course aid this by modifying the second symmetry breaking sequence: remove $a_k$ from~$V(z'',1)$ and $a_l$ from $V(z'',0)$. This would restore optimal actions taken in states~$s'$ and $s''$, but might lead to sub-optimal choices in another parts of the chain.



\end{document}
