% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

\usepackage{enumitem}
\usepackage{subcaption}
\usepackage{float}
\usepackage{amsthm}
\usepackage{amsfonts}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{example}{Example}

\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{arrows}

\DeclareMathOperator*{\E}{\mathbb{E}}

\newcommand{\be}{\begin{eqnarray}}
\newcommand{\ee}{\end{eqnarray}}
\newcommand{\bes}{\begin{eqnarray*}}
\newcommand{\ees}{\end{eqnarray*}}
\newcommand{\complexset}{\mathbb{C}}
\newcommand{\natset}{\mathbb{N}}
\newcommand{\realset}{\mathbb{R}}
\newcommand{\A}{\mathbb{A}}
\newcommand{\B}{\mathbb{B}}
\newcommand{\F}{\mathbb{F}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\setminusx}{\!\setminus\!}
\newcommand{\tO}{\tilde{O}}

\DeclareMathOperator{\coef}{coef}

\usepackage{xspace}
\newcommand{\BNSL}{\textsc{BNSL}\xspace}
\newcommand{\BNSS}{\textsc{BNSS}\xspace}
\newcommand{\BNSC}{\textsc{BNSC}\xspace}
\newcommand{\VC}{$\mathcal{V}_k$-\textsc{BNSL}\xspace}
\newcommand{\VCS}{$\mathcal{V}_k$-\textsc{BNSS}\xspace}
\newcommand{\VCC}{$\mathcal{V}_k$-\textsc{BNSC}\xspace}
\newcommand{\REF}{{\textbf{\color{teal} [REF]}}\xspace}

\renewcommand{\sharp}{\#}
%\newcommand{\dpmax}{d^{\mathrm{opt}}} % BNSL
%\newcommand{\dpper}{d^{\mathrm{per}}} % periphery
%\newcommand{\dpcore}{d^{\mathrm{core}}} % periphery

\DeclareMathOperator{\dpmax}{opt}
\DeclareMathOperator{\dpper}{peri}
\DeclareMathOperator{\dpcore}{core}

\DeclareMathOperator*{\dec}{\mathrm{par}}

\newcommand{\numberP}{\mathrm{\#P}}



%\title{Learning and Sampling Bayesian Networks under Vertex Cover Constraints}
%\title{Learning, Sampling, and Counting Bayesian Networks under Constraints}
%\title{Learning Directed Acyclic Graphs with Small Vertex Cover Revisited}
%\title{Learning Bayesian Networks with Small Vertex Cover Revisited}
%\title{Revisiting Bayesian Network Learning with Vertex Cover Constraints}
\title{Revisiting Bayesian Network Learning with Small Vertex Cover}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<juha.harviainen@helsinki.fi>?Subject=Your UAI 2023 paper}{Juha Harviainen}{}}
\author[1]{Mikko Koivisto}
%\author[1,2]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    University of Helsinki\\
    Helsinki, Finland
}
%\affil[2]{%
%    Second Affiliation\\
%    Address\\
%    …
%}
%\affil[3]{%
%    Another Affiliation\\
%    Address\\
%    …
%  }
  
  \begin{document}
\maketitle

\begin{abstract}
The problem of structure learning in Bayesian networks asks for a directed acyclic graph (DAG) that maximizes a given scoring function. 
Since the problem is NP-hard, research effort has been put into discovering restricted classes of DAGs for which the search problem can be solved in polynomial time. Here, we initiate the investigation of questions that have received less attention thus far: Are the known polynomial algorithms close to the best possible, or is there room for significant improvements? If the interest is in Bayesian learning, that is, in sampling or weighted counting of DAGs, can we obtain similar complexity results? Focusing on DAGs with bounded vertex cover number---a class studied in Korhonen and Parviainen's seminal work (NIPS 2015)---we answer the questions in the affirmative. We also give what is apparently the first proof that the counting problem is $\sharp$P-hard in general. In addition, we show that under the vertex-cover constraint, counting is $\sharp$W[1]-hard.
\end{abstract}

\section{Introduction}

The structure of a Bayesian network is a directed acyclic graph (DAG). The task of learning the DAG from given data is often formulated as maximization of some appropriate scoring function. Common scoring functions are decomposable, meaning that the score of a DAG is obtained as a sum (or product) of vertex-wise local scores, each of which only depends on the vertex and its parents in the graph. 
%
This structural property---which isolates the combinatorial problem from the specifics of the scoring function and the data---has motivated fruitful algorithmic research. The problem is known to be NP-hard \citep{Chickering95NP} even if every vertex is allowed to have at most two parents. On the other hand, an optimal $n$-vertex DAG can be found by dynamic programming over vertex subsets in time $O(2^n n^2)$ under any indegree or other local constraints \citep{Ott04,Singh05,Silander06}. For almost two decades, we have seen essentially no progress in the worst-case time bound, albeit numerous algorithms have been developed for heuristic search \citep{Scanagatta15,Yuan11}.

Given that the problem is hard in general, we may ask whether it becomes tractable if we restrict the search space by some constraints (other than the plain maximum indegree). Taking the viewpoint of \emph{parameterized complexity}, the question is whether the complexity of the problem can be controlled by some parameter of the input or output. While restricting the input never increases the complexity, restricting the output (i.e., the search space) may increase or decrease the complexity. For example, parameterizing by the \emph{treewidth} of the DAG appears to only make the problem harder \citep{Korhonen13}. On the other hand, parameterizing by the \emph{vertex cover number} renders the problem easier \citep{Korhonen15}. Other positive results are known for polytrees that are close to branchings \citep{Gaspers15} and for DAGs with bounded \emph{feedback edge number} \citep{Ganian21}. 
%
Upper bounds can often be complemented with lower bounds, that is, parameterized hardness results within the \emph{W-hierarchy} \citep{Downey95}.
%
Typically, the time complexities are polynomial in the number of vertices for any constant value of the parameter:
\cite{Gruttemeier22} provide a summary of many complexity results for different parameters.

In this paper, we put forward two questions that have received little attention in previous works on structure learning in Bayesian networks. 
%
First, we ask whether the known parameterized algorithms are close to the best possible. For example, if an algorithm with running time $O(T)$ is known, can we find an algorithm that runs in time $O(\sqrt{T})$? Such a speedup would be a significant \emph{quantitative} improvement, even if it did not affect the \emph{qualitative} complexity classification, which has been the primary interest of previous works.   
%
Second, we ask to what extent the parameterized complexity results for the optimization problem in question can be transferred to related problems of sampling and weighted counting of DAGs. 
%
These variants are motivated chiefly by the Bayesian approach to learning Bayesian networks \citep{Heckerman95, Madigan95}. Both exact algorithms \citep{Koivisto04,Tian09,Talvitie19,Koivisto20} and numerous sampling-based approximate methods (see,  e.g., \cite{Friedman03,Niinimaki16,Kuipers17} and references therein) have been developed. 
%
However, these works have not exercised the parameterized complexity viewpoint.

To initiate the investigation of these questions, we focus on parameterization by the vertex cover number, studied in the seminal work by \cite{Korhonen15}. They gave an algorithm running in time $n^{2k+O(1)}$ for any fixed vertex cover number $k$. Furthermore, they showed that the parameterized problem is W[1]-hard, suggesting that the obtained time bound is qualitatively perhaps the best possible: the exponent of $n$ must depend on $k$.  
%

After formalizing the setup in Section~2, 
we begin in Section~3 by giving an algorithm that finds an optimal DAG in time $n^{k+O(1)}$, thus achieving a nearly quadratic improvement. 
%
In Section~4, we turn to the counting variant, with several results: 
We start by showing that the basic, unconstrained problem is $\sharp$P-hard; while this result may be unsurprising, to our knowledge, it has not been proven before.
%
We then give an analogous $\sharp$W[1]-hardness result for the parameterized variant, but also an algorithm running in time $n^{2k + O(1)}$. 
%
In Section~5, we consider the sampling variant. While the handling of duplicates appears to make counting more difficult than maximization, we observe that having control over the number of duplicates enables relatively efficient rejection sampling. 
%
Finally, we end the paper by pointing out directions for future research in Section~6.

\section{Problem Formulations}

A Bayesian network is a graphical representation of a multivariate probability distribution. Its structure is given as a DAG $G = (V, E)$, where each vertex in $V$ corresponds to one random variable and the edge set $E$ encodes conditional independence relations between the variables. If $(u, v)$ is an edge in $G$, we call $u$ a \emph{parent} of $v$ and $v$ a \emph{child} of $u$. We write $G_v$ for the \emph{parent set} of $v$ in $G$, i.e., $G_v = \{ u \in V \mid (u, v) \in E\}$. The joint distribution of the variables is obtained by specifying a univariate conditional probability distribution for each vertex $v$ given its parent set $G_v$, and then taking the product of these distributions.

In score-based structure learning of Bayesian networks, every possible DAG $G$ is associated with a nonnegative \emph{weight} $f(G)$ that, roughly speaking, measures how well $G$ fits the given data. How the weights are obtained from the data and background knowledge varies depending on the adopted learning paradigm and performance measures \citep[Ch.~18]{Koller09}. Important for our purposes is that the commonly used weight functions are \emph{modular}, i.e., they decompose into a product of vertex-wise weights:  
\[f(G) = \prod_{v \in V} f_v(G_v).\] 

We consider three different ways to formulate structure learning as an algorithmic problem. 
To this end, let $\mathcal{D}$ denote the set of DAGs on the vertex set $V$. 
%
We assume that the values $f_v(G_v)$, for all vertices $v$ and their possible parent sets $G_v$, have been precomputed and given as input. We assume the \emph{non-zero representation}, in which a weight for a parent set is (explicitly) given only when it is non-zero. Furthermore, we assume that the size of this representation is polynomial in the number of vertices.


First, consider maximizing the weight function:
\begin{description}
     \item \textsc{Bayesian Network Structure Learning} (\BNSL)\\
     \emph{Objective:} Compute $\max_{G \in \mathcal{D}} f(G)$.
\end{description}
While our formulation only asks for the maximum weight, the algorithms typically also yield a maximizing DAG. 
To conform to the common nomenclature used in the literature, we retain the term ``learning'' in the problem name.
Sometimes \BNSL is stated as maximization of an additively decomposable scoring function; one obtains this equivalent form by considering a logarithm of the weight function. 

Second, consider weighted counting of DAGs: 
\begin{description}
     \item \textsc{Bayesian Network Structure Counting} (\BNSC)\\
     \emph{Objective:} Compute $\sum_{G \in \mathcal{D}} f(G)$.
\end{description}
In applications, $f(G)$ may equal an unnormalized posterior probability of $G$, in which case the sum equals the normalizing constant. More generally, $f(G)$ may equal a product of the posterior probability of $G$ and some other function $h(G)$, e.g., an indicator function telling whether or not some edges of interest are present in $G$ \citep{Friedman03,Tian09}. Then the sum equals the posterior expectation of $h(G)$, or the posterior probability of the event of interest in the case of an indicator function. 

Third, consider sampling DAGs with probabilities proportional to the weights: 
\begin{description}
     \item \textsc{Bayesian Network Structure Sampling} (\BNSS)\\
     \emph{Objective:} Sample $G \in \mathcal{D}$ with $\Pr(G) \propto f(G)$.
\end{description}
Here $f(G)$ typically is an unnormalized posterior probability of $G$, albeit one could also consider sampling from distributions that are biased, e.g., towards some graph features of interest; cf.\ indicators of edge sets discussed above.

\subsection{Structural Constraints}

In this paper, we focus on the constraint studied by \cite{Korhonen15}, the \emph{vertex cover number of the moralized graph}. Let $G_\mathrm{M}$ be the undirected graph obtained by \emph{moralizing} $G = (V, E)$, meaning that we add an edge between $u$ and $v$ if there is a directed edge between them or they form a \emph{v-structure}: both of them are parents of the same vertex without being connected by an edge.

With a slight abuse of terminology, $G$ is said to have a \emph{vertex cover} $S$ of size $k$ if all edges in $G_\mathrm{M}$ have at least one endpoint in $S$. The \emph{vertex cover number} $\tau(G)$ of a graph is the size of the smallest vertex cover of $G_\mathrm{M}$. We denote the set of DAGs $G$ with $\tau(G) \le k$ by $\mathcal{V}_k$ and the constrained variants of the problems restricted to this set by \VC, \VCC, and \VCS.

\begin{figure}[t]
		\small
	    \centering
		\begin{tikzpicture}
	        \node[shape=circle,draw,inner sep=4] (N11) at (0,0) {};
	        \node[shape=circle,draw,inner sep=4] (N12) at (0,-1.5) {};
	        \node[shape=circle,draw,inner sep=4] (N13) at (0,-3) {};
	        \node[shape=circle,draw,inner sep=4] (N21) at (-2,-0.75) {};
	        \node[shape=circle,draw,inner sep=4] (N22) at (-2,-2.25) {};
	        \node[shape=circle,draw,inner sep=4] (P1) at (2,0) {};
	        \node[shape=circle,draw,inner sep=4] (P2) at (2,-1) {};
	        \node[shape=circle,draw,inner sep=4] (P3) at (2,-2) {};
	        \node[shape=circle,draw,inner sep=4] (P4) at (2,-3) {};
	        \node[] (T1) at (0,0.8) {$N_1$};
	        \node[] (T2) at (-2,0.8) {$N_2$};
	        \node[] (T3) at (2,0.8) {$P$};
	        \path [-triangle 45] (N11) edge node[] {} (N12);
	        \path [-triangle 45] (N12) edge node[] {} (N13);
	        \path [-triangle 45] (N11) edge[bend left] node[] {} (N13);
	        \path [-triangle 45] (N21) edge node[] {} (N11);
	        \path [-triangle 45] (N22) edge node[] {} (N12);
	        \path [-triangle 45] (N11) edge node[] {} (N22);
	        \path [-triangle 45] (N21) edge node[] {} (N13);
	        \path [-triangle 45] (N11) edge node[] {} (P1);
	        \path [-triangle 45] (N12) edge node[] {} (P1);
	        \path [-triangle 45] (N13) edge node[] {} (P1);
	        \path [-triangle 45] (N11) edge node[] {} (P2);
	        \path [-triangle 45] (N12) edge node[] {} (P3);
	        \path [-triangle 45] (N13) edge node[] {} (P2);
	        \draw[dotted, rounded corners, thick] (-2.4,0.4) rectangle (-1.6,-3.4);
	        \draw[dotted, rounded corners, thick] (-0.4,0.4) rectangle (0.4,-3.4);
	        \draw[dotted, rounded corners, thick] (1.6,0.4) rectangle (2.4,-3.4);
	    \end{tikzpicture}
	\caption{One network structure partitioned into a core and a periphery with a vertex cover $N_1$ of size $k=3$.}
	\label{fig:structure}
\end{figure}

\section{Maximization}

Observe that the vertices in the vertex cover can have at most one parent outside the vertex cover: Otherwise, it would no longer be the vertex cover of the moralized graph as the parents would be connected. Using this idea, \cite{Korhonen15} proposed the following algorithm for solving the problem: Brute force search over all possible vertex covers $N_1$ of the graph and the set of their parents $N_2$. These two sets are the \emph{core} of the structure, and the remaining vertices $P$ are the \emph{periphery} (see Fig.~\ref{fig:structure}). Importantly, the core and the periphery can be optimized independently.

For the core, they use the exponential-time algorithm to find its optimal structure in roughly $2^{|N_1| + |N_2|} \le 2^{2k}$ operations (see, e.g., \cite{Silander06}). The vertices in the periphery are then connected to their best parent sets in $N_1$, which is achieved efficiently with precomputation: For each node and a set $S$ of possible parents of size at most $k$, find the optimal parent set for the node among the subsets of $S$. The total running time of their algorithm is $(2\mathrm{e}n/k)^{2k} n^{O(1)}$ where $n \coloneqq |V|$. The time complexity has an atypical form because the sets $N_1$ and $N_2$ are unordered, meaning that there are roughly $(n^{k}/(k!))^2$ possibilities for the sets. We improve this complexity to $3^k n^{k+O(1)}$, achieving nearly quadratic speedup in $n$:

\begin{theorem}
	\VC can be solved in time $3^k n^{k+O(1)}$.
\end{theorem}
\begin{proof}
	Recall that a DAG defines a \emph{partial order} $\ge$ of the vertices where $v \ge w$ if there is a directed path from $v$ to $w$. This property is reflexive, antisymmetric, and transitive. However, some of the vertices can be incomparable under this partial order if there is no directed path between them. A \emph{linear extension} of a partial order extends it such that any pair of vertices is comparable. For example, any topological ordering of a DAG is its valid linear extension.

	Unlike in the algorithm of \cite{Korhonen15}, we only brute force over the set $N_1$ and test all of its linear orders $v_1 > v_2 > \dots > v_k$ in roughly $n^k$ operations. Every DAG has at least one linear extension, and thus the partial order of $N_1$ in the optimal structure is covered by at least one of these linear orders. We then complete the structure of the core using dynamic programming to decide how the vertices are distributed between $N_2$ and $P$. The dynamic programming computes, for each $S \subseteq N_1$ and each $i$, the optimal structure for the ordered vertices $N_1$ and the first $i$ of the (arbitrarily ordered) remaining vertices $w_1, w_2, \dots, w_{n-k} \in V \setminus N_1$ such that exactly the vertices in $S$ have a parent among $w_1, w_2, \dots, w_i$.
	
	More formally, let $g_{v}(S)$ be the highest weight for $v$ from a parent set that is a subset of $S$ and similarly $g_{v}(S, u)$ the highest weight such that the parent set is a subset of $S \cup \{u\}$ and contains $u$. Then, we initialize
	\[ \dpmax(\emptyset,0) \coloneqq \prod_{j=1}^k g_{v_j}(v_{1:(j-1)}) \]
	and
	\[ \dpmax(S,0) \coloneqq 0 \text{ for all nonempty } S \subseteq v_{1:k} \]
	with $v_{1:{j}}$ denoting the sequence $v_1, v_2, \dots, v_j$. In other words, $\dpmax(\emptyset,0)$ is the weight of the highest-scoring DAG on $v_{1:k}$ that has $v_1 > v_2 > \dots > v_k$ as one of its linear extensions. As no other vertices have been added yet, the vertices in $N_1$ lack parents outside $N_1$.
	
	If a vertex $w$ is assigned to the periphery, it always chooses the best parent set from $N_1$. Otherwise, it is the parent of some subset $T$ of the vertices in $N_1$, in which case each such vertex $v$ picks the best parent set from the union of its predecessors and $w$. To prevent cycles, any parent $v_j$ of $w$ must have $v_j > v_\ell$ for all $v_\ell \in T$. Hence, we define $\dpmax(S,i)$ as
	\begin{equation}\label{eq:optimization}
		\dpmax(S,i) \coloneqq \max_{T \subseteq S}\, \dpmax(S \setminus T,i-1) \cdot r(w_i, T),
	\end{equation}
	where $r(w_i, T)$ is the factor by which the optimal weight changes if $w_i$ is a parent of $T \subseteq N_1$:
	\[ r(w_i, T) \coloneqq g_{w_i}\big(\left\lbrace v_j \mid v_j > T \right\rbrace\big) \cdot \prod_{v_j \in T} \frac{g_{v_j}\big(v_{1:(j-1)}, w_i\big)}{g_{v_j}\big(v_{1:(j-1)}\big)} \]
	with $v_j > T$ if $v_j > v_\ell$ for all $v_\ell \in T$.
	
	Now, the highest weight of a DAG that respects the order of the vertices of $N_1$ can be found as $\max_S \dpmax(S,n-k)$. Iterating over all orders of $N_1$ then covers the set of all DAGs. We needed to consider $3^k$ sets $S$ and $T$ over the execution of the algorithm for each of the $n-k$ dynamic programming layers, and there are $O(n^k)$ options for the ordered elements of $N_1$, giving the total running time $3^k n^{k + O(1)}$.
\end{proof}

Consider instead the more common (equivalent) problem of optimizing the logarithm of the weight. Then, Equation~\ref{eq:optimization} can be interpreted as subset convolution over the max--sum semiring, for which fast algorithms exist assuming that the values are integers with a bounded absolute value $W$ \citep{Bjorklund07}. In such a case, the values $\dpmax(S,i)$ are computable in roughly $2^k W$ operations for each fixed $i$.

\begin{corollary}
	\VC can be solved in time $2^k W n^{k+O(1)}$ if the logarithms of the input weights, in some base, are integers with absolute value at most $W$.
\end{corollary}

This corollary also leads to an approximation algorithm for the optimal network structure. Suppose that we use logarithms to the base $c$ and round the weights down to the greatest value whose logarithm is an integer. Each weight decreases by at most a factor of $c$, and thus the optimal DAG using the new weights is at most $c^{n}$ times worse than with the original weights. Supposing we want to find an $a$-approximation, we may choose $c = a^{1/n}$. Finally, observe that
\[ \log_c f_v(G_v) = \frac{\log_2 f_v(G_v)}{\log_2 c} = \frac{n \cdot \log_2 f_v(G_v)}{\log_2 a}, \] meaning that the logarithms of the weights will not grow unreasonably large. These observations lead to the following corollary:

\begin{corollary}
	The weight of the optimal network structure can be approximated up to a fixed factor $a > 1$ in time $2^k W n^{k+O(1)}$ if the logarithms of the input weights to the base $2$ have an absolute value at most $W$.
\end{corollary}

\section{Counting}


We start by showing that weighted counting of DAGs is hard. We also prove that the parameterized version is hard, but that there exists at least a simple albeit slow algorithm for the problem that outperforms the nonparameterized algorithm.


\subsection{General Case is $\sharp$P-hard}

\begin{figure*}
	\small
	\centering
	\begin{tikzpicture}
	    \node[shape=circle,draw,inner sep=1.6] (D) at (0,0) {$D$};
	    \node[shape=circle,draw,inner sep=0.5] (X1) at (-3,-1.5) {$X_1$};
	    \node[shape=circle,draw,inner sep=0.5] (X2) at (-1,-1.5) {$X_2$};
	    \node[shape=circle,draw,inner sep=0.5] (X3) at (1,-1.5) {$X_3$};
	    \node[shape=circle,draw,inner sep=0.5] (X4) at (3,-1.5) {$X_4$};
	    \node[shape=rectangle,draw,inner sep=2] (C1) at (-3,-3) {$X_1 \lor X_2$};
	    \node[shape=rectangle,draw,inner sep=2] (C2) at (-1,-3) {$X_1 \lor X_3$};
	    \node[shape=rectangle,draw,inner sep=2] (C3) at (1,-3) {$X_2 \lor X_3$};
	    \node[shape=rectangle,draw,inner sep=2] (C4) at (3,-3) {$X_3 \lor X_4$};
	    \node[shape=circle,draw,inner sep=0.5] (B1) at (-3,-4.5) {$B_1$};
	    \node[shape=circle,draw,inner sep=0.5] (B2) at (-1,-4.5) {$B_2$};
	    \node[shape=circle,draw,inner sep=0.5] (B3) at (1,-4.5) {$B_3$};
	    \node[shape=circle,draw,inner sep=0.5] (B4) at (3,-4.5) {$B_4$};
	    \path [triangle 45-] (C1) edge node[] {} (X1);
	    \path [triangle 45-] (C1) edge node[] {} (X2);
	    \path [triangle 45-] (C2) edge node[] {} (X1);
	    \path [triangle 45-] (C2) edge node[] {} (X3);
	    \path [triangle 45-] (C3) edge node[] {} (X2);
	    \path [triangle 45-] (C3) edge node[] {} (X3);
	    \path [triangle 45-] (C4) edge node[] {} (X3);
	    \path [triangle 45-] (C4) edge node[] {} (X4);
	    \path [triangle 45-,dashed] (X1) edge node[above left] {$1$} (D);
	    \path [triangle 45-,dashed] (X2) edge node[left] {$1$} (D);
	    \path [triangle 45-,dashed] (X3) edge node[right] {$1$} (D);
	    \path [triangle 45-,dashed] (X4) edge node[above right] {$1$} (D);
	    \path [triangle 45-,dashed] (B1) edge node[left] {$M$} (C1);
	    \path [triangle 45-,dashed] (B2) edge node[left] {$M$} (C2);
	    \path [triangle 45-,dashed] (B3) edge node[right] {$M$} (C3);
	    \path [triangle 45-,dashed] (B4) edge node[right] {$M$} (C4);
	    \path [triangle 45-,draw] (D) to[out=155,in=90]++ (-5.5,-3) to[out=-90,in=-160] node[] {}  (B4);
	    \path [triangle 45-,draw] (D) to[out=160,in=90]++ (-5.2,-3) to[out=-90,in=-155] node[] {} (B3);
	    \path [triangle 45-,draw] (D) to[out=165,in=90]++ (-4.9,-3) to[out=-90,in=-150] node[] {} (B2);
	    \path [triangle 45-,draw] (D) to[out=170,in=90]++ (-4.6,-3) to[out=-90,in=175] node[] {} (B1);
        \node[shape=circle,draw,inner sep=0.5] (B4C) at (8,-4.5) {$B_4$};
	    \node[shape=circle,draw,inner sep=5.5] (A1) at (5,-2.25) {$ $};
	    \node[shape=circle,draw,inner sep=5.5] (A2) at (6.5,-2.25) {$ $};
	    \node[shape=circle,inner sep=5.5] (A3) at (8,-2.25) {$\cdots$};
	    \node[shape=circle,draw,inner sep=5.5] (A4) at (9.5,-2.25) {$ $};
	    \node[shape=circle,draw,inner sep=5.5] (A5) at (11,-2.25) {$ $};
	    \node[shape=rectangle,draw,inner sep=2] (C4C) at (8,0) {$X_3 \lor X_4$};
	    \path [triangle 45-,dashed] (A1) edge node[above left] {$1$} (C4C);
	    \path [triangle 45-,dashed] (A2) edge node[left] {$1$} (C4C);
	    \path [triangle 45-,dashed] (A3) edge node[below right] {$1$} (C4C);
	    \path [triangle 45-,dashed] (A4) edge node[right] {$1$} (C4C);
	    \path [triangle 45-,dashed] (A5) edge node[above right] {$1$} (C4C);
	    \path [-triangle 45,draw] (A1) edge node[above left] {} (B4C);
	    \path [-triangle 45,draw] (A2) edge node[left] {} (B4C);
	    \path [-triangle 45,draw] (A3) edge node[right] {} (B4C);
	    \path [-triangle 45,draw] (A4) edge node[right] {} (B4C);
	    \path [-triangle 45,draw] (A5) edge node[above right] {} (B4C);
	    \draw [decorate,decoration={brace,amplitude=10pt,aspect=0.20},xshift=-4pt,yshift=0pt]
(4.5,-4.7) -- (4.5,0.2) node [] {};
	\end{tikzpicture}
    \caption{An example reduction from a \textsc{Monotone $2$-SAT} instance to \BNSC. Solid edges are always on (weight $1$, other choices have weight $0$) while dashed edges are either on (weight written next to edge) or off (weight $1$). Each $M$-weighted edge can be replaced with the binary-weighted component on the right.} % and parent sets with more than two children are replaced by a binary tree. 
    \label{fig:reduction}
\end{figure*}

We prove that the problem is $\sharp$P-\emph{hard}: at least as hard as counting the number of satisfying assignments for a SAT formula. To show this, we construct a reduction from a $\sharp$P-\emph{complete} problem \textsc{Monotone 2-SAT} \citep{Valiant79}, where we are given a conjunction of $m$ clauses of two positive literals, $(a_1 \lor b_1) \land (a_2 \lor b_2) \land \dots \land (a_m \lor b_m)$ with $a_i, b_i \in \{x_1, x_2, \dots, x_n\}$. The objective is to count the number of its satisfying assignments.


\begin{theorem}
	The \BNSC problem is $\sharp$\textsc{P}-hard.
\end{theorem}
\begin{proof}
Consider the following reduction, visualized in Figure~\ref{fig:reduction}, to computing the total weight of all DAGs.

We have $n + 2m + 1$ vertices: $n$ vertices $X_1, X_2, \dots, X_n$ corresponding to the variables, $m$ vertices $C_1, C_2, \dots, C_m$ corresponding to the clauses, one additional node $B_i$ for each clause $C_i$, and a single node $D$ for choosing a subset of positive variables. If $D$ is a parent of $X_j$, then we interpret that as if the variable $x_j$ is set to be true, and false otherwise.

Let $V_i$ be the set of two variables in the clause $a_i \lor b_i$, and define the following weights:
\begin{itemize}
    \item $f_{C_i}(V_i) = 1$;
    \item $f_{B_i}(\{C_i\}) = M$ with $M \coloneqq 2^{n+1} - 1$;
    \item $f_{B_i}(\emptyset) = 1$;
    \item $f_{D}(\{B_1, B_2, \dots, B_m\}) = 1$;
    \item $f_{X_j}(\{D\}) = f_{X_j}(\emptyset) = 1$.
\end{itemize}

First, consider a satisfying assignment of variables $(x_j)$. The parent set of each $B_i$ has to be empty since otherwise there would be a cycle in the graph: there is a directed path from $B_i$ to $C_i$ through some $X_j$. Thus, the weight is $1$.

If the assignment $(x_j)$ does not satisfy all clauses, each unsatisfied clause $C_i$ contributes a factor $(M + 1)$ to the weight of the assignment: vertex $B_i$ can either have $C_i$ as a parent or have no parents, as there is no path from $D$ to $C_i$.

Now, the total weight modulo $M+1$ gives the number of satisfying assignments as any unsatisfying one is removed from the sum and there are exactly $2^n$ assignments.
\end{proof}

With some slight modifications, we get even stronger results:

\begin{corollary}
%	The \BNSC problem is $\sharp$\textsc{P}-hard even if each vertex can have at most $2$ parents and the weights are either $0$ or $1$.
	\BNSC is $\sharp$\textsc{P}-hard even if each vertex can have at most $2$ parents and the weights are either $0$ or $1$.
\end{corollary}
\begin{proof}
	In the original construction, $D$ has exactly $m$ parents. We fix this by instead constructing a binary tree of at most $2m$ vertices with the leaf layer connected to the vertices $B_i$.

	For achieving binary weights, we replace the possible edge between each $B_i$ and $C_i$ by an \emph{amplifier} that produces a big number if the edge could be there: For each node $B_i$, build a binary tree with $n+1$ leaf nodes with edges directed towards $B_i$. Then, let each leaf node have parent sets $\{C_i\}$ and $\emptyset$ with weight $1$. Again, none of them can have parents for a satisfying assignment, resulting in a weight of $1$. However, each of them has two choices for an unsatisfying assignment, producing a factor of $2^{n+1}$ to the sum of weights. Again, we take the result modulo $M + 1$.
\end{proof}


\subsection{Parameterized Case is $\sharp$W[1]-hard}

While constructing the same structure with different partitions into $N_1$, $N_2$, and $P$ is not an issue when searching for the best structure, it certainly is in counting the sum of the weights of all valid DAGs. Consider, for example, a network of four nodes $v_1$, $v_2$, $v_3$, and $v_4$ such that $v_1$ and $v_2$ are the parents of $v_3$, which is the parent of $v_4$. What should be its canonical representation? If we fix $k=2$, then $v_3$ has to be in $N_1$. However, now $v_1$ and $v_2$ cannot both be in $N_2$ nor in $N_1$ due to the v-structure and the parameter $k$. Further complicating the matter, we can put $v_4$ to $P$ or to $N_2$.

We proceed to show that \VCC is $\sharp$W[1]-hard, meaning that no algorithm of time complexity $f(k)n^{O(1)}$ is likely to exist under typical complexity-theoretical assumptions.

\begin{theorem}
	\VCC is $\sharp$\textsc{W}$[1]$-hard even if each vertex can have at most $2$ parents.
\end{theorem}
\begin{proof}
	The result follows rather directly from the W$[1]$-hardness result of \cite{Korhonen15} for \VC. Counting the cliques of size $k$ is a $\sharp$W$[1]$-complete problem \citep{Flum04}, and we show how to adapt \VCC into counting them.
	
	For each edge $\{v, w\}$ of the input graph $G = (V, E)$ for counting cliques, add a new \emph{edge vertex} $u$ whose parent set $\{v, w\}$ has an indeterminate weight $x$, an empty parent set has weight $1$, and the weights are $0$ otherwise. For vertices in $V$, let only the empty parent set have weight $1$.
	
	Now, assume that the vertices of $N_1$ form a clique of size $k$ in $G$. Then, there are exactly $k(k-1)/2$ edge vertices that can have a weight of $x$. Assuming all of their weights are $x$, the remaining vertices need to be parentless for the graph to achieve a positive weight. Additionally, the number of ways the vertices can be distributed between $P$ and $N_2$ is
	\[ \sum_{s=0}^k \binom{|V|+|E|-k}{s}. \]
	
	On the other hand, a set $N_1$ that is not a $k$-clique (for $k > 3$) cannot achieve a weight with a factor $x^{k(k-1)/2}$:
	% , that is, have $k(k-1)/2$ edges in $G$ connecting its vertices
	If $N_1$ contains $a$ edge vertices, then there can be at most \[ \binom{|N_1| - a}{2} + a = \binom{|N_1|}{2} + \frac{a^2 - (2 \cdot |N_1| - 3)a}{2} \]
	edge vertices with a parent set of value $x$. For $a > 0$, the rightmost summand is nonnegative only if $a \ge 2 \cdot |N_1| - 3$, but on the other hand $a \le |N_1|$, meaning that the summand can be nonnegative only if $|N_1| \le 3$. Thus, there are fewer than $\binom{k}{2}$ parent sets of value $x$ when $k > 3$ and $N_1$ is not a $k$-clique.
	
	Computing the sum $\binom{k}{2} + 1$ times over different values of $x$ enables us to discover the coefficient of the term $x^{k(k-1)/2}$ using polynomial interpolation. This is then divided by the constant number of ways of distributing the vertices between $P$ and $N_2$ to obtain the number of $k$-cliques in $G$.
\end{proof}

Curiously, the maximization variant is known to be W$[2]$-hard by a reduction from \textsc{Set Cover} \citep{Gruttemeier22}, but transforming that proof to show $\sharp$W$[2]$-hardness of \VCC appears challenging due to having to count each set cover only once.

\subsection{Exact Counting}

To prevent duplicate counting, we need to determine a canonical decomposition into the sets $N_1$, $N_2$, and $P$ for each valid structure. We achieve this by demanding that the core vertices must have children:

\begin{definition}
	A directed acyclic graph $G$ with $\tau(G) \le k$ and a partition into sets $N_1$, $N_2$, and $P$ are called a \emph{parent decomposition} if all vertices in $N_1$ and $N_2$ have a child. Furthermore, we require that either $|N_1| = k$ or $N_2 = \emptyset$.
\end{definition}

The latter constraint simplifies later proofs. Next, we show each valid DAG has such a decomposition and bound the number of ways of decomposing a DAG (used for sampling).

\begin{lemma}\label{lem:decomposition}
	Each directed acyclic graph $G$ with $\tau(G) \le k$ has at least one and at most $2^k$ parent decompositions. The upper bound is tight up to a polynomial factor in $k$ and is achieved by a core that is a chain of $3k/2$ vertices.
\end{lemma}
\begin{proof}
	Take any DAG $G$ with $\tau(G) \le k$ and its partition into the sets $N_1$, $N_2$, and $P$. Note that any childless vertex in $N_2$ can be moved to $P$. Similarly, any childless vertex in $N_1$ can be moved to $P$ if its (possible) parent in $N_2$ is moved to $N_1$. The size of the set $N_1$ does not increase, keeping it as a valid vertex cover. If $|N_1| \neq k$, we can always move vertices from $N_2$ to $N_1$ until it becomes full or $N_2$ is empty.
	
	Each parent decomposition of a DAG has the same set of core vertices $N_1 \cup N_2$ but distinct sets $N_2$. Therefore, we prove the upper bound by bounding the number of possible sets $N_2$ for any set of core vertices. Furthermore, we show that a \emph{chain} --- a directed path --- has the greatest number of such sets. First, note that $N_2$ has to be an independent set in the moralized graph and cannot contain core vertices with children in $P$. We proceed to restrict the graph structure and prove with injections that the maximum number of independent sets will not decrease.
	
	Pick any independent set in the moralized core. That set remains independent even if we remove all but one outgoing edge from each vertex before moralization. Thus, we have restricted ourselves to a directed forest.
	
	Now, consider any tree in the forest. If it is not a chain, there always exists a vertex $v$ whose ancestors would form multiple chains if $v$ were removed. We show that detaching all but one of these chains and joining them together into a longer chain has at least as many independent sets in the moralized graph using the crucial property that $v$ and its parents form a clique after moralization. We define the injection as follows: If none of the parents of $v$ is in the independent set, then the resulting independent set will remain the same in the longer chain. Otherwise, replace all descendants of the parent of $v$ in the independent set by their children. This works because the parents of $v$ separate the subchains from each other and only one of them can be in the independent set.
	
	Finally, observe that each chain ends in a vertex with children in the periphery since we required the partition to be a parent decomposition. Such vertices cannot be in $N_2$, meaning that concatenating the chains cannot decrease the number of parent decompositions. Thus, a single chain core has the largest number of them. If the chain consists of fewer than $k$ vertices, then $N_2$ is empty by the definition of a parent decomposition. Otherwise, the chain has $c$ vertices and there are $\binom{k}{c - k}$ ways to choose an independent set of size $c - k$. This is maximized when $c=3k/2$ and asymptotically approaches $2^k/\sqrt{k\pi/2}$ by Stirling's approximation.
\end{proof}

To summarize our approach, we iterate over all cores and their structures, connect $N_1$ to the periphery, and verify that
\begin{enumerate}[label=\roman*]
	\item it is a valid DAG,
	\item each core vertex has a child, and
	\item the choice of the set $N_2$ is lexicographically the smallest for that core (with a fixed vertex ordering).
\end{enumerate}

Before going through with the detailed proof of the algorithm, we need a data structure for efficiently computing the weights from connecting $N_1$ to the periphery.

\begin{lemma}\label{lem:periphery}
	Given a partition into sets $N_1$, $N_2$, and $P$, a data structure can be built in time $4^k n^{O(1)}$ that allows querying the total weight of parent set choices for $P$ with exactly the subset $S \subseteq N_1$ having children in $P$ in time $n^{O(1)}$. 
\end{lemma}
\begin{proof}
	Index vertices in $P = \{p_1, p_2, \dots, p_{n - |N_1| - |N_2|}\}$ arbitrarily. We build a dynamic programming table $\dpper(S,i)$ describing the total weight of parent set choices of $p_{1:i}$ with exactly the vertices in $S \subseteq N_1$ having children among $p_{1:i}$. For notational convenience, we omit $N_1$ and $P$ from the notation $\dpper$ even though the table depends on them. The table is initialized with $\dpper(\emptyset, 0) \coloneqq 1$ and $\dpper(S, 0) \coloneqq 0$ for all nonempty $S$, and has the recurrence relation
	\[ \dpper(S, i) \coloneqq \sum_{T \subseteq S} \sum_{\substack{U \subseteq S \\ U \supseteq S \setminus T}} \dpper(U, i-1) \cdot f_{p_i}(T). \]
	Here, $U$ is the set of vertices in $N_1$ that $p_{1:(i-1)}$ cover and $T$ is the parent set of $p_i$.
	
	To further simplify notation, we let 
	\[ \dpper(S) \coloneqq \dpper(S, n - |N_1| - |N_2|) \]
	for all $S$. This is the answer to the query for a given $S$.
\end{proof}

% The complexity of building the structure could be optimized to $3^k n^{O(1)}$ by nontrivial precomputation of the inner sums, but we ignore that here as other parts dominate the computational complexity.

We may now analyze the total complexity of our algorithm:

\begin{theorem}
	\VCC can be solved in time $2^{\binom{2k}{2}} 12^k n^{2k+O(1)}$.
\end{theorem}
\begin{proof}
	Assume that we have already partitioned the vertices into $N_1$, $N_2$, and $P$. We start by iterating over all permutations of $N_1 \cup N_2$ and use that as a topological ordering for our core DAG. Then, we iterate over parent sets of core vertices respecting that topological ordering. Finally, we verify in polynomial time that the topological ordering used to generate the DAG is the lexicographically smallest one obtainable from that DAG. This generation of all core DAGs requires roughly $(2k)!2^{\binom{2k}{2}}$ work, matching the number of DAGs up to a single exponential factor \citep{Stanley73}.
	
	Next, we need to assign edges from the core to the periphery. To this end, iterate over subsets $S$ of $N_1$ that should have children in $P$. We consider all such parent set assignments simultaneously by making use of the data structure of Lemma~\ref{lem:periphery}. Consider the graph resulting from any one of them. Any such graph clearly satisfies the property i. Property ii holds if vertices in $N_2$ and $N_1 \setminus S$ have a child in the core, which is checkable in polynomial time. Finally, we need to verify that $N_2$ is the lexicographically smallest possible set for the given structure. The set $N_2$ needs to be an independent set in the moralized graph without children in $P$. To check this, we moralize the core and iterate over independent subsets $T$ of $N_1 \setminus S$. If there is a subset of $N_2$ of size $|N_2| - |T|$ that is independent of $T$, then $T$ together with those vertices is also a valid option for the set $N_2$. Repeating this for all $T$ allows us to find the lexicographically smallest set. If all properties hold, then we add the weight of the core multiplied by the weight of parent set choices for $P$ such that exactly $S$ has children in $P$ to the total weight. Overall, there are $3^k$ choices for the sets $S$ and $T$.
	
	Finally, there are $\binom{n}{k}\binom{n-k}{k}$ unordered sets $N_1$ and $N_2$ over which we need to iterate. Thus, the total running time is $2^{\binom{2k}{2}} 12^k n^{2k+O(1)}$.
\end{proof}

\section{Sampling}

The exact counting algorithm is unfortunately doubly exponential to avoid duplicate counting, but can we do better in estimating the sum of weights? It is a common phenomenon that approximating is computationally simpler than exact counting (consider, e.g., estimating the permanent of a matrix \citep{Jerrum04}). As counting and sampling objects with probabilities proportional to their weights are inherently connected, we approach the issue through sampling.

We develop a sampling method for network structures using \emph{rejection sampling} in which we sample structures from a superset and then check whether they should be \emph{accepted} or \emph{rejected}. More precisely, we let valid structures appear multiple times in the superset and demand that exactly one of the duplicates should be accepted for each structure. This ensures the correct distribution of accepted samples and yields an acceptance probability that is the ratio of the total weight of the structures and the superset. Additionally, the sum of the weights is estimable by multiplying the empirical acceptance rate by the weight of the superset.

The key component of our algorithm is the fact of Lemma~\ref{lem:decomposition} that each valid DAG has at most $2^k$ parent decompositions. We use this property to first compute an approximation of the sum of the weights that is at most $2^k$ times greater than the exact sum. For this purpose, we iterate over all partitions of vertices into $N_1$, $N_2$, and $P$ to compute the sum of weights of network structures that have a parent decomposition consistent with the partition. In addition, we further separate the structures by considering the unique sets $S \subseteq N_1$ that are the sinks of the subgraph induced by the core, because they need children in the periphery.

Before continuing, we need one more tool to aid dynamic programming: The \emph{sink-layering} $L = (L_1, L_2, \dots, L_\ell)$ of a DAG is an ordered partition of the vertices such that the last layer $L_\ell$ contains the sink vertices, $L_{\ell - 1}$ the sinks after removing the vertices in $L_{\ell}$, and so on. More formally, the layer $L_i$ consists of the sinks of the subgraph induced by $V \setminus (L_{i+1} \cup L_{i+2} \cup \dots \cup L_{\ell})$. The concept is illustrated in Figure~\ref{fig:root}. This structure lets us go through the core layer by layer while only maintaining a few subsets of vertices: the previous layer, used vertices, and the guess for the current layer. The strategy has been applied successfully to sampling and counting DAGs by, for example, \cite{Kuipers15} and \cite{Talvitie19}.

\begin{figure}
        \small
        \centering
        \begin{tikzpicture}
            \node[shape=circle,draw,inner sep=1.5] (N11) at (0,-0.25) {$3$};
            \node[shape=circle,draw,inner sep=1.5] (N12) at (0,-1.5) {$2$};
            \node[shape=circle,draw,inner sep=1.5] (N13) at (0,-2.75) {$1$};
            \node[shape=circle,draw,inner sep=1.5] (N21) at (-1.5,-0.75) {$5$};
            \node[shape=circle,draw,inner sep=1.5] (N22) at (-1.5,-2.25) {$4$};
            \node[] (T1) at (0,0.55) {$N_1$};
            \node[] (T2) at (-1.5,0.55) {$N_2$};
            \path [-triangle 45] (N12) edge node[] {} (N13);
            \path [-triangle 45] (N11) edge[bend right] node[] {} (N13);
            \path [-triangle 45] (N21) edge node[] {} (N11);
            \path [-triangle 45] (N21) edge node[] {} (N13);
            \path [-triangle 45] (N22) edge node[] {} (N12);
            \draw[dotted, rounded corners, thick] (-1.9,0.15) rectangle (-1.1,-3.15);
            \draw[dotted, rounded corners, thick] (-0.4,0.15) rectangle (0.4,-3.15);
            
			\draw[dotted, thick] (2.5,0.55) -- (2.5,-3.15);
			\draw[dotted, thick] (3.5,0.55) -- (3.5,-3.15);
            \node[] (L1) at (2,0.55) {$L_1$};
            \node[] (L2) at (3,0.55) {$L_2$};
            \node[] (L3) at (4,0.55) {$L_3$};
            
            \node[shape=circle,draw,inner sep=1.5] (RN11) at (3,-0.75) {$3$};
            \node[shape=circle,draw,inner sep=1.5] (RN12) at (3,-2.25) {$2$};
            \node[shape=circle,draw,inner sep=1.5] (RN13) at (4,-1.5) {$1$};
            \node[shape=circle,draw,inner sep=1.5] (RN21) at (2,-0.75) {$5$};
            \node[shape=circle,draw,inner sep=1.5] (RN22) at (2,-2.25) {$4$};
            \path [-triangle 45] (RN12) edge node[] {} (RN13);
            \path [-triangle 45] (RN11) edge node[] {} (RN13);
            \path [-triangle 45] (RN21) edge node[] {} (RN11);
            \path [-triangle 45] (RN21) edge[bend right] node[] {} (RN13);
            \path [-triangle 45] (RN22) edge node[] {} (RN12);
        \end{tikzpicture}
    \caption{A core DAG and its sink-layering.} 
    \label{fig:root}
\end{figure}

To efficiently utilize sink-layerings, we require a data structure that allows us to query the total weight of parent assignments for the vertices of the current layer $C = L_i$ such that the union of their parents covers the previous layer $F = L_{i-1}$ and is a subset of vertices $U = \bigcup_{j=1}^{i-1} L_j$ that have already been used. We denote this sum by $W(C, F, U)$. Note that these values depend on the chosen sets $N_1$ and $N_2$ (vertices of $N_1$ can have only one parent from $N_2$), but we omit them from the notation for the sake of readability.

\begin{lemma}\label{lem:Wtime}
	The values of $W(C, F, U)$ can be computed in time $O\left(4^{2k}\right)$ for given sets of core vertices $N_1$ and $N_2$.
\end{lemma}
\begin{proof}
	We will iterate over all $O(3^{2k})$ choices for $C$ and $U$, and apply fast subset convolution to obtain the results for all values of $F \subseteq U$. This results in each of the $4^{2k}$ values of $W$ being computed in time that is polynomial in $k$ on average. 
	
	First, initialize $W_0(C, F, U) = 0$ for all $F \subseteq U$ with the exception of $W_0(C, \emptyset, U) = 1$. For $i > 0$, we index the set $C = \{v_1, v_2, \dots, v_{|C|}\}$ arbitrarily, and compute $W_i(C, F, U)$ as the sum \[ \sum_{T \subseteq F} W_{i-1}(C,T,U) \cdot \tilde{f}_{v_{i}}\big(F \setminus T \subseteq G_{v_i} \subseteq U \big), \] with $\tilde{f}_v(\psi(G_v))$ being the sum of weights of parent sets $G_{v}$ that satisfy the condition $\psi(G_v)$. In other words, for $v_1, v_2, \dots, v_i$ to cover a set $F$ of vertices, the parents of $v_1, v_2, \dots, v_{i-1}$ cover a subset of $F$ and $v_i$ the rest (and possibly some vertices that are already covered or outside $F$). As the left-hand side and the right-hand side factors depend only on $T$ and $F \setminus T$, respectively, for fixed $C$, $U$, and $i$, we can apply fast subset convolution to compute the value for all choices of $F$ simultaneously. Finally, we let $W(C, F, U) \coloneqq W_{|C|}(C, F, U)$.
	
	We can precompute $\tilde{f}_v$ in $O(4^{2k})$ time before starting to compute the values of $W_i$. Additionally, there are $4^{2k}$ possible values of $C$, $F$, and $U$, and each $W_i(C, F, U)$ takes $k^{O(1)}$ time to compute on average, proving the time complexity.
\end{proof}

\begin{lemma}\label{lm:coresum}
	Given a partition into sets $N_1$, $N_2$, and $P$ in addition to a subset $S$ of $N_1$, we can compute the total weight of valid parent set choices for the core such that exactly the vertices in $S$ are the sinks of the core in time $O(4^{2k})$.
\end{lemma}
\begin{proof}
	Begin by computing the values $W(C, F, U)$ with core vertices $N_1$ and $N_2$ in time $O\left(4^{2k}\right)$. Inspired by \cite{Talvitie19}, we proceed layer by layer from $L_1$ towards $L_\ell$. No layer is empty, so we need to consider at most $2k$ layers. Let $\overline{S}$ be the set $(N_1 \cup N_2) \setminus S$ and $\dpcore(U, F)$ denote the total weight of DAGs with a vertex set $U$ such that their sinks are exactly $F$. Again, $\dpcore$ depends on the sets $N_1$ and $N_2$, but we omit them for notational convenience. We initialize \[\dpcore(U, U) \coloneqq \prod_{v \in U} f_v(\emptyset)\] for all $U \subseteq \overline{S}$: the vertices in the first layer lack parents.
	
	For the following layers, we guess the previous layer, the current layer, and the set of used vertices. This results in the following recurrence for all $U, C \subseteq \overline{S}$ with $U \cap C = \emptyset$:
	\begin{align*}
		\dpcore(U \cup C, C) \coloneqq \sum_{F \subseteq U} \dpcore(U, F) \cdot W(C, F, U)
	\end{align*}
	
	Finally, we need to include $S$ on the last layer by defining \[ \dpcore(N_1 \cup N_2, S) \coloneqq \sum_{F \subseteq \overline{S}} \dpcore(\overline{S}, F) \cdot W(S, F, \overline{S}). \]
	
	The variable $\dpcore(N_1 \cup N_2, S)$ now contains the desired value. There are roughly $4^{2k}$ choices for the sets $C$, $F$, and $U$, proving the complexity. %so the time complexity is dominated by the precomputation of the values of $W$.
\end{proof}

\begin{theorem}
	A $2^k$-approximation of the solution to \VCC can be computed in time $(4\mathrm{e}n/k)^{2k} n^{O(1)}$.
\end{theorem}
\begin{proof}
	We calculate the $2^k$-approximation by iterating over the parent decompositions and the sets $S$ of core vertices without children in the core. This corresponds to the sum \[ \mathrm{UB} \coloneqq \sum_{(N_1, N_2, P)} \sum_{S \subseteq N_1} \dpcore(N_1 \cup N_2,S) \sum_{T \supseteq S} \dpper(T). \]
	
	By Lemma~\ref{lem:decomposition}, each valid DAG has at least one and at most $2^k$ parent decompositions, proving the accuracy guarantee.
	
	The rightmost sum can be precomputed beforehand for all $S$. Then, we compute the values of $\dpcore$ for all vertex sets $N_1$ and $N_2$ of size at most $k$. By Lemma~\ref{lm:coresum}, each of them takes $O\left(4^{2k}\right)$ time to compute, and we consider roughly $\binom{n}{k}\binom{n-k}{k} \approx n^{2k} / (k!)^2$ sets $N_1$ and $N_2$. By Stirling's approximation, this takes $(4\mathrm{e}n/k)^{2k} n^{O(1)}$ time in total.
\end{proof}

We can now prove our main result on sampling. Hereinafter, write $\dec G$  for the number of parent decompositions of $G$.

\begin{theorem}
	\VCS can be solved with preprocessing time $(4\mathrm{e}n/k)^{2k} n^{O(1)}$ and expected sampling time $4^k n^{O(1)}$ using a randomized algorithm.
\end{theorem}
\begin{proof}
	Nearly all our data structures use only simple summation over sets of vertices with constraints. This allows the use of standard stochastic backtracking routines to sample from the structures: If the summands are saved in an array, we can run binary search (after trivial preprocessing) to move backwards in the dynamic programming tables to deduce the structure of the graph. As the sizes of the tables are exponential in $k$, the running time of the binary search is only linear in $k$ and logarithmic in $n$.
	
	First, we sample the partition $(N_1, N_2, P)$ of the vertices of the DAG and the set $S$, and then proceed to determine the structure of the core and the periphery. At this point, the probability of drawing $G$ is proportional to $f(G) \cdot \dec G$.
	
	Finally, we need to prevent sampling the same structure in multiple ways. This is solved with rejection sampling by accepting only one parent decomposition for each DAG. Similar to exact counting, we accept the parent decomposition if and only if $N_2$ is the lexicographically smallest set possible among all parent decompositions of $G$. We iterate over all subsets $T$ of $N_1$ that are independent in the moralized graph, and find $|N_2| - |T|$ lexicographically smallest vertices in $N_2$ that are independent of $T$. If the current independent set $N_2$ is the lexicographically smallest obtainable set, accept the sample, and otherwise reject it. Thus, the acceptance probability of a DAG $G$ is $1 / \dec G \ge 2^{-k}$.
\end{proof}

Overall, the probability of accepting a sample is then 
\[ \frac{\sum_{G \in \mathcal{V}_k} f(G)}{\sum_{G \in \mathcal{V}_k} f(G) \cdot \dec G}, \] where the denominator is the normalizing constant $\mathrm{UB}$, which is already known from preprocessing. Therefore, rejection sampling also enables us to estimate the sum of weights to arbitrary precision by approximating the acceptance ratio empirically and multiplying it by $\mathrm{UB}$.

We say that an algorithm computes an $(\epsilon, \delta)$-approximation of a quantity if its relative error is at most $\epsilon$ with probability at least $1 - \delta$. By a result of \cite{Dagum00}, we have an $(\epsilon, \delta)$-approximation of the acceptance ratio immediately after getting $1+4(\mathrm{e}-2)(1+\epsilon)\epsilon^{-2} \ln(2 / \delta)$ accepted draws. Thus, we have the following result:

\begin{theorem}
	\VCC admits an $(\epsilon, \delta)$-approximation in expected time $\left((4\mathrm{e}n/k)^{2k} + 4^k \epsilon^{-2} \ln\big(\delta^{-1}\big) \right) n^{O(1)}$.
\end{theorem}
\begin{proof}
	The preprocessing takes $(4\mathrm{e}n/k)^{2k} n^{O(1)}$ time, and sampling takes $4^k \epsilon^{-2} \ln\big(\delta^{-1}\big) n^{O(1)}$ time on average.
\end{proof}

\section{Concluding Remarks}

We advanced three problems on learning Bayesian network structures---maximization, weighted counting, and sampling---parameterized by the vertex cover number of the moralized DAG. First, we showed a nearly quadratic speedup on a previous maximization algorithm, thus significantly extending the scope of practical instances. Second, we presented the first parameterized algorithms for the sampling and counting variants, with polynomial running times for any constant value of the parameter. It remains an open question whether a quadratic speedup can be achieved also for these variants. We complemented our algorithmic results by complexity-theoretical hardness results on weighted counting both in the general and in the parameterized case.

The difficulties we encountered in avoiding duplicate counting raise an intriguing question: are there parameterized classes of DAGs where counting is significantly harder than maximization? One candidate parameter is the number of edges in the DAG, which renders the exponent of $n$ independent of the parameter for maximization \citep{Gruttemeier22}; the problems resemble those of finding and counting paths of desired length, the latter variant known to be significantly harder \citep{Chen07}. 
Overall, counting and sampling have been studied little compared to the maximization variant, leaving numerous questions for future research.

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    We thank the reviewers for their suggestions on improving the paper. This research was partially supported by the Academy of Finland, grants 316771 and 351156.
\end{acknowledgements}

\bibliography{harviainen_342}

\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
%\providecommand{\upGamma}{\Gamma}
%\providecommand{\uppi}{\pi}
%\section{Math font exposition}
%How math looks in equations is important:
%\begin{equation*}
%  F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
%\end{equation*}
%However, one should not ignore how well math mixes with text:
%The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
%It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\end{document}
