\documentclass[accepted]{uai2022}

\usepackage[american]{babel}

\usepackage[round]{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools}

\usepackage{booktabs}
\usepackage{zref-xr}
\usepackage{nameref}
\usepackage{hyperref}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{bbm}
\usepackage{xifthen}
\usepackage{graphicx}
\usepackage{adjustbox}
\usepackage{bibentry}

\usepackage{standalone}
\usepackage[dvipsnames]{xcolor}
\usepackage{tikz}
\usetikzlibrary{positioning}
\usetikzlibrary{arrows}
\usetikzlibrary{calc,fit}
\usetikzlibrary{shapes.geometric}
\usetikzlibrary{shapes.misc}
\usetikzlibrary{decorations.pathmorphing}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{snakes}
\usepackage{pgfplots}
\usepgfplotslibrary{groupplots}
\pgfplotsset{compat=1.16}
\usepgfplotslibrary{statistics}

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% Single color shared by every kind of hyperlink configured below.
\definecolor{darkblue}{rgb}{0.0, 0.0, 0.55}
% hyperref configuration: PDF metadata (title, keywords, authors) and link appearance.
\hypersetup{
	pdftitle={Bayesian Structure Learning with Generative Flow Networks},
	pdfkeywords={},
	% Zero-width link border; together with colorlinks below, links are shown as colored text.
	pdfborder=0 0 0,
	pdfpagemode=UseNone,
	colorlinks=true,
	% Internal links, citations, file links and URLs all use the same color.
	linkcolor=darkblue,
	citecolor=darkblue,
	filecolor=darkblue,
	urlcolor=darkblue,
	pdfview=FitH,
	pdfauthor={Tristan Deleu, Antonio Gois, Chris Emezue, Mansi Rankawat, Simon Lacoste-Julien, Stefan Bauer, Yoshua Bengio}
}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theorem-like environments (amsthm). "theorem" is numbered within sections;
% every other environment below is declared with [theorem] and therefore
% shares the "theorem" counter.
% "plain" style: theorem, proposition, lemma, corollary.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% "definition" style: definition, assumption.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% "remark" style: remark.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\input{math_commands.tex}

% cleveref reference formats.
% In every format, #1 is the label number, and #2 / #3 mark the start / end of
% the hyperlink.
% \crefmultiformat{type}{first}{second (of exactly two)}{middle}{last}
% controls how several labels of the same type are listed in one \cref.
\crefformat{section}{#2Section~#1#3}
\crefformat{appendix}{#2Appendix~#1#3}

% A single equation is cited by its parenthesized number only.
\crefformat{equation}{(#2#1#3)}
\crefmultiformat{equation}{#2Equations~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, \&~#2#1#3}

\crefformat{table}{#2Table~#1#3}
\crefformat{figure}{#2Figure~#1#3}

% The last item of a list of three or more is written ", and N", without
% parentheses, so that it matches the other items of the same list.
\crefformat{theorem}{#2Theorem~#1#3}
\crefmultiformat{theorem}{#2Theorems~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

\crefformat{lemma}{#2Lemma~#1#3}
\crefformat{proposition}{#2Proposition~#1#3}
\crefmultiformat{proposition}{#2Propositions~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

\crefformat{algorithm}{#2Algorithm~#1#3}
\crefmultiformat{algorithm}{#2Algorithms~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

\crefformat{corollary}{#2Corollary~#1#3}
\crefformat{definition}{#2Definition~#1#3}

\crefformat{assumption}{#2Assumption~#1#3}
\crefmultiformat{assumption}{#2Assumptions~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

% Upright labels for graph relations: \children typesets "Ch" and \Pa typesets "Pa".
% In the text, Ch(s) denotes the children of a state s, and Pa_G(X) the parents
% of a node X in the graph G.
\newcommand{\children}{\mathrm{Ch}}
\newcommand{\Pa}{\mathrm{Pa}}

% Import the labels of the external document "deleu_607" with zref-xr, exposing
% them as regular LaTeX labels (toltxlabel=true) rather than zref labels.
% NOTE(review): presumably "deleu_607" is the main paper, whose sec:/eq:/fig:/tab:
% labels are referenced throughout this supplement -- confirm that its .aux file
% is available at compile time.
\zxrsetup{toltxlabel=true,tozreflabel=false,verbose}
\zexternaldocument*{deleu_607}

\title{Bayesian Structure Learning with Generative Flow Networks\newline(Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Separators between author names: a fixed 2em horizontal space replaces the
% default separators between authors, before the last of two, and before the
% last of several.
% NOTE(review): \leading is not defined in this file; presumably it is provided
% by the uai2022 class or by math_commands.tex -- confirm.
\renewcommand{\Authsep}{\hspace{2em}}
\renewcommand{\Authand}{\hspace{2em}}
\renewcommand{\Authands}{\leading{18pt}\hspace{2em}}

% No star for \thanks
% \thanks is redefined so that it only appends a \footnotetext{...} to the
% global \@thanks list; this definition places no footnote mark at the call site.
\makeatletter
\def\thanks#1{\protected@xdef\@thanks{\@thanks\protect\footnotetext{#1}}}
\makeatother

% Affiliations on single line
% The separator between consecutive affiliations becomes a 1em horizontal
% space followed by the affiliation font.
% NOTE(review): \AB@affilsepx looks like an internal macro of the authblk
% package -- confirm it is still honored by the class version in use.
\makeatletter
\renewcommand\AB@affilsepx{\hspace{1em}\protect\Affilfont}
\makeatother

\author[1]{Tristan~Deleu}
\author[1]{Ant\'{o}nio~G\'{o}is}
\author[2]{Chris~Emezue}
\author[1]{Mansi~Rankawat}
\author[1,4]{\authorcr{Simon~Lacoste-Julien}}
\author[3,5]{Stefan~Bauer}
\author[1,4,6]{Yoshua~Bengio}
\affil[1]{Mila, Universit\'{e} de Montr\'{e}al}
\affil[2]{Technical University of Munich}
\affil[3]{KTH Stockholm\protect\\[0.5em]}
\affil[4]{CIFAR AI Chair}
\affil[5]{CIFAR Azrieli Global Scholar}
\affil[6]{CIFAR Senior Fellow}
  
\begin{document}
\onecolumn
\maketitle
\appendix
% Number equations as "<section>.<n>" and restart the count in every appendix
% section. \numberwithin (amsmath) sets \theequation to
% \thesection.\arabic{equation} and also resets the equation counter whenever
% the section counter is stepped; redefining \theequation alone would let the
% count run on across sections (B.1, B.2, then D.3).
\numberwithin{equation}{section}
\setcounter{equation}{0}
% The next figure is numbered 6.
% NOTE(review): presumably this continues the numbering of the main paper
% (5 figures) -- confirm.
\setcounter{figure}{5}

\section{Limitations of DAG-GFlowNet}
\label{app:limitations}
Although we have shown in the main paper that DAG-GFlowNet is capable of learning an accurate approximation of the posterior distribution $P(G\mid \gD)$ when the size of the dataset $\gD$ is moderate (a situation where the benefits of a Bayesian treatment of structure learning are larger), we observed that as the size of the dataset increases, fitting the detailed-balance loss in \cref{eq:expected-detailed-balance-loss} was more challenging. This can be explained by the fact that with a larger amount of data, the posterior distribution becomes very peaky \citep{koller2009pgm}. As a consequence, in this situation, the delta-score in \cref{eq:delta-score}, which is required to calculate the loss, can take a wide range of values: adding an edge to a graph can drastically increase or decrease its score. In turn, the neural network parametrizing $P_{\theta}(G_{t+1}\mid G_{t})$ needs to compensate for these large fluctuations, making it harder to train.

Unfortunately, some of the standard techniques used in Machine Learning to tackle this issue, such as normalization of the inputs, cannot be applied here. Normalizing the delta-score is equivalent to normalizing the rewards $R(G)$ and $R(G')$ themselves, and as a consequence it would change the distribution that is being approximated: instead of approximating the posterior distribution $P(G \mid \gD)$, we would approximate a distribution $P(G\mid \gD)^{\tau}$ under some temperature $\tau$. Solutions to this problem include a schedule of temperature, similar to simulated annealing, or a reparametrization of $P_{\theta}(G_{t+1}\mid G_{t})$ to better handle large fluctuations of delta-scores; this exploration is left as future work.

\section{Detailed-balance condition with all complete states}
\label{app:detailed-balance-condition}
In this section, we will prove a special case of the \emph{detailed-balance condition} introduced by \citet{bengio2021gflownetfoundations} applied to the case where all the states of the GFlowNet are complete (except the terminal state $s_{f}$). To simplify the presentation, we will follow the notations of \citet{bengio2021gflownetfoundations}, and denote the forward transition probability by $P_{F}(s_{t+1} \mid s_{t})$---instead of $P_{\theta}(s_{t+1} \mid s_{t})$ in the main paper. Recall that the detailed-balance condition \citep[][Def.~17]{bengio2021gflownetfoundations} is given by
\begin{equation}
    F(s_{t})P_{F}(s_{t+1}\mid s_{t}) = F(s_{t+1})P_{B}(s_{t}\mid s_{t+1}).
    \label{eq:vanilla-detailed-balance-condition}
\end{equation}
In the case where all the states are complete, we also know that \citep[][Def.~16]{bengio2021gflownetfoundations}
\begin{equation*}
    P_{F}(s_{f}\mid s_{t}) := \frac{F(s_{t} \rightarrow s_{f})}{\sum_{s'\in\mathrm{Ch}(s_{t})}F(s_{t} \rightarrow s')} = \frac{R(s_{t})}{F(s_{t})} \quad \Leftrightarrow \quad F(s_{t}) = \frac{R(s_{t})}{P_{F}(s_{f}\mid s_{t})},
\end{equation*}
where $F(s \rightarrow s')$ represents the flow from state $s$ to $s'$, as described in \cref{sec:generative-flow-networks}, $F(s)$ is the total flow through state $s$, and we used Proposition~4 \& Equation~34 of \citet{bengio2021gflownetfoundations} to introduce $F(s_{t})$ and $R(s_{t})$ respectively. Replacing $F(\cdot)$ in \cref{eq:vanilla-detailed-balance-condition} yields the expected condition:
\begin{equation}
    R(s_{t})P_{F}(s_{t+1}\mid s_{t})P_{F}(s_{f}\mid s_{t+1}) = R(s_{t+1})P_{B}(s_{t}\mid s_{t+1})P_{F}(s_{f}\mid s_{t}).
\end{equation}

The original formulation in \cref{eq:vanilla-detailed-balance-condition} would require us to parametrize both $P_{F}(s_{t+1} \mid s_{t})$ and $F(s)$. On the other hand, using this alternative condition, we only have to parametrize $P_{F}(s_{t+1} \mid s_{t})$ (including when $s_{t+1} = s_{f}$ is the terminal state).

\section{Definition and update of the mask over actions}
\label{app:mask}
In \cref{sec:structure-gflownet}, we introduced a mask $\vm$ associated with a DAG $G$ to indicate which edges could be legally added to $G$ to obtain a new valid DAG $G'$. This mask must ignore (1) the edges already present in $G$ (which cannot be added further), and (2) any edge whose addition leads to the introduction of a cycle. The mask $\vm$ is constructed using (1) the adjacency matrix of $G$, and (2) the adjacency matrix of the transitive closure of $G^{\top}$, the transpose graph of $G$; recall that $G^{\top}$ is obtained from $G$ by inverting the direction of its edges.

\citet{giudici2003improvingmcmc} use a similar construction to efficiently obtain the legal actions their MCMC sampler may take. In particular, they show that this mask $\vm$ can be updated very efficiently online as edges are added one by one. In practice, this allows us to circumvent an expensive check for cycles at every stage of the construction of a sample DAG in the GFlowNet. Since the mask can be decomposed into two parts (as explained above), we can simply update each part anytime a new edge is added to a DAG $G$.

\begin{figure*}[t]
    \centering
    \includestandalone[width=0.8\linewidth]{figures/online-update-mask}
    \caption{Online update of the mask $\vm$. The mask $\vm_{t}$ associated with $G_{t}$ represents (in black) the edges that cannot be added to $G_{t}$ to obtain a valid DAG. $\vm_{t}$ is decomposed in two parts: the adjacency matrix of $G_{t}$ (top), and the transitive closure of $G_{t}^{\top}$ (bottom). To update the mask and obtain $\vm_{t+1}$ associated with $G_{t+1}$, the result of adding the edge $C \rightarrow A$ to $G_{t}$, each component must be updated separately, and then recombined. The diagonal elements of $\vm_{t}$, corresponding to self-loops (which are always invalid actions to take) are integrated into the transitive closure of $G_{t}^{\top}$ by convention.}
    \label{fig:online-update-mask}
\end{figure*}

In \cref{fig:online-update-mask}, we show how the mask $\vm_{t}$ associated with a graph $G_{t}$ can be updated after adding a new edge $C \rightarrow A$ to obtain the mask $\vm_{t+1}$. The mask is decomposed into two parts: the adjacency matrix of $G_{t}$, and the transitive closure of $G_{t}^{\top}$. After adding $C \rightarrow A$, each component is updated separately:
\begin{enumerate}
    \item \textbf{Adjacency matrix:} To update the adjacency matrix, the entry in the adjacency matrix must be set (here, the entry corresponding to the edge $C \rightarrow A$).
    \item \textbf{Transitive closure:} To update the transitive closure of the transpose, we need to compute the outer product of the column corresponding to the target of the edge (here $A$, in blue) with the row corresponding to the source of the edge (here $C$, in red). The outer product is added (more precisely, this is a binary OR) to the initial transitive closure.
\end{enumerate}
These two operations can be done very efficiently in $O(d^{2})$, where $d$ is the number of nodes in the DAG.

\section{Additional experimental results}
\label{app:additional-experimental-results}

\subsection{Details about the metrics}
\label{app:details-metrics}
Throughout this paper, we mainly used two metrics to compare the performance of DAG-GFlowNet against alternative Bayesian structure learning algorithms: the \emph{expected SHD} ($\E$-SHD), and the \emph{area under the ROC curve} (AUROC). Let $\{G_{1}, \ldots, G_{n}\}$ be samples from the posterior approximation to be evaluated, and $G^{\star}$ be the ground truth graph. The $\E$-SHD to $G^{\star}$ can be estimated as
\begin{equation}
    \E\text{-SHD} \approx \frac{1}{n}\sum_{k=1}^{n}\mathrm{SHD}(G_{k}, G^{\star}),
\end{equation}
where $\mathrm{SHD}(G, G^{\star})$ counts the number of edge changes (adding, removing, or reversing an edge) necessary to move from $G$ to $G^{\star}$.

\subsection{Simulated data}
\label{app:simulated-data-lingauss50}
In addition to the experiments on simulated data with graphs over $d=20$ nodes, we also compared DAG-GFlowNet with other methods on graphs with $d=50$ nodes. The experimental setup described in \cref{sec:simulated-data} remains unchanged, and the data generation process is detailed below. We show this comparison in \cref{fig:lingauss50}, in terms of $\E$-SHD, AUROC, and the joint log-likelihood $\log P(G, \gD' \mid \gD)$ on some held-out dataset $\gD'$. We observe that DAG-GFlowNet is still competitive compared to the other algorithms, even though it suffers from a higher variance. This can be partly explained by the neural network parametrizing the forward transition probability $P_{\theta}(G_{t+1} \mid G_{t})$ (see \cref{sec:forward-transition-probabilities,sec:parametrization-linear-transformers}) underfitting the data, and therefore not accurately matching the detailed-balance conditions, necessary for a close approximation of the posterior distribution $P(G\mid \gD)$. Similar to our observations in \cref{sec:application-flow-cytometry-data}, we also noticed that algorithms that tend to perform better in terms of $\E$-SHD (e.g., BCD Nets, Bootstrap-PC) tend to have an order of magnitude fewer edges in the sampled DAGs.
\begin{figure*}[t]
    \centering
    \includestandalone[width=\linewidth]{figures/lingauss50}
    \caption{Bayesian structure learning of linear-Gaussian Bayesian networks with $d = 50$ nodes. Results for $\E$-SHD \& AUROC are aggregated over 10 randomly generated datasets $\gD$, sampled from different (ground-truth) Bayesian networks. Results for $\log P(G, \gD' \mid \gD)$ are given for a single dataset $\gD$; the dashed line corresponds to the log-likelihood of the ground truth graph. Labels: B-PC = Bootstrap-PC, B-GES = Bootstrap-GES, BCD = BCD Nets, GFN = DAG-GFlowNet.}
    \label{fig:lingauss50}
\end{figure*}

\paragraph{Data generation} For our experiments on simulated data, we followed the generation process described by \citet{lorch2021dibs}. The data was generated in the following way:
\begin{enumerate}
    \item We sampled a DAG using an Erd\H{o}s-R\'{e}nyi model \citep{erdos1960ergraphs}, with $2d$ edges on average; the value of the probability of creating an edge between two nodes was scaled accordingly.
    \item Once the structure of the graph is known, we sampled the parameters of the linear-Gaussian model randomly from a standard Normal distribution $\gN(0, 1)$. The linear-Gaussian model is therefore defined as, $\forall j \in [1, d]$
    \begin{equation*}
        X_{j} = \sum_{X_{i} \in \mathrm{Pa}_{G}(X_{j})} \beta_{ij}X_{i} + \varepsilon,
    \end{equation*}
    where $\beta_{ij} \sim \gN(0, 1)$, and $\varepsilon \sim \gN(0, 0.01)$. This defines all the conditional probability distributions of the generative model.
    \item Once the full Bayesian Network is known, we used ancestral sampling to generate $N = 100$ datapoints to fill our dataset $\gD$.
\end{enumerate}

\subsection{Flow Cytometry Data}
\label{app:sachs-comparison}
In \cref{sec:application-flow-cytometry-data}, we described an application of DAG-GFlowNet to real-world flow cytometry data. In particular, we showed in \cref{fig:sachs-comparison-mcmc} that DAG-GFlowNet was capable of modeling a distribution that not only captured the mode of the posterior distribution (i.e., graphs with a high score), but also exhibited diversity in the sampled graphs, both in terms of the different Markov Equivalence Classes (MECs) those graphs belong to, and in terms of the multiple unique DAG instances sampled from the same MEC (depicted by the size of each point in \cref{fig:sachs-comparison-mcmc}).

\begin{figure}[t!]
    \centering
    \includestandalone[width=\linewidth]{figures/sachs-comparison/comparison_all}
    \caption{Coverage of the posterior approximations learned on flow cytometry data \citep{sachs2005causal}. Each point corresponds to a sampled Markov equivalence class, and its size represents the number of different DAGs (in the equivalence class) sampled from the posterior approximation.}
    \label{fig:sachs-comparison-all}
\end{figure}

However, for clarity, we only compared DAG-GFlowNet to methods based on MCMC in \cref{fig:sachs-comparison-mcmc}. In \cref{fig:sachs-comparison-all}, we also added a comparison to BCD Nets \citep{cundy2021bcdnets} and DiBS \citep{lorch2021dibs}, two methods based on Variational Inference. Although we saw in \cref{tab:sachs-continuous} that those two methods compared favorably with other algorithms in terms of $\E$-SHD and AUROC, including DAG-GFlowNet, we can assess more precisely the quality of the posterior approximation returned by BCD Nets and DiBS:
\begin{itemize}
    \item The $1,\!000$ graphs sampled with BCD Nets all belonged to one of only two MECs (with a BGe score around $-10,\!950$). Furthermore, as shown by the size of each point, those MECs happen to only contain a single unique DAG. Overall, this means that BCD Nets only returned 2 unique DAGs (out of the $1,\!000$ samples), showing the lack of diversity of the posterior approximation learned with BCD Nets.
    \item DiBS sampled a significant number of very low scoring DAGs, with BGe scores as low as $-12,\!600$ (whereas the best MEC obtained with GES \citep{chickering2002ges} had a score of $-10,\!716.12$).
    \item With our choice of the BGe score, the true posterior distribution would assign the same probability to graphs in the same MEC. However, we can see that DiBS only returned graphs belonging to unique MECs, as opposed to having multiple unique DAGs from the same MEC. This shows that while DiBS has a high diversity in terms of MECs (mainly due to covering low-scoring DAGs), DiBS suffers from a lack of diversity within a single MEC, which would be expected from a faithful approximation of the posterior distribution.
\end{itemize}

\nobibliography{references}

\end{document}
