\documentclass[accepted]{uai2024}
\input{math_commands.tex}
% Upright "soft" label used as a sub/superscript, e.g. for the soft MDP
% $\gM_{\soft}$ and the soft value function $V_{\soft}$.
\newcommand{\soft}{\mathrm{soft}}

\usepackage{hyperref}
\usepackage{url}
\usepackage{amsmath,amsfonts,amsthm,amssymb}
\usepackage{mathtools}
\usepackage{thmtools}
\usepackage{thm-restate}
\usepackage{bbm}
\usepackage{booktabs}
\usepackage{adjustbox}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{colortbl}
\usepackage{floatpag}

\usepackage{xcolor}
% Inline to-do marker: typesets its single argument in red (needs xcolor).
\newcommand\TODO[1]{\textcolor{red}{#1}}
\usepackage[mode=buildnew]{standalone}
\usepackage{tikz}
\usepackage{subcaption}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\floatpagestyle{empty}

\usetikzlibrary{positioning}
\usetikzlibrary{arrows}
\usetikzlibrary{calc,fit}
\usetikzlibrary{shapes.geometric}
\usetikzlibrary{shapes.misc}
\usetikzlibrary{decorations.pathmorphing}
\usetikzlibrary{decorations.pathreplacing}
\usepackage[capitalize,noabbrev]{cleveref}

% Link appearance: all hyperlinks (internal links, citations, files, URLs) are
% coloured text in a single dark blue, with no border drawn around them.
% The PDF metadata fields (title, author, subject, keywords) are left empty.
\definecolor{mydarkblue}{rgb}{0,0.08,0.45}
\hypersetup{
    pdftitle={},
    pdfauthor={},
    pdfsubject={},
    pdfkeywords={},
    pdfborder=0 0 0,
    pdfpagemode=UseNone,
    colorlinks=true,
    linkcolor=mydarkblue,
    citecolor=mydarkblue,
    filecolor=mydarkblue,
    urlcolor=mydarkblue,
}

% Theorem-like environments. `theorem` is numbered within sections; every other
% environment below shares the `theorem` counter, so all statements follow a
% single running numbering (e.g. Theorem 3.1, Lemma 3.2, ...).
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definition-style environments.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remark-style environments.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% cleveref reference formats. In each template, #1 is the reference number and
% #2...#3 enclose the hyperlinked part. \crefmultiformat takes, in order, the
% templates for: the first item, the second item of a pair, the middle items of
% a longer list, and the last item of a longer list.
\crefformat{chapter}{#2Chapter~#1#3}
\crefformat{section}{#2Section~#1#3}
\crefformat{appendix}{#2Appendix~#1#3}

% A single equation is cited as a bare parenthesised number, e.g. "(3)".
\crefformat{equation}{(#2#1#3)}
\crefmultiformat{equation}{#2Equations~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, \&~#2#1#3}

% For the non-equation types below, the last item of a list is typeset without
% parentheses ("Figures 1, 2, and 3" rather than "Figures 1, 2, and (3)").
\crefformat{table}{#2Table~#1#3}
\crefformat{figure}{#2Figure~#1#3}
\crefmultiformat{figure}{#2Figures~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

\crefformat{theorem}{#2Theorem~#1#3}
\crefmultiformat{theorem}{#2Theorems~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

\crefformat{lemma}{#2Lemma~#1#3}
\crefformat{proposition}{#2Proposition~#1#3}
\crefmultiformat{proposition}{#2Propositions~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

\crefformat{algorithm}{#2Algorithm~#1#3}
\crefmultiformat{algorithm}{#2Algorithms~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

\crefformat{corollary}{#2Corollary~#1#3}
\crefformat{definition}{#2Definition~#1#3}

\crefformat{assumption}{#2Assumption~#1#3}
\crefmultiformat{assumption}{#2Assumptions~#1#3}%
{ \&~#2#1#3}{, #2#1#3}{, and~#2#1#3}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    % Bibliography style, and a "References" heading typeset as an unnumbered
    % \subsubsection instead of natbib's default bibliography heading.
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\title{Discrete Probabilistic Inference as Control\\in Multi-path Environments}

% No star for \thanks
% Redefines \thanks so that it only appends a \footnotetext carrying its
% argument to the global \@thanks list; no footnote mark is emitted at the
% point of use. \makeatletter/\makeatother are needed for the @-names.
\makeatletter
\def\thanks#1{\protected@xdef\@thanks{\@thanks\protect\footnotetext{#1}}}
\makeatother

% Affiliations on single line
% Replaces the separator placed between consecutive affiliations with a 1em
% horizontal space followed by the affiliation font switch.
% NOTE(review): \AB@affilsepx is presumably the authblk internal loaded by the
% class -- confirm, since \renewcommand fails if it is undefined.
\makeatletter
\renewcommand\AB@affilsepx{\hspace{1em}\protect\Affilfont}
\makeatother

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% The optional argument of \author lists the affiliation labels of that author,
% matching the labels given to \affil below. The first author carries two
% \thanks notes (correspondence address and code repository).
\author[1,3]{Tristan~Deleu\thanks{Correspondence to: Tristan Deleu <\href{mailto:deleutri@mila.quebec}{deleutri@mila.quebec}>}\thanks{Code: \href{https://github.com/tristandeleu/gfn-maxent-rl}{https://github.com/tristandeleu/gfn-maxent-rl}}}
\author[2]{Padideh~Nouri}
\author[1]{Nikolay~Malkin}
\author[2,4]{Doina~Precup}
\author[1]{Yoshua~Bengio}
% Add affiliations after the authors
% The first affiliation has an empty label and is followed by a line break with
% 1em of extra space, so it sits on its own line above the labelled ones.
% NOTE(review): the negative \hspace presumably cancels the space left by the
% empty label -- confirm in the compiled output.
\affil[ ]{\protect\hspace*{-1.3em}Mila -- Quebec AI Institute\protect\\[1em]}
\affil[1]{Universit\'{e} de Montr\'{e}al}
\affil[2]{McGill University}
\affil[3]{Valence Labs}
\affil[4]{Google DeepMind}

\begin{document}
\maketitle

\begin{abstract}
  We consider the problem of sampling from a discrete and structured distribution as a sequential decision problem, where the objective is to find a stochastic policy such that objects are sampled at the end of this sequential process proportionally to some predefined reward. While we could use maximum entropy Reinforcement Learning (MaxEnt RL) to solve this problem for some distributions, it has been shown that in general, the distribution over states induced by the optimal policy may be biased in cases where there are multiple ways to generate the same object. To address this issue, Generative Flow Networks (GFlowNets) learn a stochastic policy that samples objects proportionally to their reward by approximately enforcing a conservation of flows across the whole Markov Decision Process (MDP). In this paper, we extend recent methods correcting the reward in order to guarantee that the marginal distribution induced by the optimal MaxEnt RL policy is proportional to the original reward, regardless of the structure of the underlying MDP. We also prove that some flow-matching objectives found in the GFlowNet literature are in fact equivalent to well-established MaxEnt RL algorithms with a corrected reward. Finally, we study empirically the performance of multiple MaxEnt RL and GFlowNet algorithms on multiple problems involving sampling from discrete distributions.
\end{abstract}

\section{Introduction}
\label{sec:introduction}
Approximate probabilistic inference has seen a tremendous amount of progress, notably from a variational perspective coupled with deep neural networks. This is particularly true for continuous sample spaces, where the Evidence Lower Bound (ELBO) can be maximized with gradient methods, thanks to methods such as pathwise gradient estimation \citep{kingma2013vae,rezende2014dlgm,rezende2015nf}. In the case of discrete and highly structured sample spaces though, this ``reparametrization trick'' becomes more challenging since it often requires continuous relaxations of discrete distributions \citep{jang2017gumbelsoftmax,maddison2017gumbelsoftmax,maddison2018gumbelsinkhorn}. In those cases, variational inference can also be carried out more generally thanks to score function estimation \citep{williams1992reinforce}, albeit at the expense of high variance \citep{mohamed2020montecarlogradient}. An alternative for approximate inference is through sampling methods, based on Markov chain Monte Carlo (MCMC; \citealp{hastings1970mcmc,gelfand1990mcmc}), when the target distribution is defined up to an intractable normalization constant.

\citet{bengio2021gflownet} introduced a new class of probabilistic models called \emph{Generative Flow Networks} (GFlowNets), to approximate an unnormalized target distribution over discrete and structured sample spaces from a variational perspective \citep{malkin2022gfnhvi,zimmermann2022vigfn}. GFlowNets treat sampling as a sequential decision making problem, heavily inspired by the literature in Reinforcement Learning (RL). Unlike RL though, which seeks an optimal policy maximizing the cumulative reward, the objective of a GFlowNet is to find a policy such that objects can be sampled proportionally to their cumulative reward. Nevertheless, this relationship led to a number of best practices from the RL literature being transferred into GFlowNets, such as the use of a replay buffer \citep{shen2023understandingtraininggfn,vemgal2023gfnreplaybuffer} and target network \citep{deleu2022daggflownet}, and advanced exploration strategies \citep{rectorbrooks2023thompsongfn}.

Although \citet{bengio2021gflownet} proved that GFlowNets were exactly equivalent to maximum entropy RL (MaxEnt RL; \citealp{ziebart2010maxent}) in some specific cases which we will recall in \cref{sec:sampling-sequential-decision-making}, it has long been thought that this connection was only superficial in general. However, recently, \citet{tiapkin2023gfnmaxentrl} showed that GFlowNets and MaxEnt RL are in fact one and the same, up to a correction of the reward function. This, along with other recent works \citep{mohammadpour2023maxentgfn,anonymous2023gfnpolicygradient}, paved the way to show deeper connections between GFlowNets and MaxEnt RL algorithms. In this work, we extend this correction of the reward to a more general case, and establish novel equivalences between GFlowNet \& MaxEnt RL objectives, notably between the widely used Trajectory Balance loss in the GFlowNet literature \citep{malkin2022trajectorybalance}, and the Path Consistency Learning algorithm in MaxEnt RL \citep{nachum2017pcl}. We also introduce a variant of the Soft Q-Learning algorithm \citep{haarnoja2017sql}, depending directly on a policy, and show that it becomes equivalent to the Modified Detailed Balance loss of \citet{deleu2022daggflownet}. Finally, we show these similarities in behavior empirically on three different domains, and include the popular Soft Actor-Critic algorithm (SAC; \citealp{haarnoja2018sac}) in our evaluations, which has no existing GFlowNet counterpart, under similar conditions on the same domains.

\section{Marginal sampling via sequential decision making}
\label{sec:sampling-sequential-decision-making}
Given an energy function $\gE(x)$ defined over a discrete and finite sample space $\gX$, our objective is to sample objects $x \in \gX$ from the \emph{Gibbs distribution}:
\begin{equation}
    P(x) \propto \exp(-\gE(x) / \alpha),
    \label{eq:gibbs-distribution}
\end{equation}
where $\alpha > 0$ is a temperature parameter. We assume that this energy function is fixed and we can query it for any element in $\gX$. Sampling from this distribution is in general a challenging problem due to the partition function $Z = \sum_{x\in\gX}\exp(-\gE(x)/\alpha)$ acting as the normalization constant for $P$, which is intractable when the sample space is (combinatorially) large. Throughout this paper, we will focus on cases where the objects of interest have some compositional structure (\emph{e.g.}, graphs, or trees), meaning that they can be constructed piece by piece.

\subsection{Maximum entropy Reinforcement Learning}
\label{sec:maxent-rl}
We consider a finite-horizon Markov Decision Process (MDP) $\gM_{\soft} = (\gS, \gA, s_{0}, T, r)$, where the state space $\gS$ and the action space $\gA$ are discrete and finite. We assume that this MDP is deterministic, with a transition function $T: \gS \times \gA \rightarrow \bar{\gS}$ that determines how to move to a new state $s' = T(s, a)$ from the state $s$, following the action $a$. We identify an initial state $s_{0} \in \gS$ from which all the trajectories start. Moreover, since we are in a finite-horizon setting, all these trajectories are finite and we assume that they eventually end at an abstract terminal state $s_{f} \notin \gS$ acting as a ``sink'' state; we use the notation $\bar{\gS} = \gS \cup \{s_{f}\}$. The state space is defined as a superset of the sample space $\gX \subseteq \gS$, and is structured in such a way that $x \in \gX$ iff we can transition from $x$ to the terminal state $s_{f}$ (\emph{i.e.}, there exists an action $a \in \gA$ such that $T(x, a) = s_{f}$); the states $x \in \gX$ are called \emph{terminating states}, following the naming convention of \citet{bengio2023gflownetfoundations}. We also set the discount factor $\gamma = 1$ throughout this paper. This setting with known deterministic dynamics is well-studied in the Reinforcement Learning literature \citep{todorov2006linearlysolvablemdp,kappen2012optimalcontrolgraphicalmodel}.

Since the MDP is deterministic, we can identify the action $a$ leading to a state $s' = T(s, a)$ with the transition $s \rightarrow s'$ in the state space itself. As such, we will write all quantities involving state-action pairs $(s, a)$ in terms of $(s, s')$ instead. The reward function $r(s, s')$ is defined such that the sum of rewards along a complete trajectory (the return) only depends on the energy of the terminating state it reaches: for a trajectory $\tau = (s_{0}, s_{1}, \ldots, s_{T}, s_{f})$, we have
\begin{equation}
    \sum_{t=0}^{T}r(s_{t}, s_{t+1}) = -\gE(s_{T}),
    \label{eq:reward-function-soft-mdp}
\end{equation}
with the convention $s_{T+1} = s_{f}$. In particular, this covers the case of a sparse reward that is received only at the end of the trajectory (\emph{i.e.}, $r(s_{T}, s_{f}) = -\gE(s_{T})$, and zero everywhere else). This decomposition of the energy into intermediate rewards is similar to \citet{buesing2020approximate}.

While the objective of Reinforcement Learning is typically to find a policy $\pi(s_{t+1}\mid s_{t})$ maximizing the expected sum of rewards (here corresponding to finding a state $x\in \gX$ with lowest energy, or equivalently a mode of $P$ in \cref{eq:gibbs-distribution}), in \emph{maximum entropy Reinforcement Learning} (MaxEnt RL) we also search for a policy that maximizes the expected sum of rewards, but this time augmented with the entropy $\gH(\pi(\cdot\mid s))$ of the policy $\pi$ in state $s$:
\begin{align}
    \pi^{*}_{\mathrm{MaxEnt}} = \argmax_{\pi}\E_{\tau}\Bigg[\sum_{t=0}^{T} r(s_{t}, s_{t+1}) + \alpha \gH(\pi(\cdot \mid s_{t}))\Bigg],\nonumber\\
    \label{eq:maxent-rl-problem}
\end{align}
where the expectation is taken over complete trajectories $\tau$ sampled following the policy $\pi$. Intuitively, adding the entropy to the objective encourages stochasticity in the optimal policy and improves diversity. To highlight the difference with the standard objective in RL, $\gM_{\soft}$ will be called \emph{soft MDP}, following the nomenclature of \citet{ziebart2010maxent}.

\paragraph{Notations.} In what follows, it will be convenient to view the transitions in the soft MDP as a directed acyclic graph (DAG) $\gG$ over the states in $\bar{\gS}$ (including $s_{f}$), rooted in $s_{0}$; the graph is acyclic to ensure that $\gM_{\soft}$ is finite-horizon. We will use the notations $\mathrm{Pa}(s)$ and $\mathrm{Ch}(s)$ to denote respectively the parents and the children of a state $s$ in $\gG$. For any terminating state $x \in \gX$, $s_{0} \rightsquigarrow x$ denotes a complete trajectory in $\gG$ of the form $\tau = (s_{0}, s_{1}, \ldots, x, s_{f})$; the transition to the terminal state is implicit in the notation, albeit necessary in our definition.

\subsection{Sampling terminating states from a soft MDP}
\label{sec:sampling-terminating-states}
From the literature on \emph{control as inference} \citep{ziebart2008maxentirl,levine2018controlasinference}, it can be shown that the policy maximizing the MaxEnt RL objective in \cref{eq:maxent-rl-problem} induces a distribution over dynamically consistent trajectories $\tau = (s_{0}, s_{1}, \ldots, s_{T}, s_{f})$ that depends on the sum of rewards along the trajectory:
\begin{align}
    \begin{adjustbox}{center}
    $\displaystyle\pi^{*}(\tau) \triangleq \prod_{t=0}^{T}\pi^{*}_{\mathrm{MaxEnt}}(s_{t+1}\mid s_{t}) \propto \exp\left(\frac{1}{\alpha}\sum_{t=0}^{T}r(s_{t}, s_{t+1})\right).$
    \end{adjustbox}\nonumber\\
    \label{eq:distribution-trajectories}
\end{align}
With our choice of reward function in \cref{eq:reward-function-soft-mdp}, this suggests a simple strategy to sample from the Gibbs distribution in \cref{eq:gibbs-distribution}, once the optimal policy $\pi^{*}_{\mathrm{MaxEnt}}$ is known: sample a trajectory in $\gM_{\soft}$, starting at the initial state $s_{0}$ and following the optimal policy, and only return the terminating state $s_{T}$ reached at the end of $\tau$ (ignoring the terminal state $s_{f}$). However, in general, we are interested in sampling an object $x \in \gX$ (the terminating state) using this sequential process, but not \emph{how} this object is generated (the exact trajectory taken). The distribution of interest is therefore not a distribution over \emph{trajectories} as in \cref{eq:distribution-trajectories}, but its \emph{marginal} over terminating states:
\begin{equation}
    \pi^{*}(x) \triangleq \sum_{\tau:s_{0}\rightsquigarrow x}\pi^{*}(\tau).
    \label{eq:terminating-state-probability}
\end{equation}
$\pi^{*}(x)$ is called the \emph{terminating state distribution} associated with the policy $\pi^{*}_{\mathrm{MaxEnt}}$ \citep{bengio2023gflownetfoundations}. When the state transition graph $\gG$ of the soft MDP is a tree\footnote{With the exception of the terminal state $s_{f}$, whose parents are always all the terminating states in $\gX$.} rooted in $s_{0}$, and there is a unique complete trajectory $s_{0} \rightsquigarrow x$ leading to any $x \in \gX$, then this process is guaranteed to be equivalent to sampling from the Gibbs distribution \citep{bengio2021gflownet}. Examples of tree-structured MDPs include the autoregressive generation of sequences \citep{bachman2015datagensequential,weber2015virl,angermueller2020dynappo,jain2022gfnbiological,feng2022metarlbo}, and sampling from a discrete factor graph with a fixed ordering of the random variables \citep{buesing2020approximate}.

\begin{figure}[t]
    \centering
    \includegraphics[width=\columnwidth]{figures/toy-gflownet-vertical}
    \caption{Illustration of the bias of the terminating state distribution associated with $\pi^{*}_{\mathrm{MaxEnt}}$ on a soft MDP with a DAG structure. The labels on each transition of the MDP correspond to the reward function, satisfying \cref{eq:reward-function-soft-mdp} (sparse reward setting). The terminating state distribution $\pi^{*}(x)$ is computed by marginalizing $\pi^{*}(\tau)$ over trajectories leading to $x$ (\emph{e.g.}, two trajectories $s_{0} \rightarrow s_{1} \rightarrow x_{4}$ and $s_{0} \rightarrow s_{2} \rightarrow x_{4}$ to $x_{4}$). $\pi^{*}(\tau)$ is computed based on \cref{eq:distribution-trajectories}, and we assume $\alpha = 1$. The terminating state distribution $\pi^{*}(x)$ should be contrasted with the (target) Gibbs distribution $P(x) \propto \exp(-\gE(x))$. The normalization constant is $Z' = \exp(-\gE(x_{3})) + 2\exp(-\gE(x_{4})) + \exp(-\gE(x_{5}))$. This MDP is inspired by \citet{jain2023gfnscientific}.}
    \label{fig:toy-gflownet}
\end{figure}

The equivalence between control and inference (sampling from \cref{eq:gibbs-distribution}) no longer holds when $\gG$ is a general DAG though. As an illustrative example, shown in \cref{fig:toy-gflownet}, consider the problem of generating small molecules by adding fragments one at a time, as described in \citet{you2018graphgenrl}. If we follow the optimal policy, even though the distribution over trajectories is proportional to $\exp(-\gE(x))$ as in \cref{eq:distribution-trajectories}, its marginal over terminating states is biased since there are two trajectories leading to the same molecule $x_{4}$ (\emph{i.e.}, multiple orders in which fragments can be added). This bias was first highlighted by \citet{bengio2021gflownet}.

\subsection{Generative Flow Networks}
\label{sec:gflownets}
To address this mismatch between the target Gibbs distribution in \cref{eq:gibbs-distribution} and the terminating state distribution in \cref{eq:terminating-state-probability} induced by the optimal policy in MaxEnt RL, \citet{bengio2021gflownet} introduced a new class of probabilistic models over discrete and compositional objects called \emph{Generative Flow Networks} (GFlowNets; \citealp{bengio2023gflownetfoundations}). Instead of searching for a policy maximizing \cref{eq:maxent-rl-problem}, the goal of a GFlowNet is to find a \emph{flow} function $F(s \rightarrow s')$ defined over the edges of $\gG$ that satisfies the following flow-matching conditions for all states $s' \in \gS$ such that $s' \neq s_{0}$:
\begin{equation}
    \sum_{s\in\mathrm{Pa}(s')} F(s \rightarrow s') = \sum_{s''\in\mathrm{Ch}(s')}F(s'\rightarrow s''),
    \label{eq:flow-matching-condition}
\end{equation}
with an additional boundary condition for each terminating state $x\in\gX$: $F(x \rightarrow s_{f}) = \exp(-\gE(x)/\alpha)$. Putting it in words, \cref{eq:flow-matching-condition} means that the total amount of flow going into any state $s'$ has to be equal to the amount of flow going out of it, except for the initial state $s_{0}$ which acts as a single ``source'' for all the flow. We can define a policy\footnote{Following the conventions from the GFlowNet literature, we will use the notation $P_{F}$ for this policy, also called the forward transition probability \citep{bengio2023gflownetfoundations}, to distinguish it from the optimal policy $\pi^{*}_{\mathrm{MaxEnt}}$ in MaxEnt RL.} from a flow function by simply normalizing the flows going out of any state $s_{t}$:
\begin{equation}
    P_{F}(s_{t+1}\mid s_{t}) \propto F(s_{t} \rightarrow s_{t+1}).
    \label{eq:forward-transition-probability}
\end{equation}
\citet{bengio2021gflownet} showed that if there exists a flow function satisfying the flow-matching conditions in \cref{eq:flow-matching-condition} as well as the boundary conditions, then sampling terminating states by following $P_{F}$ in the soft MDP as described in \cref{sec:sampling-terminating-states} is equivalent to sampling from \cref{eq:gibbs-distribution}. In that case, the terminating state distribution associated with the policy $P_{F}$ satisfies $P_{F}(x) \propto \exp(-\gE(x)/\alpha)$, for any terminating state $x \in \gX$, and is not biased by the number of trajectories leading to $x$, contrary to the optimal policy in MaxEnt RL.

\begin{figure*}[t]
\centering

% Diagram of the correspondences between MaxEnt RL algorithms and GFlowNet
% objectives; the whole picture is scaled to 90% by adjustbox.
% Styles: alg_node = circular algorithm node; intra_alg_edge = single-headed
% arrow; inter_alg_edge = thicker double-headed arrow. gfn_edge is defined but
% not used in this picture.
\begin{adjustbox}{scale=0.9}
\begin{tikzpicture}[every node/.style={inner sep=0pt}, gfn_edge/.style={ultra thick, -latex}, alg_node/.style={thick, circle, draw, minimum size=41pt, fill=white, inner sep=2pt, align=center}, intra_alg_edge/.style={thick, -latex}, inter_alg_edge/.style={latex-latex, line width=1.5pt}, y=70pt, x=75pt]

% Extra layer drawn behind the main one, used for the shaded group boxes.
\pgfdeclarelayer{background}
\pgfsetlayers{background,main}

% https://colorbrewer2.org/#type=sequential&scheme=YlGnBu&n=4
\definecolor{groupcolor1}{HTML}{225ea8}
\definecolor{groupcolor2}{HTML}{41b6c4}
\definecolor{groupcolor3}{HTML}{a1dab4}

% Algorithm nodes. Each one is an empty circle plus a nested tikzpicture placed
% at its center, holding the algorithm name and a small citation underneath.
% Nodes at y=1 form the top row (PCL, SQL, pi-SQL); nodes at y=0 form the
% bottom row (TB, SubTB, DB, FL-DB, Modified DB).
\node[alg_node] (pcl) at (1, 1) {};
\node[] at (pcl.center) {\begin{tikzpicture}
    \node[scale=1] (title_pcl) {PCL};
    \node[below=2pt of title_pcl, scale=0.4] {\citep{nachum2017pcl}};
\end{tikzpicture}};

\node[alg_node] (subtb) at (1, 0) {};
\node[] at (subtb.center) {\begin{tikzpicture}
    \node[scale=1] (title_subtb) {SubTB};
    \node[below=2pt of title_subtb, scale=0.4] {\citep{madan2022subtb}};
\end{tikzpicture}};

\node[alg_node] (tb) at (0, 0) {};
\node[] at (tb.center) {\begin{tikzpicture}
    \node[scale=1] (title_tb) {TB};
    \node[below=2pt of title_tb, scale=0.4] {\citep{malkin2022trajectorybalance}};
\end{tikzpicture}};

\node[alg_node] (sql) at (2.5, 1) {};
\node[] at (sql.center) {\begin{tikzpicture}
    \node[scale=1] (title_sql) {SQL};
    \node[below=2pt of title_sql, scale=0.4] {\citep{haarnoja2017sql}};
\end{tikzpicture}};

\node[alg_node] (db) at (2, 0) {};
\node[] at (db.center) {\begin{tikzpicture}
    \node[scale=1] (title_db) {DB};
    \node[below=2pt of title_db, scale=0.4] {\citep{bengio2023gflownetfoundations}};
\end{tikzpicture}};

\node[alg_node] (fl_db) at (3, 0) {};
\node[] at (fl_db.center) {\begin{tikzpicture}
    \node[scale=1] (title_fl_db) {FL-DB};
    \node[below=2pt of title_fl_db, scale=0.4] {\citep{pan2023forwardlookinggfn}};
\end{tikzpicture}};

\node[alg_node] (policy_sql) at (4, 1) {};
\node[] at (policy_sql.center) {\begin{tikzpicture}
    \node[scale=1] (title_policy_sql) {$\pi$-SQL};
    \node[below=2pt of title_policy_sql, scale=0.4] {(\cref{sec:sql-policy-parametrization})};
\end{tikzpicture}};

\node[alg_node] (modified_db) at (4, 0) {};
\node[] at (modified_db.center) {\begin{tikzpicture}
    \node[scale=0.8, align=center] (title_modified_db) {Modified\\[-0.5ex]DB};
    \node[below=2pt of title_modified_db, scale=0.4] {\citep{deleu2022daggflownet}};
\end{tikzpicture}};

% Edges. inter_alg_edge (double-headed, drawn in the group color) connects a
% top-row node to a bottom-row node; intra_alg_edge (single-headed) connects
% two nodes of the same row and carries a short label with its source.
\draw[inter_alg_edge, draw=groupcolor1] (pcl) -- (subtb) node[left, midway, scale=0.5, align=center, anchor=east, inner sep=8pt] {Equivalence with\\corrected reward \cref{eq:thm-corrected-reward}\\\cref{prop:equivalence-subtb-pcl}};
\draw[inter_alg_edge, densely dashed, draw=groupcolor1] (pcl) to[bend right=30] (tb);
\draw[intra_alg_edge] (subtb) -- (tb) node[below, midway, scale=0.5, align=center, inner sep=8pt] {Complete\\trajectories\\\citep{malkin2022trajectorybalance}};
\draw[intra_alg_edge] (pcl) -- (sql) node[below, midway, scale=0.5, align=center, inner sep=8pt] {Unified PCL\\rollout $d=1$\\\citep{nachum2017pcl}};
\draw[intra_alg_edge] (subtb) -- (db) node[below, midway, scale=0.5, align=center, inner sep=8pt] {Subtrajectories\\of length 1\\\citep{malkin2022trajectorybalance}};
\draw[inter_alg_edge, densely dashed, draw=groupcolor2] (sql) -- (db);
\draw[inter_alg_edge, densely dashed, draw=groupcolor2] (sql) -- (fl_db);
\draw[intra_alg_edge] (sql) -- (policy_sql) node[below, midway, scale=0.5, align=center, inner sep=8pt] {Policy\\parametrization\\\cref{sec:sql-policy-parametrization}};
\draw[intra_alg_edge] (db) -- (fl_db) node[below, midway, scale=0.5, align=center, inner sep=8pt] {Intermediate\\rewards\\\citep{pan2023forwardlookinggfn}};
\draw[intra_alg_edge] (fl_db) -- (modified_db) node[below, midway, scale=0.5, align=center, inner sep=8pt] {All states are\\terminating\\\citep{deleu2022daggflownet}};
\draw[inter_alg_edge, densely dashed, draw=groupcolor3] (policy_sql) -- (modified_db);

% Invisible bounding nodes around the three groups of algorithms.
\node[fit=(pcl)(subtb)(tb)] (trajectories_group) {};
\node[fit=(sql)(fl_db)(db)] (transitions_group) {};
\node[fit=(policy_sql)(modified_db)] (all_terminating_group) {};

% Group titles, anchored above each bounding node. The \vphantom{j} adds the
% depth of a descender to titles that have none.
\node[anchor=south, scale=0.8, yshift=8pt] (trajectories_group_title) at (trajectories_group.{north}) {Trajectories};
\node[anchor=south, scale=0.8, yshift=8pt] (transitions_group_title) at (transitions_group.{north}) {\vphantom{j}Transitions};
\node[anchor=south, scale=0.8, yshift=8pt] (all_terminating_group_title) at (all_terminating_group.{north}) {\vphantom{j}$\gS \equiv \gX$};

% Lightly tinted rounded boxes behind each group and its title.
\begin{pgfonlayer}{background}
\node[fit=(trajectories_group)(trajectories_group_title), fill=groupcolor1!20, inner sep=8pt, rounded corners=3pt] {};
\node[fit=(transitions_group)(transitions_group_title), fill=groupcolor2!20, inner sep=8pt, rounded corners=3pt] {};
\node[fit=(all_terminating_group)(all_terminating_group_title), fill=groupcolor3!20, inner sep=8pt, rounded corners=3pt] (all_terminating_group_fill) {};
\end{pgfonlayer}

% Row brackets, 10pt to the right of the rightmost shaded box: "MaxEnt RL"
% spans the height of the top-row nodes and "GFlowNet" that of the bottom row.
\coordinate (mid) at ($(pcl.south)!0.5!(subtb.north)$);
\coordinate[xshift=10pt] (mid_right) at (mid -| all_terminating_group_fill.east);
\draw[|-|, thick, shorten <=-5pt, shorten >=-5pt] (policy_sql.north -| mid_right) -- (policy_sql.south -| mid_right) node[midway, above, sloped, scale=0.8, font=\bfseries, inner sep=8pt] {MaxEnt RL};
\draw[|-|, thick, shorten <=-5pt, shorten >=-5pt] (modified_db.north -| mid_right) -- (modified_db.south -| mid_right) node[midway, above, sloped, scale=0.8, font=\bfseries, inner sep=8pt] {GFlowNet};

\end{tikzpicture}
\end{adjustbox}
    \caption{Equivalence between objectives in MaxEnt RL, with corrected rewards, and the objectives in GFlowNets. The objectives are classified based on whether they operate at the level of (complete) trajectories (left), transitions (middle), or if all the states are terminating (right). Further details about the form of the different residuals and the correspondences to transfer from one objective to another are available in \cref{fig:residual-equivalence}.}
    \label{fig:algrithms-equivalence}
\end{figure*}

\section{Bridging the gap between MaxEnt RL \& GFlowNets}
\label{sec:bridging-gap-maxentrl-gfn}
There exists a fundamental difference between MaxEnt RL and GFlowNets in the way the distributions induced by their (optimal) policies relate to the energy: in MaxEnt RL, this distribution is over \emph{how} objects are being created (trajectories) as mentioned in \cref{sec:sampling-terminating-states}, whereas a GFlowNet induces a distribution over the \emph{outcomes} only (terminating states), the latter matching the requirements of the Gibbs distribution. In this section, we recall some existing connections between GFlowNets and MaxEnt RL \citep{tiapkin2023gfnmaxentrl}, and we establish new equivalences between existing MaxEnt RL algorithms and GFlowNet objectives.

\subsection{Reward correction}
\label{sec:reward-correction}
To correct the bias illustrated in \cref{fig:toy-gflownet} caused by multiple trajectories leading to the same terminating state, we can treat \cref{eq:terminating-state-probability} not as a sum but as an \emph{expectation} over trajectories, by reweighting $\pi^{*}(\tau)$ (which is constant for a fixed terminating state, by \cref{eq:distribution-trajectories} \& \cref{eq:reward-function-soft-mdp}) with a probability distribution over these trajectories. \citet{bengio2023gflownetfoundations} showed that such a distribution over complete trajectories can be defined by introducing a \emph{backward transition probability} $P_{B}(s\mid s')$, which is a distribution over the parents $s \in \mathrm{Pa}(s')$ of any state $s' \neq s_{0}$. \citet{tiapkin2023gfnmaxentrl} showed that the reward of the soft MDP can be modified based on $P_{B}$ in such a way that the corresponding optimal policy $\pi^{*}_{\mathrm{MaxEnt}}$ is equal to the GFlowNet policy $P_{F}$ in \cref{eq:forward-transition-probability}. We restate and generalize this result in \cref{thm:maxent-rl-unbiased}, where we show how this correction counteracts the effect of the marginalization in \cref{eq:terminating-state-probability}, resulting in a terminating state distribution that matches the Gibbs distribution.

\begin{restatable}[Gen. of \citealp{tiapkin2023gfnmaxentrl}; Theorem 1]{theorem}{maxentrlunbiased}
    \label{thm:maxent-rl-unbiased}
    Let $P_{B}(\cdot\mid s')$ be an arbitrary backward transition probability (\emph{i.e.}, a distribution over the parents of $s'\neq s_{0}$ in $\gG$). Let $r(s, s')$ be the reward function of the MDP corrected with $P_{B}$, satisfying for any trajectory $\tau = (s_{0}, s_{1}, \ldots, s_{T}, s_{f})$:
    \begin{equation}
        \sum_{t=0}^{T}r(s_{t}, s_{t+1}) = -\gE(s_{T}) + \alpha \sum_{t=0}^{T-1}\log P_{B}(s_{t}\mid s_{t+1}),
        \label{eq:thm-corrected-reward}
    \end{equation}
    where we used the convention $s_{T+1} = s_{f}$. Then the terminating state distribution associated with the optimal policy $\pi_{\mathrm{MaxEnt}}^{*}$ solution of \cref{eq:maxent-rl-problem} satisfies $\pi^{*}(x) \propto \exp(-\gE(x) / \alpha)$.
\end{restatable}

The proof of the theorem is available in \cref{app:reward-correction}. Unlike \cref{eq:reward-function-soft-mdp}, the return now depends on the trajectory leading to $s_{T}$ via the second term in \cref{eq:thm-corrected-reward}. Interestingly, the temperature parameter $\alpha$ introduced in the MaxEnt RL literature \citep{haarnoja2017sql} finds a natural interpretation as the temperature of the Gibbs distribution. Note that the correction in \cref{eq:thm-corrected-reward} only involves the backward probability of the whole trajectory $\tau$, making it compatible even with non-Markovian $P_{B}$ \citep{shen2023understandingtraininggfn,bengio2023gflownetfoundations}.

\citet{tiapkin2023gfnmaxentrl} only considered the case where the reward function of the soft MDP is sparse, and the correction with $P_{B}(s_{t}\mid s_{t+1})$ is added at each intermediate transition $s_{t} \rightarrow s_{t+1}$; we will go back to this setting in the following section. This correction of the reward is fully compatible with our observation in \cref{sec:sampling-terminating-states} that sampling terminating states with $\pi^{*}_{\mathrm{MaxEnt}}$ yields samples of \cref{eq:gibbs-distribution} when the soft MDP is a tree with the (uncorrected) reward in \cref{eq:reward-function-soft-mdp}, since in that case any state $s' \neq s_{0}$ has a unique parent $s$, and thus $P_{B}(s\mid s') = 1$ as also observed by \citet{tiapkin2023gfnmaxentrl}.

\subsection{Equivalence between PCL \& (Sub)TB}
\label{sec:equivalence-pcl-subtb}
Similar to how \cref{eq:reward-function-soft-mdp} covered the particular case of a sparse reward, in this section we will consider a reward function satisfying \cref{eq:thm-corrected-reward} where the energy function only appears at the end of the trajectory
\begin{equation}\begin{aligned}
    r(s_{t}, s_{t+1}) &= \alpha \log P_{B}(s_{t}\mid s_{t+1})\\ r(s_{T}, s_{f}) &= -\gE(s_{T}),
\end{aligned}\label{eq:distribution-corrected-reward-sparse}\end{equation}
as introduced in \citep{tiapkin2023gfnmaxentrl}. \cref{thm:maxent-rl-unbiased} suggests that solving the MaxEnt RL problem in \cref{eq:maxent-rl-problem} with the corrected reward is comparable to finding a solution of a GFlowNet, as they both lead to a policy whose terminating state distribution is the Gibbs distribution. It turns out that there exists an equivalence between specific algorithms solving these two problems with our choice of reward function above: Path Consistency Learning (PCL; \citealp{nachum2017pcl}) for MaxEnt RL, and the Subtrajectory Balance objective (SubTB; \citealp{madan2022subtb, malkin2022trajectorybalance}) for GFlowNets. Both of these objectives operate at the level of partial trajectories of the form $\tau = (s_{m}, s_{m+1}, \ldots, s_{n})$, where $s_{m}$ and $s_{n}$ are not necessarily the initial and terminal states anymore.

On the one hand, the PCL objective $\gL_{\mathrm{PCL}}(\theta, \phi) = \frac{1}{2}\E_{\pi_{b}}[\Delta^{2}_{\mathrm{PCL}}(\tau; \theta, \phi)]$ encourages the consistency between a policy $\pi_{\theta}$ parametrized by $\theta$ and a soft value function $V_{\soft}^{\phi}$ parametrized by $\phi$, where $\pi_{b}$ is an arbitrary distribution over (partial) trajectories $\tau$, and the residual is defined as
\begin{align}
    \Delta_{\mathrm{PCL}}(\tau; &\theta, \phi) = -V_{\soft}^{\phi}(s_{m}) + V_{\soft}^{\phi}(s_{n}) \label{eq:residual-pcl}\\
    &+ \sum_{t=m}^{n-1}\big(r(s_{t}, s_{t+1}) - \alpha \log \pi_{\theta}(s_{t+1}\mid s_{t})\big).\nonumber
\end{align}

On the other hand, the SubTB objective $\gL_{\mathrm{SubTB}}(\theta, \phi) = \frac{1}{2}\E_{\pi_{b}}[\Delta^{2}_{\mathrm{SubTB}}(\tau; \theta, \phi)]$ also enforces some form of consistency, but this time between a policy (forward transition probability) $P_{F}^{\theta}$ parametrized by $\theta$, and a state flow function $F_{\phi}$ parametrized by $\phi$, and the residual is defined as
\begin{equation*}
    \Delta_{\mathrm{SubTB}}(\tau; \theta, \phi) = \log \frac{F_{\phi}(s_{n})\prod_{t=m}^{n-1}P_{B}(s_{t}\mid s_{t+1})}{F_{\phi}(s_{m})\prod_{t=m}^{n-1}P_{F}^{\theta}(s_{t+1}\mid s_{t})},
    \label{eq:residual-subtb}
\end{equation*}
where $P_{B}$ is a backward transition probability, which we assume to be fixed here---although in general, $P_{B}$ may also be learned \citep{malkin2022trajectorybalance}. In addition to this objective, some boundary conditions on $F_{\phi}$ must also be enforced (depending on $\gE$), similar to the ones introduced in \cref{sec:gflownets}; see \cref{app:equivalence-pcl-subtb} for details. The following proposition establishes the equivalence between these two objectives, up to a normalization constant that only depends on the temperature $\alpha$, and provides a way to move from the policy/value function parametrization in MaxEnt RL to the policy/flow function parametrization in GFlowNets. Although similarities between these two methods have been mentioned in prior work \citep{malkin2022trajectorybalance,jiralerspong2023eflownet, hu2023gfnllm,mohammadpour2023maxentgfn}, we show here an exact equivalence between both objectives.
\begin{restatable}{proposition}{equivsubtbpcl}
    \label{prop:equivalence-subtb-pcl}
    The Subtrajectory Balance objective (GFlowNet; \citealp{madan2022subtb}) is proportional to the Path Consistency Learning objective (MaxEnt RL; \citealp{nachum2017pcl}) on the soft MDP with the reward function defined in \cref{eq:distribution-corrected-reward-sparse}, in the sense that $\gL_{\mathrm{PCL}}(\theta, \phi) = \alpha^{2}\gL_{\mathrm{SubTB}}(\theta, \phi)$, with the following correspondence
    \begin{align}
        \pi_{\theta}(s'\mid s) &= P_{F}^{\theta}(s'\mid s) &&& V^{\phi}_{\soft}(s) &= \alpha \log F_{\phi}(s). \label{eq:equivalence-pcl-subtb-policy-value}
    \end{align}
\end{restatable}
The proof of this proposition is available in \cref{app:equivalence-pcl-subtb}. The equivalence between the value function in MaxEnt RL and the state flow function in GFlowNets was also found in \citep{tiapkin2023gfnmaxentrl}. When applied to complete trajectories, this also shows the connection between the Trajectory Balance objective (TB; \citealp{malkin2022trajectorybalance}), widely used in the GFlowNet literature, and PCL under our choice of corrected reward in \cref{eq:distribution-corrected-reward-sparse}.

On the other end of the spectrum, if we apply this proposition to transitions in the soft MDP (\emph{i.e.}, subtrajectories of length 1), then we can obtain a similar equivalence between the Detailed Balance objective in GFlowNets (DB; \citealp{bengio2023gflownetfoundations}), and the Soft Q-Learning algorithm (SQL; \citealp{haarnoja2017sql}), via the \emph{Unified PCL} perspective of \citet{nachum2017pcl} that uses a soft Q-function in order to simultaneously parametrize both the policy and the value function in \cref{eq:residual-pcl}; see \cref{cor:equivalence-db-sql} for a detailed statement. We note that this connection between DB \& SQL was also mentioned in prior work \citep{tiapkin2023gfnmaxentrl,mohammadpour2023maxentgfn}. A full summary of the connections resulting from \cref{prop:equivalence-subtb-pcl} between different MaxEnt RL and GFlowNet objectives is available in \cref{fig:algrithms-equivalence}, with further details in \cref{fig:residual-equivalence}.

\subsection{Soft Q-Learning with policy parametrization}
\label{sec:sql-policy-parametrization}
In this section, we consider the case where all the states of the soft MDP are valid elements of the sample space $\gX$ (in other words, all the states are terminating). Since every state is now associated with some energy, we can reshape the rewards \citep{ng1999rewardshaping} while still satisfying \cref{eq:thm-corrected-reward} as
\begin{align}
    r(s_{t}, s_{t+1}) &= \gE(s_{t}) - \gE(s_{t+1}) + \alpha \log P_{B}(s_{t}\mid s_{t+1})\nonumber\\ r(s_{T}, s_{f}) &= 0,
    \label{eq:distribution-corrected-reward-dense}
\end{align}
if we assume, without loss of generality, that $\gE(s_{0}) = 0$ (any offset added to the energy function leaves \cref{eq:gibbs-distribution} unchanged). This is a novel setting that differs from \citet{tiapkin2023gfnmaxentrl}, and was made possible thanks to our general statement in \cref{thm:maxent-rl-unbiased}. We show in \cref{prop:policy-parametrization-sql} that with our choice of rewards above, in particular the fact that no reward is received upon termination, we can express the objective of Soft Q-Learning as a function of a policy $\pi_{\theta}$ parametrized by $\theta$, instead of a Q-function; we call this \emph{$\pi$-SQL}. The objective can be written as $\gL_{\pi\textrm{-}\mathrm{SQL}}(\theta) = \frac{1}{2}\E_{\pi_{b}}[\Delta_{\pi\textrm{-}\mathrm{SQL}}^{2}(s, s'; \theta)]$, where $\pi_{b}$ is an arbitrary distribution over transitions $s \rightarrow s'$ such that $s'\neq s_{f}$, and
\begin{align}
    \Delta_{\pi\textrm{-}\mathrm{SQL}}(s, s'; \theta) = \alpha\big[&\log \pi_{\theta}(s'\mid s) - \log \pi_{\theta}(s_{f}\mid s) \nonumber\\&+ \log \pi_{\theta}(s_{f}\mid s')\big] - r(s, s').
    \label{eq:residual-pisql}
\end{align}
With the reward function in \cref{eq:distribution-corrected-reward-dense}, this alternative perspective on SQL is remarkable in that it is equivalent to the Modified Detailed Balance objective (Modified DB; \citealp{deleu2022daggflownet}), specifically derived in the special case of GFlowNets whose states are all terminating. This objective can be written as $\gL_{\mathrm{M}\textrm{-}\mathrm{DB}}(\theta) = \frac{1}{2}\E_{\pi_{b}}[\Delta_{\mathrm{M}\textrm{-}\mathrm{DB}}^{2}(s, s'; \theta)]$ that depends on a policy (forward transition probability) $P_{F}^{\theta}$ parametrized by $\theta$, where
\begin{align}
    &\Delta_{\mathrm{M}\textrm{-}\mathrm{DB}}(s, s'; \theta) \label{eq:residual-modified-db}\\
    &\qquad = \log \frac{\exp(-\gE(s')/\alpha)P_{B}(s\mid s')P_{F}^{\theta}(s_{f}\mid s)}{\exp(-\gE(s)/\alpha)P_{F}^{\theta}(s'\mid s)P_{F}^{\theta}(s_{f}\mid s')}.\nonumber
\end{align}
\begin{restatable}{proposition}{equivpisqlmodifieddb}
    \label{prop:equivalence-pisql-modified-db}
    Suppose that all the states of the soft MDP are terminating $\gS \equiv \gX$. The Modified Detailed Balance objective (GFlowNet; \citealp{deleu2022daggflownet}) is proportional to the Soft Q-Learning objective with a policy parametrization (MaxEnt RL; $\pi$-SQL) on the soft MDP with the reward function defined in \cref{eq:distribution-corrected-reward-dense}, in the sense that $\gL_{\pi\textrm{-}\mathrm{SQL}}(\theta) = \alpha^{2}\gL_{\mathrm{M}\textrm{-}\mathrm{DB}}(\theta)$, with $\pi_{\theta}(s'\mid s) = P_{F}^{\theta}(s'\mid s)$.
\end{restatable}
The proof is available in \cref{app:equivalence-pisql-mdb}. This result can be further generalized to cases where the states are not necessarily all terminating, but where some partial reward can be received along the trajectory, with an equivalence between SQL and the Forward-Looking Detailed Balance objective (FL-DB; \citealp{pan2023forwardlookinggfn}) in GFlowNets; see \cref{app:equivalence-sql-fl-db} for details.

\begin{figure*}[t]
    \centering
    \begin{adjustbox}{center}
    \includegraphics[width=1.1\textwidth]{figures/treesample.pdf}
    \end{adjustbox}
    \caption{Comparison of MaxEnt RL and GFlowNet algorithms on the factor graph inference task, in terms of the Jensen-Shannon divergence between the terminating state distribution and the target distribution during training. Each curve represents the average JSD with 95\% confidence interval over 20 random seeds.}
    \label{fig:treesample}
\end{figure*}

\section{Related Work}
\label{sec:related-work}
\paragraph{Maximum Entropy Reinforcement Learning.} Unlike standard reinforcement learning where an optimal policy may be completely deterministic (at least in the fully observable case; \citealp{sutton2018introrl}), MaxEnt RL seeks a \emph{stochastic} policy that balances between reward maximization and maximal entropy of future actions \citep{ziebart2010maxent,fox2016tamingnoiserl}. This type of entropy regularization falls into the broader domain of regularized MDPs \citep{geist2019regularizedmdps}. This can be particularly beneficial for improving exploration \citep{haarnoja2017sql} and for robust control under model misspecification \citep{eysenbach2022maxentrlrobust}. Popular MaxEnt RL methods include Soft Q-Learning, Path Consistency Learning \citep{nachum2017pcl}, and Soft Actor-Critic \citep{haarnoja2018sac} studied in this paper.

\paragraph{Generative Flow Networks.} \citet{bengio2021gflownet} took inspiration from reinforcement learning and introduced GFlowNets as a solution for finding diverse molecules binding to a target protein. Since then, they have found applications in a number of domains in scientific discovery \citep{jain2022gfnbiological,jain2023gfnscientific,milaai4science2023crystalgfn}, leveraging diversity in conjunction with active learning \citep{jain2023mogfn,hernandez2023multifidelitygfn}, but also in combinatorial optimization \citep{zhang2023gfnrobustscheduling,zhang2023graphcogfn}, causal discovery \citep{deleu2022daggflownet,deleu2023jspgfn,atanackovic2023dyngfn}, and probabilistic inference in general \citep{zhang2022ebgfn,hu2023gflownetem,hu2023gfnllm,falet2024deltaai}. Although they were framed differently, GFlowNets are also deeply connected to the literature on variational inference \citep{malkin2022gfnhvi,zimmermann2022vigfn}.

A number of works have recently established connections between GFlowNets and (maximum entropy) RL. Closest to our work, \citet{tiapkin2023gfnmaxentrl} showed how the reward in MaxEnt RL can be corrected based on some backward transition probability to be equivalent to GFlowNets. Although their analysis is limited to the case where the reward function in the original soft MDP is sparse (\emph{i.e.}, the reward is only obtained at the end of the trajectory), they were the first to propose a correction applied at each intermediate transition as in \cref{sec:equivalence-pcl-subtb}. We generalized this in \cref{thm:maxent-rl-unbiased} with a correction at the level of the \emph{trajectories}, which offers more flexibility in how the correction is distributed along the trajectory and allows intermediate rewards. \citet{tiapkin2023gfnmaxentrl} also showed similarities between GFlowNet objectives and MaxEnt RL algorithms, namely between Detailed Balance \& Dueling Soft Q-Learning \citep{wang2016dueling}, and between Trajectory Balance \& Policy Gradient \citep{schulman2017equivalencesqlpg}. The correspondence between TB and Policy Gradient was further expanded by \citet{anonymous2023gfnpolicygradient}, as a direct consequence of the connections between GFlowNets, variational inference \citep{malkin2022gfnhvi}, and reinforcement learning \citep{weber2015virl}. Finally, \citet{mohammadpour2023maxentgfn} also introduced a correction that depends on $n(s)$, the number of (partial) trajectories to a certain state $s$, which can be learned by solving a second MaxEnt RL problem \cref{eq:maxent-rl-problem} on an ``inverse'' MDP. This correction corresponds to a particular choice of backward transition probability $P_{B}(s_{t}\mid s_{t+1}) = n(s_{t}) / n(s_{t+1})$ in \cref{eq:thm-corrected-reward}, which has the remarkable property of maximizing the flow entropy.

\section{Experimental results}
\label{sec:experimental-results}
We verify empirically the equivalences established in \cref{sec:bridging-gap-maxentrl-gfn} on three domains: the inference over discrete factor graphs \citep{buesing2020approximate}, Bayesian structure learning of Bayesian networks \citep{deleu2022daggflownet}, and the generation of parsimonious phylogenetic trees \citep{zhou2024phylogfn}. In addition to Detailed Balance (and possibly its modified version) and Trajectory Balance on the one hand (GFlowNets), and Soft Q-Learning (possibly parametrized by a policy; see \cref{sec:sql-policy-parametrization}) and Path Consistency Learning on the other hand (MaxEnt RL), we also consider a discrete version of Soft Actor-Critic \citep{christodoulou2019discretesac}, which has no natural counterpart in the GFlowNet literature. For all MaxEnt RL methods, we adjust the MDP to include the correction of the reward. Note that in all the domains considered here, non-trivial intermediate rewards are available in the original MDP, meaning in particular that all instances of DB actually use the Forward-Looking formulation \citep{pan2023forwardlookinggfn}; the form of these intermediate rewards along with additional experimental details are available in \cref{app:experimental-details}.

\subsection{Probabilistic inference over discrete factor graphs}
\label{sec:experiments-treesample}
The probabilistic inference task in \citet{buesing2020approximate} consists in sequentially sampling the values of $d$ discrete random variables in a factor graph one at a time, with a fixed order. This gives the underlying MDP a tree structure, eliminating the need for reward correction, as described in \cref{sec:sampling-terminating-states}. We adapted this environment to have multiple trajectories leading to each terminating state by allowing the random variables to be sampled in any order. Details about the energy function are available in \cref{app:details-treesample}.

In \cref{fig:treesample}, we show the performance of the different MaxEnt RL and GFlowNet algorithms on 4 different factor graph structures, as proposed by \citet{buesing2020approximate}, with $d = 6$ variables and where each variable can take one of 5 possible values. We observe that TB \& PCL perform similarly, validating \cref{prop:equivalence-subtb-pcl}, and overall outperform all other methods. Likewise, we can see that DB \& SQL perform similarly to one another, as expected from \cref{cor:equivalence-db-sql}. Finally, although SAC is generally viewed as a strong algorithm for continuous control \citep{haarnoja2018sac}, we did not observe any significant improvement over DB/SQL.

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.8\textwidth]{figures/dag_gfn.pdf}
    \caption{Comparison of MaxEnt RL and GFlowNet algorithms on the Bayesian structure learning task, in terms of the Jensen-Shannon divergence between the terminating state distribution and the target posterior during training. The two experiments differ in the way the marginal likelihood $P(\gD\mid G)$ is computed: (left) using the BGe score \citep{geiger1994bge}, (right) using the linear Gaussian score \citep{nishikawa2022vbg}. Each curve represents the average JSD with 95\% confidence interval over 20 random seeds.}
    \label{fig:dag_gfn}
\end{figure*}

\subsection{Structure learning of Bayesian Networks}
\label{sec:experiments-dag_gfn}
We also evaluated all algorithms on the task of learning the structure of Bayesian networks, using a Bayesian perspective \citep{deleu2022daggflownet}. Given a dataset of observations $\gD$ from a joint distribution over $d$ random variables $X_{1}, \ldots, X_{d}$, our objective is to approximate the posterior distribution $P(G\mid \gD) \propto P(\gD\mid G)P(G)$ over the DAG structures $G$ encoding the conditional independencies of $P(X_{1}, \ldots, X_{d})$. The soft MDP is constructed as in \citep{deleu2022daggflownet}, where a DAG $G$ is constructed by adding one edge at a time, starting from a completely empty graph, while enforcing the acyclicity of the graph at each step of generation (\emph{i.e.}, an edge cannot be added if it would introduce a cycle).

Following \citet{malkin2022gfnhvi}, we consider here a relatively small task where $d=5$, so that the target distribution $P(G\mid \gD)$ can be evaluated analytically in order to compare it to our approximations given by MaxEnt RL and/or GFlowNets. Unlike in \cref{sec:experiments-treesample}, we also included the modified DB loss and $\pi$-SQL in our comparison since all the states are valid DAGs. We observe in \cref{fig:dag_gfn} that again TB \& PCL on the one hand, but also modified DB \& $\pi$-SQL on the other hand perform very similarly to one another, empirically validating our equivalences established above. Despite a light search over hyperparameters, we found that SAC performs on average significantly worse than other methods, mainly due to instability during training.

\subsection{Phylogenetic tree generation}
\label{sec:experiments-phylogfn}
Finally, we also compared these methods on the larger-scale task of parsimonious phylogenetic tree generation introduced by \citet{zhou2024phylogfn}. Based on biological sequences of different species, the objective is to find phylogenetic trees over those species that require few mutations. A state of the soft MDP corresponds to a collection of trees over a partition of the species, and actions correspond to merging two trees together by adding a root node. Note that all the trees sampled this way have the same size, although they may have a different number of trajectories leading to each of them (unlike in \cref{sec:experiments-treesample}). The energy of a tree $T$ corresponds to the total number of mutations captured in $T$.

\begin{figure*}[t]
  \centering
  
  \begin{adjustbox}{center}
  \begin{minipage}[b]{0.37\textwidth}
    \begin{adjustbox}{scale=1}
\raisebox{68pt}{\begin{tabular}{lccccc}
\toprule
 & TB & PCL & DB & SQL & SAC \\
\midrule
DS1 & 0.7797 & 0.7399 & \textbf{0.9141} & 0.8695 & 0.6003 \\
DS2 & 0.8550 & 0.8309 & 0.8811 & \textbf{0.8922} & 0.7022 \\
DS3 & 0.5833 & 0.6137 & \textbf{0.8649} & 0.8474 & 0.6334 \\
DS4 & 0.9178 & 0.9177 & 0.9285 & 0.8965 & \textbf{0.9320} \\
DS5 & 0.9688 & 0.9690 & 0.9633 & \textbf{0.9712} & 0.9567 \\
DS6 & 0.9526 & 0.9542 & 0.9496 & \textbf{0.9615} & 0.8017 \\
\bottomrule
\end{tabular}}
    \end{adjustbox}
  \end{minipage}
  \hspace*{5em}
  \begin{minipage}[b]{0.5\textwidth}
    \includegraphics[width=\textwidth]{figures/DBV_SQL_DS5.pdf}
  \end{minipage}
  \end{adjustbox}
  \caption{Comparison of MaxEnt RL and GFlowNet algorithms on the phylogenetic tree generation task. (Left) Comparison of the performance in terms of the Pearson correlation coefficient between the terminating state log-probability and the return on 1000 randomly sampled trees. (Center) Correlation between the terminating state log-probability found with DB and the return, each point representing a tree, with a best linear fit line and its slope. (Right) Similarly for SQL. The correlation plots for all methods and all datasets are available in \cref{app:details-phylogfn}.}
  \label{fig:phylogfn}
\end{figure*}

In \cref{fig:phylogfn}, we compare the performance of all methods on 6 datasets introduced by \citet{zhou2024phylogfn}, in terms of the correlation between the terminating state log-probabilities $\log \pi(T)$ associated with the learned policy, and the (uncorrected) return $-\gE(T)$, since we should ideally have
\begin{equation}
    \log \pi(T) \approx \log \pi^{*}(T) = -\gE(T) - \log Z,
\end{equation}
based on \cref{thm:maxent-rl-unbiased}. We observe once again that TB \& PCL perform overall similarly to one another, as well as DB \& SQL, confirming our observations made above, this time on larger problems where the partition function is intractable. Similar to \cref{sec:experiments-dag_gfn}, we also found that SAC was often less competitive than DB/SQL, except for DS4.

\section{Discussion}
\label{sec:discussion}
\paragraph{Stochastic environments.} Similar to \citet{mohammadpour2023maxentgfn}, we note that our results here are limited to the case where the soft MDP is deterministic, to match the standard assumptions made in the GFlowNet literature. Although some works have attempted to generalize GFlowNets to stochastic environments \citep{bengio2023gflownetfoundations,pan2023stochasticgfn}, there is no apparent consensus on how to guarantee the existence of an optimal policy $P_{F}$ whose terminating state distribution matches \cref{eq:gibbs-distribution} when an object may be generated in multiple ways (\emph{i.e.}, $\gG$ is a DAG). \citet{jiralerspong2023eflownet} introduced an extension to stochastic environments called \emph{EFlowNet} where an optimal policy is guaranteed to exist, albeit limited to the case where the soft MDP has a tree structure and would therefore bypass the need for any reward correction. The assumption of determinism is generally not limiting though since the structure of the soft MDP is typically designed by an expert based on the problem at hand (depending on the distribution to be approximated).

\paragraph{Environments with equal number of trajectories.} We saw in \cref{sec:sampling-terminating-states} that without correcting the reward function, the optimal policy $\pi_{\mathrm{MaxEnt}}^{*}$ has a terminating state distribution biased towards states with more complete trajectories leading to them. However, there are some situations where \emph{all} the states have an equal number of trajectories leading to them. This is the case of the discrete factor graphs environments studied in \cref{sec:experiments-treesample} for example, where all terminating states can be accessed with exactly $d!$ trajectories since each of the $d$ variables can be assigned a value in any order. In this situation, one can apply MaxEnt RL with the original reward in \cref{eq:reward-function-soft-mdp}, without any correction, and still obtain a terminating state distribution equal to the Gibbs distribution, since this constant can be absorbed into the partition function. Just like the case where $\gG$ is a tree, this must be considered as a special case though, and it is generally recommended to always correct the reward with $P_{B}$.

\paragraph{Unified parametrization of the policy \& state flow.} In all applications of GFlowNets involving the SubTB objective \citep{madan2022subtb,malkin2022trajectorybalance} found in the literature, the forward transition probabilities $P_{F}^{\theta}$ and the state flow function $F_{\phi}$ have always been parametrized with separate models (with possibly a shared backbone) as in \cref{sec:equivalence-pcl-subtb}. However, thanks to the equivalence with PCL established in \cref{prop:equivalence-subtb-pcl}, it is actually possible to parametrize both functions using a single ``Q-function'' thanks to the Unified PCL perspective \citep{nachum2017pcl}; see also \cref{app:equivalence-sql-db} in the context of DB \& SQL. A setting with separate networks is closer to the use of a dueling architecture as highlighted by \citet{tiapkin2023gfnmaxentrl}. In our experiments, we observed that using DB with two separate networks performs overall similarly to SQL with a single Q-network, even though the latter requires fewer parameters to learn. A complete study of the effectiveness of this strategy compared to separate networks is left as future work.

\paragraph{Continuous control \& probabilistic inference.} While this paper focused on discrete distributions, to match the well-studied setting in the GFlowNet literature, the reward correction in \cref{thm:maxent-rl-unbiased} may be extended to cases where $\gX$, along with the state and action spaces of the soft MDP, are continuous spaces as conjectured by \citet{tiapkin2023gfnmaxentrl}. We could establish similar connections with MaxEnt RL objectives by leveraging the extensions of GFlowNets to continuous spaces \citep{anonymous2023cflownets,lahlou2023continuousgfn}. Interestingly, continuous GFlowNets have strong connections with diffusion models \citep{zhang2022unifyinggfn,sendera2024diffusiongfn}, for which RL has been shown to be an effective training method \citep{fan2023ddpmrl,black2023rldiffusion}.

\section{Conclusion \& Future work}
\label{sec:conclusion-future-work}
In this work, we showed that many of the well-established objectives from the GFlowNet literature \citep{bengio2023gflownetfoundations,malkin2022trajectorybalance,pan2023forwardlookinggfn} happen to be equivalent to well-known algorithms in MaxEnt RL. Our work is anchored in the recent line of work drawing connections between GFlowNets and MaxEnt RL \citep{tiapkin2023gfnmaxentrl,mohammadpour2023maxentgfn,anonymous2023gfnpolicygradient}, extending the reward correction introduced by \citet{tiapkin2023gfnmaxentrl} to be applicable at the level of complete trajectories. This generalization is significant as it allowed us to establish new connections between MaxEnt RL algorithms and GFlowNet objectives, especially in the presence of intermediate rewards in the underlying MDP.

This perspective makes it possible to integrate in a principled way all the tools from Reinforcement Learning for probabilistic inference over large-scale discrete and structured spaces. Future work should be dedicated to further investigating this intersection, borrowing best practices from the RL literature to enable more efficient inference in that setting. In particular, being able to better explore the state space in order to find modes of the Gibbs distribution \citep{rectorbrooks2023thompsongfn}, for example using targeted exploration strategies developed for RL agents \citep{bellemare2016countbasedexploration,pathak2017curiositydrivenexploration}, would be essential for large-scale applications such as causal structure learning \citep{deleu2022daggflownet} and molecule generation \citep{bengio2021gflownet}, going beyond the capabilities of MaxEnt RL alone in terms of exploration \citep{haarnoja2017sql}.

\section*{Acknowledgements}
\label{sec:acknowledgements}
We would like to thank Valentin Thomas, Michal Valko, Pierre M\'{e}nard, Daniil Tiapkin, and Sobhan Mohammadpour for helpful discussions and comments about this paper. Doina Precup and Yoshua Bengio are CIFAR Senior Fellows. This research was enabled in part by compute resources and software provided by Mila (\href{https://mila.quebec/}{mila.quebec}).

\newpage
\bibliography{references}

\newpage
\onecolumn
\title{Discrete Probabilistic Inference as Control\\in Multi-path Environments\\(Supplementary Material)}

\makeatletter
\renewcommand{\AB@affillist}{}
\renewcommand{\AB@authlist}{}
\setcounter{authors}{0}
\renewcommand{\@thanks}{}
\makeatother

\author[1,3]{Tristan~Deleu}
\author[2]{Padideh~Nouri}
\author[1]{Nikolay~Malkin}
\author[2,4]{Doina~Precup}
\author[1]{Yoshua~Bengio}
% Add affiliations after the authors
\affil[ ]{\protect\hspace*{-2em}Mila -- Quebec AI Institute\protect\\[1em]}
\affil[1]{Universit\'{e} de Montr\'{e}al}
\affil[2]{McGill University}
\affil[3]{Valence Labs}
\affil[4]{Google DeepMind}

\maketitle
\appendix
\vspace*{1em}
\section{Reward correction}
\label{app:reward-correction}

\maxentrlunbiased*
\begin{proof}
    Recall from \citet{ziebart2010maxent,haarnoja2017sql} that the optimal policy maximizing \cref{eq:maxent-rl-problem} is
    \begin{equation}
        \pi^{*}_{\mathrm{MaxEnt}}(s'\mid s) = \exp\left(\frac{1}{\alpha}\big(Q^{*}_{\soft}(s, s') - V^{*}_{\soft}(s)\big)\right),
    \end{equation}
    where the soft value functions $Q^{*}_{\soft}$ and $V^{*}_{\soft}$ satisfy the soft Bellman optimality equations, adapted to our deterministic soft MDP:
    \begin{align}
        Q^{*}_{\soft}(s, s') &= r(s, s') + V^{*}_{\soft}(s')\\
        V^{*}_{\soft}(s') &= \alpha \log \sum_{s''\in\mathrm{Ch}(s')}\exp\left(\frac{1}{\alpha}Q^{*}_{\soft}(s', s'')\right).
    \end{align}
    By definition of the terminating state distribution associated with $\pi^{*}_{\mathrm{MaxEnt}}$ in \cref{eq:terminating-state-probability}, for any terminating state $x\in\gX$:
    \begingroup
    \allowdisplaybreaks
    \begin{align}
        \pi^{*}(x) &= \sum_{\tau: s_{0} \rightsquigarrow x}\prod_{t=0}^{T_{\tau}}\pi^{*}_{\mathrm{MaxEnt}}(s_{t+1}\mid s_{t})\\
        &= \sum_{\tau: s_{0} \rightsquigarrow x}\exp\left[\frac{1}{\alpha}\sum_{t=0}^{T_{\tau}}\big(Q^{*}_{\soft}(s_{t}, s_{t+1}) - V^{*}_{\soft}(s_{t})\big)\right]\\
        &= \sum_{\tau: s_{0} \rightsquigarrow x}\exp\left[\frac{1}{\alpha}\sum_{t=0}^{T_{\tau}}\big(r(s_{t}, s_{t+1}) + V^{*}_{\soft}(s_{t+1}) - V^{*}_{\soft}(s_{t})\big)\right]\\
        &= \sum_{\tau: s_{0} \rightsquigarrow x}\exp\Bigg[\frac{1}{\alpha}\Bigg(\sum_{t=0}^{T_{\tau}}r(s_{t}, s_{t+1}) + \underbrace{V^{*}_{\soft}(s_{f})}_{=\,0} - V^{*}_{\soft}(s_{0})\Bigg)\Bigg]\\
        &= \sum_{\tau: s_{0} \rightsquigarrow x}\exp\left[\frac{1}{\alpha}\left(-\gE(x) + \alpha\sum_{t=0}^{T_{\tau}-1}\log P_{B}(s_{t}\mid s_{t+1}) - V^{*}_{\soft}(s_{0})\right)\right]\\
        &= \exp\left[\frac{1}{\alpha}(-\gE(x) - V^{*}_{\soft}(s_{0}))\right]\underbrace{\sum_{\tau: s_{0} \rightsquigarrow x}\prod_{t=0}^{T_{\tau}-1}P_{B}(s_{t}\mid s_{t+1})}_{=\,1}\label{eq:proof-maxent-rl-unbiased-1}\\
        &= \frac{\exp(-\gE(x)/\alpha)}{\exp(V^{*}_{\soft}(s_{0})/\alpha)} \propto \exp(-\gE(x)/\alpha),
    \end{align}
    \endgroup
    where we used in \cref{eq:proof-maxent-rl-unbiased-1} the fact that $P_{B}$ induces a probability distribution over the complete trajectories leading to any terminating state $x$; see for example \citep[][Lemma 5]{bengio2023gflownetfoundations} for a proof of this result.
\end{proof}

\begin{figure}[hbtp]
    \vspace*{-3em}
    \centering
    \includegraphics{figures/residuals-equivalence}
    \caption{Summary of the equivalences between the MaxEnt RL (top, in each box) and GFlowNet (bottom, in each box) objectives, using the classification of \cref{fig:algrithms-equivalence}. All objectives can be written as $\gL(\cdot) = \frac{1}{2}\E_{\pi_{b}}[\Delta^{2}(\cdot)]$, where $\pi_{b}$ is a distribution over appropriate quantities (\emph{i.e.}, trajectories, or transitions). The \emph{terminal reward} setting corresponds to $r(s_{t}, s_{t+1}) = \alpha \log P_{B}(s_{t}\mid s_{t+1})$ \& $r(s_{T}, s_{f}) = -\gE(s_{T})$ (\cref{sec:equivalence-pcl-subtb}), whereas the \emph{intermediate rewards} setting corresponds to $r(s_{t}, s_{t+1}) = -\gE(s_{t}\rightarrow s_{t+1}) + \alpha \log P_{B}(s_{t}\mid s_{t+1})$ (with $\gE(s_{t}\rightarrow s_{t+1}) = \gE(s_{t+1}) - \gE(s_{t})$ if $\gS \equiv \gX$, \cref{sec:sql-policy-parametrization}) \& $r(s_{T}, s_{f}) = 0$ (\cref{app:equivalence-sql-fl-db}).}
    \label{fig:residual-equivalence}
\end{figure}

\section{Equivalences between MaxEnt RL \& GFlowNet objectives}
\label{app:equivalences-maxentrl-gfn-objectives}
In this section, we detail all of our new results establishing equivalences between MaxEnt RL and GFlowNet objectives, along with their proofs. We summarize the results with links to the propositions and proofs in \cref{tab:equivalences-summary}. All the objectives considered in this paper take the form of an (expected) least-squares objective $\gL(\cdot) = \frac{1}{2}\E_{\pi_{b}}[\Delta^{2}(\cdot)]$, where $\Delta(\cdot)$ is a residual term that is algorithm-dependent, and $\pi_{b}$ is a distribution over appropriate quantities (\emph{i.e.}, trajectories, or transitions); see \cref{sec:equivalence-pcl-subtb} for an example with the PCL \& SubTB objectives. In this section, we will work exclusively with residuals for simplicity, instead of the objectives themselves. We summarize the different residuals and their correspondences in \cref{fig:residual-equivalence}.
\begin{table}[h]
    \centering
    \caption{Summary of the equivalence results and their proofs in \cref{app:equivalences-maxentrl-gfn-objectives}.}
    \begin{tabular}{llll}
        \toprule
        MaxEnt RL & GFlowNet & Proposition & Proof\\
        \midrule
        PCL \citep{nachum2017pcl} & SubTB \citep{madan2022subtb} & \cref{prop:equivalence-subtb-pcl} & App.~\ref{app:equivalence-pcl-subtb} \\
        SQL \citep{haarnoja2017sql} & DB \citep{bengio2023gflownetfoundations} & \cref{cor:equivalence-db-sql} & App.~\ref{app:equivalence-sql-db} \\
        SQL${}^{\star}$ \citep{haarnoja2017sql} & FL-DB \citep{pan2023forwardlookinggfn} & \cref{prop:equivalence-sql-fl-db} & App.~\ref{app:equivalence-sql-fl-db} \\
        $\pi$-SQL${}^{\star}$ (\cref{sec:sql-policy-parametrization}) & M-DB \citep{deleu2022daggflownet} & \cref{prop:equivalence-pisql-modified-db} & App.~\ref{app:equivalence-pisql-mdb} \\
        \bottomrule
    \end{tabular}
    \label{tab:equivalences-summary}
\end{table}

\subsection{Equivalence between PCL \& SubTB}
\label{app:equivalence-pcl-subtb}

\equivsubtbpcl*
\begin{proof}
    \hypertarget{proof:equivalence-subtb-pcl}{}%
    Let $\tau = (s_{m}, s_{m+1}, \ldots, s_{n})$ be a subtrajectory, where $s_{n}$ may be the terminal state $s_{f}$. We first recall the definitions of the Path Consistency Learning (PCL; \citealp{nachum2017pcl}) and the Subtrajectory Balance (SubTB; \citealp{madan2022subtb,malkin2022trajectorybalance}) objectives. On the one hand, the PCL objective encourages the consistency between a policy $\pi_{\theta}$ parametrized by $\theta$ and a value function $V^{\phi}_{\soft}$ parametrized by $\phi$:
    \begin{equation}
         \Delta_{\mathrm{PCL}}(\tau; \theta, \phi) = -V_{\soft}^{\phi}(s_{m}) + V_{\soft}^{\phi}(s_{n}) + \sum_{t=m}^{n-1}\big(r(s_{t}, s_{t+1}) - \alpha \log \pi_{\theta}(s_{t+1}\mid s_{t})\big).
    \end{equation}
    On the other hand, the SubTB objective also encourages some form of consistency, but this time between a policy (forward transition probability) $P_{F}^{\theta}$ parametrized by $\theta$ and a flow function $F_{\phi}$ parametrized by $\phi$. We will give the form of its residual further down, as it depends on the trajectory $\tau$. Finally, recall that the reward function of the soft MDP is defined by \cref{eq:distribution-corrected-reward-sparse}, in order to satisfy the reward correction necessary for the application of \cref{thm:maxent-rl-unbiased}, following the same decomposition as in \citet{tiapkin2023gfnmaxentrl}:
    \begin{align}
        r(s_{t}, s_{t+1}) &= \alpha \log P_{B}(s_{t}\mid s_{t+1}) & r(s_{T}, s_{f}) &= -\gE(s_{T}).
    \end{align}
    In order to show the equivalence between $\gL_{\mathrm{PCL}}$ and $\gL_{\mathrm{SubTB}}$, we only need to show equivalence of their corresponding residuals, by replacing the reward by its definition above. We will use the correspondence in \cref{eq:equivalence-pcl-subtb-policy-value} between the policy/value function of PCL and the policy/flow function of SubTB. We consider two cases:
    \begin{itemize}[leftmargin=*]
        \item If $s_{n} \neq s_{f}$ is not the terminal state, then the residual for SubTB can be written as
        \begin{equation}
            \Delta_{\mathrm{SubTB}}(\tau;\theta, \phi) = \log \frac{F_{\phi}(s_{n})\prod_{t=m}^{n-1}P_{B}(s_{t}\mid s_{t+1})}{F_{\phi}(s_{m})\prod_{t=m}^{n-1}P_{F}^{\theta}(s_{t+1}\mid s_{t})},
        \end{equation}
        where $P_{B}$ is a backward transition probability. Although it is in general possible to learn $P_{B}$ \citep{malkin2022trajectorybalance}, we will consider it fixed here. Substituting \cref{eq:equivalence-pcl-subtb-policy-value} into the residual $\Delta_{\mathrm{PCL}}$:
        \begin{align}
            \Delta_{\mathrm{PCL}}&(\tau; \theta, \phi) = -V^{\phi}_{\soft}(s_{m}) + V_{\soft}^{\phi}(s_{n}) + \alpha \sum_{t=m}^{n-1}\big(\log P_{B}(s_{t}\mid s_{t+1}) - \log \pi_{\theta}(s_{t+1}\mid s_{t})\big)\nonumber\\
            &= -\alpha \log F_{\phi}(s_{m}) + \alpha \log F_{\phi}(s_{n}) + \alpha \sum_{t=m}^{n-1}\big(\log P_{B}(s_{t}\mid s_{t+1}) - \log P_{F}^{\theta}(s_{t+1}\mid s_{t})\big)\nonumber\\
            &= \alpha \log \frac{F_{\phi}(s_{n})\prod_{t=m}^{n-1}P_{B}(s_{t}\mid s_{t+1})}{F_{\phi}(s_{m})\prod_{t=m}^{n-1}P_{F}^{\theta}(s_{t+1}\mid s_{t})} = \alpha \Delta_{\mathrm{SubTB}}(\tau;\theta, \phi).
        \end{align}
        \item If $s_{n} = s_{f}$, then the residual $\Delta_{\mathrm{SubTB}}$ appearing in the Subtrajectory Balance objective must be written as
        \begin{equation}
            \Delta_{\mathrm{SubTB}}(\tau;\theta, \phi) = \log \frac{\exp(-\gE(s_{n-1})/\alpha)\prod_{t=m}^{n-2}P_{B}(s_{t}\mid s_{t+1})}{F_{\phi}(s_{m})\prod_{t=m}^{n-1}P_{F}^{\theta}(s_{t+1}\mid s_{t})},
        \end{equation}
        since the boundary conditions must also be enforced \citep{malkin2022trajectorybalance}. Moreover, by definition of the value function, we can also enforce that $V_{\soft}^{\phi}(s_{f}) = 0$. Therefore
        \begin{align}
            \Delta_{\mathrm{PCL}}&(\tau; \theta, \phi) = -V_{\soft}^{\phi}(s_{m}) + V_{\soft}^{\phi}(s_{f}) - \big(\gE(s_{n-1}) + \alpha \log \pi_{\theta}(s_{f}\mid s_{n-1})\big)\nonumber\\
            &\qquad \qquad + \alpha \sum_{t=m}^{n-2}\big(\log P_{B}(s_{t}\mid s_{t+1}) - \log \pi_{\theta}(s_{t+1}\mid s_{t})\big)\\
            &= -\alpha \log F_{\phi}(s_{m}) - \gE(s_{n-1}) - \alpha \log P_{F}^{\theta}(s_{f}\mid s_{n-1})\nonumber\\
            &\qquad \qquad + \alpha \sum_{t=m}^{n-2}\big(\log P_{B}(s_{t}\mid s_{t+1}) - \log P_{F}^{\theta}(s_{t+1}\mid s_{t})\big)\\
            &= \alpha \Delta_{\mathrm{SubTB}}(\tau;\theta, \phi).
        \end{align}
        Note that the Trajectory Balance objective, operating only at the level of complete trajectories \citep{malkin2022trajectorybalance}, corresponds to this case where $s_{m} = s_{0}$ is the initial state.
    \end{itemize}
    This concludes the proof, showing that $\gL_{\mathrm{PCL}}(\theta, \phi) = \alpha^{2}\gL_{\mathrm{SubTB}}(\theta, \phi)$.
\end{proof}

\subsection{Equivalence between SQL \& DB}
\label{app:equivalence-sql-db}
The following result establishing the equivalence between the objectives in SQL and DB can be seen as a direct consequence of \cref{prop:equivalence-subtb-pcl}, under the Unified PCL perspective of \citet{nachum2017pcl}. We state and prove this as a standalone result for completeness.
\begin{corollary}
    The Detailed Balance objective (GFlowNet; \citealp{bengio2023gflownetfoundations}) is proportional to the Soft Q-Learning objective (MaxEnt RL; \citealp{haarnoja2017sql}) on the soft MDP with the reward function defined in \cref{eq:distribution-corrected-reward-sparse}, in the sense that $\gL_{\mathrm{SQL}}(\theta) = \alpha^{2}\gL_{\mathrm{DB}}(\theta)$, with the following correspondence
    \begin{align}
        F_{\theta}(s) &= \sum_{s''\in\mathrm{Ch}(s)}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s'')\right) & P_{F}^{\theta}(s'\mid s) &\propto \exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s')\right).
        \label{eq:equivalence-db-sql-correspondence}
    \end{align}
    \label{cor:equivalence-db-sql}
\end{corollary}
\begin{proof}
    \hypertarget{proof:equivalence-db-sql}{}%
    Let $s \rightarrow s'$ be a transition in the soft MDP, where $s'$ may be the terminal state. Recall that the residual in the SQL objective depends on a Q-function $Q^{\theta}_{\mathrm{soft}}$ parametrized by $\theta$:
    \begin{align}
        \Delta_{\mathrm{SQL}}(s, s'; \theta) &= Q_{\soft}^{\theta}(s, s') - \big(r(s, s') + V_{\soft}^{\theta}(s')\big),\\
        \mathrm{where}\quad V_{\soft}^{\theta}(s') &\triangleq \alpha \log \sum_{s''\in \mathrm{Ch}(s')}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s', s'')\right).
    \end{align}
    In the case where $s' = s_{f}$ is the terminal state, then $V^{\theta}_{\mathrm{soft}}(s') = 0$. On the other hand, the exact form of the residual in the DB objective will be given further down, but always depends on a forward transition probability $P_{F}^{\theta}$ and a state flow function $F_{\theta}$, which we assume are sharing parameters $\theta$. We consider two cases:
    \begin{itemize}[leftmargin=*]
        \item If $s' \neq s_{f}$ is not the terminal state, then the residual in the DB objective is given by
        \begin{equation}
            \Delta_{\mathrm{DB}}(s, s';\theta) = \log \frac{F_{\theta}(s)P_{F}^{\theta}(s'\mid s)}{F_{\theta}(s')P_{B}(s\mid s')}.
        \end{equation}
        With the definition of the reward function in \cref{eq:distribution-corrected-reward-sparse}, we know that $r(s, s') = \alpha \log P_{B}(s\mid s')$. We can therefore show that the residuals of SQL and DB are proportional to one another:
        \begin{align}
            \Delta&_{\mathrm{SQL}}(s, s';\theta) = Q^{\theta}_{\mathrm{soft}}(s, s') - r(s, s') - \alpha \log \sum_{s''\in\mathrm{Ch}(s')}\exp\left[\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s', s'')\right]\\
            &= Q_{\mathrm{soft}}^{\theta}(s, s') - \alpha \log P_{B}(s\mid s') - \alpha \log \sum_{s''\in\mathrm{Ch}(s')}\exp\left[\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s', s'')\right]\\
            &= \alpha \log\left(\exp\left(\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s, s')\right)\right) - \alpha \log P_{B}(s\mid s') - \alpha \log \sum_{s''\in\mathrm{Ch}(s')}\exp\left[\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s', s'')\right]\nonumber\\
            &\quad - \alpha \log \sum_{s''\in\mathrm{Ch}(s)}\exp\left[\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s, s'')\right] + \alpha \log \sum_{s''\in\mathrm{Ch}(s)}\exp\left[\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s, s'')\right]\\
            &= \alpha \Big[\log P_{F}^{\theta}(s'\mid s) + \log F_{\theta}(s) - \log P_{B}(s\mid s') - \log F_{\theta}(s')\Big]\\
            &= \alpha \Delta_{\mathrm{DB}}(s, s';\theta),
        \end{align}
        where we used the following correspondence between $Q_{\mathrm{soft}}^{\theta}$, $P_{F}^{\theta}$, and $F_{\theta}$:
        \begin{align}
            F_{\theta}(s) &= \sum_{s''\in\mathrm{Ch}(s)}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s'')\right) & P_{F}^{\theta}(s'\mid s) &\propto \exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s')\right).
        \end{align}

        \item If $s' = s_{f}$ is the terminal state, then the residual in the DB objective is
        \begin{equation}
            \Delta_{\mathrm{DB}}(s, s_{f};\theta) = \log \frac{F_{\theta}(s)P_{F}^{\theta}(s_{f}\mid s)}{\exp(-\gE(s)/\alpha)}.
        \end{equation}
        Again with our definition of the reward function of the soft MDP in \cref{eq:distribution-corrected-reward-sparse}, we know that the reward of the terminating transition is $r(s, s_{f}) = -\gE(s)$. We can therefore also show the relation between the two residuals in this case:
        \begin{align}
            \Delta_{\mathrm{SQL}}(s, s_{f};\theta) &= Q_{\mathrm{soft}}^{\theta}(s, s_{f}) - r(s, s_{f})\\
            &= Q_{\mathrm{soft}}^{\theta}(s, s_{f}) + \gE(s)\\
            &= \alpha \log \left(\exp\left(\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s, s_{f})\right)\right) + \gE(s)\\
            &\quad - \alpha \log \sum_{s''\in\mathrm{Ch}(s)}\exp\left[\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s, s'')\right] + \alpha \log \sum_{s''\in\mathrm{Ch}(s)}\exp\left[\frac{1}{\alpha}Q_{\mathrm{soft}}^{\theta}(s, s'')\right]\nonumber\\
            &= \alpha \left[\log P_{F}^{\theta}(s_{f}\mid s) + \log F_{\theta}(s) + \frac{\gE(s)}{\alpha}\right]\\
            &= \alpha \Delta_{\mathrm{DB}}(s, s_{f};\theta),
        \end{align}
        where we used the same correspondence between $Q_{\mathrm{soft}}^{\theta}$, $P_{F}^{\theta}$, and $F_{\theta}$ as above.
    \end{itemize}
    This concludes the proof, showing that $\gL_{\mathrm{SQL}}(\theta) = \alpha^{2}\gL_{\mathrm{DB}}(\theta)$.
\end{proof}
Note that \citet{tiapkin2023gfnmaxentrl} established a similar connection between SQL and DB through a dueling architecture perspective \citep{wang2016dueling}, where the Q-function must be decomposed as (with notation adapted to this paper)
\begin{equation}
    Q^{\theta}(s, s') = V^{\theta}(s) + A^{\theta}(s, s') - \log \sum_{s''\in\mathrm{Ch}(s)}\exp\big(A^{\theta}(s, s'')\big),
\end{equation}
where $A^{\theta}(s, s')$ is an advantage function. The connection they establish is then between $V^{\theta}$ and the state flow on the one hand, and $A^{\theta}$ and the policy on the other hand, via
\begin{align}
    \log F_{\theta}(s) &= V^{\theta}(s) = \log \sum_{s''\in\mathrm{Ch}(s)}\exp\big(Q^{\theta}(s, s'')\big)\label{eq:tiapkin-correspondance-sql-db-1}\\
    \log P_{F}^{\theta}(s'\mid s) &= A^{\theta}(s, s') - \log \sum_{s''\in\mathrm{Ch}(s)}\exp\big(A^{\theta}(s, s'')\big). \label{eq:tiapkin-correspondance-sql-db-2}
\end{align}
This differs from our result in that \cref{cor:equivalence-db-sql} does not explicitly require a separate advantage function. However, we note that both results are effectively equivalent to one another, since \cref{eq:tiapkin-correspondance-sql-db-1} directly corresponds to \cref{eq:equivalence-db-sql-correspondence} in \cref{cor:equivalence-db-sql} (with $\alpha = 1$), and starting from \cref{eq:tiapkin-correspondance-sql-db-2}:
\begin{align}
    \log P_{F}^{\theta}(s'\mid s) &= A^{\theta}(s, s') - \log \sum_{s''\in\mathrm{Ch}(s)}\exp\big(A^{\theta}(s, s'')\big)\\
    &= \Bigg[Q^{\theta}(s, s') - V^{\theta}(s) + \log \sum_{s''\in\mathrm{Ch}(s)}\exp\big(A^{\theta}(s, s'')\big)\Bigg] - \log \sum_{s''\in\mathrm{Ch}(s)}\exp\big(A^{\theta}(s, s'')\big)\nonumber\\
    &= Q^{\theta}(s, s') - V^{\theta}(s) = Q^{\theta}(s, s') - \log \sum_{s''\in\mathrm{Ch}(s)}\exp\big(Q^{\theta}(s, s'')\big),
\end{align}
which also corresponds to \cref{eq:equivalence-db-sql-correspondence} in \cref{cor:equivalence-db-sql}. The connection through a dueling architecture is closer to DB when the policy and the state flow network are parametrized by two separate networks.

\subsection{Equivalence between \texorpdfstring{$\pi$}{pi}-SQL and Modified DB}
\label{app:equivalence-pisql-mdb}

\begin{proposition}
    \label{prop:policy-parametrization-sql}
    Assume that all the states of the soft MDP are terminating (\emph{i.e.}, connected to the terminal state $s_{f}$), and such that for all $s\in\gS$, the reward function satisfies $r(s, s_{f}) = 0$. Then the objective of Soft Q-Learning \citep{haarnoja2017sql} can be written as a function of a policy $\pi_{\theta}$ parametrized by $\theta$. This objective is given by $\gL_{\pi\textrm{-}\mathrm{SQL}}(\theta) = \frac{1}{2}\E_{\pi_{b}}[\Delta_{\pi\textrm{-}\mathrm{SQL}}^{2}(s, s'; \theta)]$, where $\pi_{b}$ is an arbitrary policy over transitions $s\rightarrow s'$ such that $s' \neq s_{f}$, and
    \begin{equation}
        \Delta_{\pi\textrm{-}\mathrm{SQL}}(s, s'; \theta) = \alpha\big[\log \pi_{\theta}(s'\mid s) - \log \pi_{\theta}(s_{f}\mid s) + \log \pi_{\theta}(s_{f}\mid s')\big] - r(s, s').
    \end{equation}
\end{proposition}

\begin{proof}
    Recall that the objective of Soft Q-Learning can be written in terms of a Q-function $Q_{\soft}^{\theta}$ parametrized by $\theta$ as $\gL_{\mathrm{SQL}}(\theta) = \frac{1}{2}\E_{\pi_{b}}[\Delta_{\mathrm{SQL}}^{2}(s, s'; \theta)]$, with
    \begin{align}
        \Delta_{\mathrm{SQL}}(s, s'; \theta) &= Q_{\soft}^{\theta}(s, s') - \big(r(s, s') + V_{\soft}^{\theta}(s')\big),\label{eq:proof-residual-sql}\\
        \mathrm{where}\quad V_{\soft}^{\theta}(s') &\triangleq \alpha \log \sum_{s''\in \mathrm{Ch}(s')}\exp\left[\frac{1}{\alpha}Q_{\soft}^{\theta}(s', s'')\right].
    \end{align}
    Since we assume that $r(s, s_{f}) = 0$, we can enforce the fact that $Q_{\soft}^{\theta}(s, s_{f}) = 0$ in our parametrization of the Q-function. If we define a policy $\pi_{\theta}$ as
    \begin{equation}
        \pi_{\theta}(s'\mid s) \triangleq \exp\left[\frac{1}{\alpha}\big(Q_{\soft}^{\theta}(s, s') - V_{\soft}^{\theta}(s)\big)\right],
    \end{equation}
    then we have in particular $\pi_{\theta}(s_{f}\mid s) = \exp(-V_{\soft}^{\theta}(s)/\alpha)$, based on our observation above. Moreover, we can write the difference in value functions appearing in \cref{eq:proof-residual-sql} as
    \begin{align}
        Q_{\soft}^{\theta}(s, s') - V_{\soft}^{\theta}(s') &= Q_{\soft}^{\theta}(s, s') - V_{\soft}^{\theta}(s) + V_{\soft}^{\theta}(s) - V_{\soft}^{\theta}(s')\\
        &= \alpha\big[\log \pi_{\theta}(s'\mid s) - \log \pi_{\theta}(s_{f}\mid s) + \log \pi_{\theta}(s_{f}\mid s')\big],
    \end{align}
    which concludes the proof.
\end{proof}

\equivpisqlmodifieddb*
\begin{proof}
    \hypertarget{proof:equivalence-pisql-modified-db}{}%
    Let $s \rightarrow s'$ be a transition in the soft MDP. In order to show the equivalence between $\gL_{\pi\textrm{-}\mathrm{SQL}}$ and $\gL_{\mathrm{M}\textrm{-}\mathrm{DB}}$, it is sufficient to show the equivalence between their corresponding residuals \cref{eq:residual-pisql} and \cref{eq:residual-modified-db}. Recall that the reward function in \cref{eq:distribution-corrected-reward-dense} is defined by
    \begin{align}
        r(s_{t}, s_{t+1}) &= \gE(s_{t}) - \gE(s_{t+1}) + \alpha \log P_{B}(s_{t}\mid s_{t+1}) & r(s_{T}, s_{f}) &= 0.
    \end{align}
    Replacing the reward in the residual $\Delta_{\pi\textrm{-}\mathrm{SQL}}$, we get
    \begin{align*}
        \Delta_{\pi\textrm{-}\mathrm{SQL}}&(s, s';\theta) = \alpha\big[\log \pi_{\theta}(s'\mid s) - \log \pi_{\theta}(s_{f}\mid s) + \log \pi_{\theta}(s_{f}\mid s')\big] - r(s, s')\\
        &= \alpha \big[\log \pi_{\theta}(s'\mid s) - \log \pi_{\theta}(s_{f}\mid s) + \log \pi_{\theta}(s_{f}\mid s')\big] - \big[\gE(s) - \gE(s') + \alpha \log P_{B}(s\mid s')\big]\\
        &= \alpha \left[-\frac{\gE(s)}{\alpha} + \log \pi_{\theta}(s'\mid s) + \log \pi_{\theta}(s_{f}\mid s') + \frac{\gE(s')}{\alpha} - \log P_{B}(s\mid s') - \log \pi_{\theta}(s_{f}\mid s)\right]\\
        &= \alpha \left[-\frac{\gE(s)}{\alpha} + \log P_{F}^{\theta}(s'\mid s) + \log P_{F}^{\theta}(s_{f}\mid s') + \frac{\gE(s')}{\alpha} - \log P_{B}(s\mid s') - \log P_{F}^{\theta}(s_{f}\mid s)\right]\\
        &= -\alpha \log \frac{\exp(-\gE(s')/\alpha)P_{B}(s\mid s')P_{F}^{\theta}(s_{f}\mid s)}{\exp(-\gE(s)/\alpha)P_{F}^{\theta}(s'\mid s)P_{F}^{\theta}(s_{f}\mid s')} = -\alpha \Delta_{\mathrm{M}\textrm{-}\mathrm{DB}}(s, s'; \theta).
    \end{align*}
    This concludes the proof, showing that $\gL_{\pi\textrm{-}\mathrm{SQL}}(\theta) = \alpha^{2}\gL_{\mathrm{M}\textrm{-}\mathrm{DB}}(\theta)$.
\end{proof}

\subsection{Equivalence between SQL and Forward-Looking DB}
\label{app:equivalence-sql-fl-db}
We will now generalize the result of \cref{prop:equivalence-pisql-modified-db} to the case where $\gX \not\equiv \gS$, but where intermediate rewards are still available along the trajectory. We will assume that for any complete trajectory $\tau = (s_{0}, s_{1}, \ldots, s_{T}, s_{f})$, the energy function at $s_{T}$ can be decomposed into a sum of intermediate rewards \citep{pan2023forwardlookinggfn}
\begin{equation}
    \gE(s_{T}) = \sum_{t=0}^{T-1}\gE(s_{t}\rightarrow s_{t+1}),
\end{equation}
where we overload the notation $\gE$ for simplicity. In that case, we can define the corrected reward as follows in order to satisfy the conditions of \cref{thm:maxent-rl-unbiased}
\begin{align}
    r(s_{t}, s_{t+1}) &= -\gE(s_{t}\rightarrow s_{t+1}) + \alpha \log P_{B}(s_{t}\mid s_{t+1}) &&& r(s_{T}, s_{f}) &= 0.
    \label{eq:reward-shaping-intermediate}
\end{align}
This type of reward shaping is similar to the one introduced in \cref{sec:sql-policy-parametrization}. The Forward-Looking Detailed Balance loss (FL-DB; \citealp{pan2023forwardlookinggfn}) is defined similarly to DB, with the exception that the flow function corresponds to the unknown offset relative to $\gE(s_{t}\rightarrow s_{t+1})$, which is known and therefore does not need to be learned. For some transition $s\rightarrow s'$ such that $s'\neq s_{f}$, the corresponding residual can be written as
\begin{equation}
    \Delta_{\mathrm{FL}\textrm{-}\mathrm{DB}}(s, s'; \theta) = \log \frac{\tilde{F}_{\theta}(s')P_{B}(s\mid s')}{\tilde{F}_{\theta}(s)P_{F}^{\theta}(s'\mid s)} - \frac{\gE(s\rightarrow s')}{\alpha},
    \label{eq:fl-db-residual}
\end{equation}
where $P_{F}^{\theta}$ is the policy (forward transition probability), and $\tilde{F}_{\theta}$ is an offset state-flow function, parametrized by $\theta$. Note that with FL-DB, there is no longer an explicit residual for the boundary condition, unlike in DB, since this is captured through \cref{eq:fl-db-residual} already. The following proposition establishes an equivalence between SQL and FL-DB, similar to \cref{cor:equivalence-db-sql} \& \cref{prop:equivalence-pisql-modified-db}.

\begin{proposition}
    \label{prop:equivalence-sql-fl-db}
    The Forward-Looking Detailed Balance objective (GFlowNet; \citealp{pan2023forwardlookinggfn}) is proportional to the Soft Q-Learning objective (MaxEnt RL; \citealp{haarnoja2017sql}) on the soft MDP with the reward function defined in \cref{eq:reward-shaping-intermediate}, in the sense that $\gL_{\mathrm{SQL}}(\theta) = \alpha^{2}\gL_{\mathrm{FL}\textrm{-}\mathrm{DB}}(\theta)$, with the following correspondence
    \begin{align}
        \tilde{F}_{\theta}(s) &= \sum_{s''\in\mathrm{Ch}(s)}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s'')\right) &&& P_{F}^{\theta}(s'\mid s) \propto \exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s')\right).
        \label{eq:correspondence-sql-fl-db}
    \end{align}
\end{proposition}

\begin{proof}
    The proof is similar to the one in \cref{app:equivalence-sql-db}. Let $s\rightarrow s'$ be a transition in the soft MDP, where $s'\neq s_{f}$. Recall that the residual in the SQL objective is
    \begin{align}
        \Delta_{\mathrm{SQL}}(s, s'; \theta) &= Q_{\soft}^{\theta}(s, s') - \big(r(s, s') + V_{\soft}^{\theta}(s')\big),\\
        \mathrm{where}\quad V_{\soft}^{\theta}(s') &\triangleq \alpha \log \sum_{s''\in \mathrm{Ch}(s')}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s', s'')\right).
    \end{align}
    With our choice of reward function in \cref{eq:reward-shaping-intermediate}, we know that $r(s, s') = -\gE(s\rightarrow s') + \alpha \log P_{B}(s\mid s')$. We can therefore show that the residuals of SQL and FL-DB are proportional to one another:
    \begin{align}
        \Delta_{\mathrm{SQL}}&(s, s';\theta) = Q_{\soft}^{\theta}(s, s') - \big(r(s, s') + V_{\soft}^{\theta}(s')\big)\\
        &= Q_{\soft}^{\theta}(s, s') + \gE(s \rightarrow s') - \alpha \log P_{B}(s\mid s') - \alpha \log \sum_{s''\in\mathrm{Ch}(s')}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s', s'')\right)\\
        &= Q_{\soft}^{\theta}(s, s') - \alpha\log \sum_{s''\in\mathrm{Ch}(s)}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s'')\right) + \gE(s\rightarrow s') - \alpha \log P_{B}(s\mid s')\nonumber\\
        &\qquad + \alpha \log \sum_{s''\in\mathrm{Ch}(s)}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s, s'')\right) - \alpha \log \sum_{s''\in\mathrm{Ch}(s')}\exp\left(\frac{1}{\alpha}Q_{\soft}^{\theta}(s', s'')\right)\\
        &= \alpha \Bigg[\log P_{F}^{\theta}(s'\mid s) - \log P_{B}(s\mid s') + \log \tilde{F}_{\theta}(s) - \log \tilde{F}_{\theta}(s') + \frac{\gE(s\rightarrow s')}{\alpha}\Bigg]\\
        &= -\alpha \Delta_{\mathrm{FL}\textrm{-}\mathrm{DB}}(s, s';\theta),
    \end{align}
    where we used the correspondence between $\tilde{F}_{\theta}$, $P_{F}^{\theta}$, and $Q_{\soft}^{\theta}$ from \cref{eq:correspondence-sql-fl-db}. This concludes the proof, showing that $\gL_{\mathrm{SQL}}(\theta) = \alpha^{2}\gL_{\mathrm{FL}\textrm{-}\mathrm{DB}}(\theta)$.
\end{proof}
Interestingly, the correspondence in \cref{eq:correspondence-sql-fl-db} between state flows and policy on the one hand (GFlowNet) and the Q-function on the other hand (MaxEnt RL) is exactly the same as the one in \cref{cor:equivalence-db-sql}.

\section{Experimental details}
\label{app:experimental-details}
For all algorithms and all environments (unless stated otherwise), we kept the same exploration schedule, frequency of update of the target network (if applicable), and replay buffer, in order to avoid attributing favorable performance to any of those components. Exploration was done using a naive $\varepsilon$-sampling scheme, where actions were sampled from the current policy with probability $1 - \varepsilon$, and uniformly at random with probability $\varepsilon$. All algorithms were trained over 100k iterations, and $\varepsilon$ was decreased over the first 50k iterations from $\varepsilon=1$ to $\varepsilon=0.1$. All algorithms use a target network, except TB/PCL, and the target network was updated every 1000 iterations. We used a simple circular buffer with 100k capacity for all algorithms (TB/PCL using a buffer of trajectories, as opposed to a buffer of transitions). Hyperparameter search was conducted for all environments over the learning rate alone (for all the networks), using a simple grid search.

\subsection{Probabilistic inference over discrete factor graphs}
\label{app:details-treesample}
Given a factor graph with a fixed structure, over $d$ random variables $(V_{1}, \ldots, V_{d})$, the objective is to sample a complete assignment $\vv$ of these variables from the Gibbs distribution $P(\vv) \propto \exp(-\gE(\vv))$, where the energy function is defined by
\begin{equation}
    \gE(v_{1}, \ldots, v_{d}) = -\sum_{m=1}^{M}\psi_{m}(\vv_{[m]}),
    \label{eq:treesample-energy}
\end{equation}
where $\psi_{m}$ is the $m$th factor in the factor graph, and $\vv_{[m]}$ represents the values of the variables that are part of this factor. The factors $\psi_{m}$ are fixed, and randomly generated using the same process as \citet{buesing2020approximate}. Each variable $V_{i}$ is assumed to be discrete and can take one of $K$ possible values. Overall, this means that the number of elements in the sample space is $K^{d}$.

A state of the soft MDP is a (possibly partial) assignment of the values, \emph{e.g.}, $(0, \cdot, 1, 0, \cdot, \cdot)$, where $\cdot$ represents a variable which has not been assigned a value yet. The initial state $s_{0} = (\cdot, \cdot, \ldots, \cdot)$ is the state where no variable has an assigned value. An action consists in picking one variable that has no value (marked with a $\cdot$), and assigning it one of $K$ values. The process terminates when all the variables have been assigned a value, meaning that all the complete trajectories have length $d$. This differs from the MDP of \citet{buesing2020approximate}, since they were assigning the values of the variables in a fixed order determined ahead of time, which gives their MDP a tree structure.

\citet{buesing2020approximate} defined an intermediate reward function corresponding to a decomposition of the energy \cref{eq:treesample-energy} as $\gE(\vv) = \sum_{t=0}^{d-1} \gE(s_{t} \rightarrow s_{t+1})$, where $\vv$ is the terminating state of a complete trajectory $(s_{0}, s_{1}, \ldots, s_{d}, s_{f})$ (\emph{i.e.}, $s_{d} = \vv$), and with the partial energies defined as
\begin{equation}
    \gE(s_{t} \rightarrow s_{t+1}) = -\sum_{m=1}^{M}\psi_{m}(\vv_{[m]})\mathbbm{1}(i \in [m]\ \mathrm{\&}\ \vv_{[m]} \subseteq s_{t+1}),
    \label{eq:treesample-partial-energy}
\end{equation}
if the transition $s_{t} \rightarrow s_{t+1}$ corresponds to assigning the value of a particular variable $V_{i}$. In other words, the partial energy corresponds to computing all the factors as soon as all the necessary information is available (\emph{i.e.}, all the values of the input variables of the factors have been assigned, and $V_{i}$ that was just assigned a value is one of the input variables of the factors).

\subsection{Structure learning of Bayesian Networks}
\label{app:details-dag-gfn}
Given $d$ continuous random variables $(X_{1}, \ldots, X_{d})$, a DAG $G$ and parameters $\theta$, a Bayesian Network represents the conditional independences in the joint distribution based on the structure of $G$
\begin{equation}
    P(X_{1}, \ldots, X_{d}; \theta) = \prod_{j=1}^{d}P\big(X_{j}\mid \mathrm{Pa}_{G}(X_{j}); \theta_{j}\big),
\end{equation}
where $\mathrm{Pa}_{G}(X_{j})$ represents the parent variables of $X_{j}$ in $G$. We assume that all conditional distributions are linear-Gaussian. The objective of Bayesian structure learning is to approximate the posterior distribution over DAGs: $P(G\mid \gD) \propto P(\gD\mid G)P(G)$, where $P(\gD\mid G)$ is the marginal likelihood and $P(G)$ is a prior over graphs, assumed to be uniform here. Our experiments vary in the way the marginal likelihood is computed, either based on the BGe score \citep{geiger1994bge}, or the linear Gaussian score \citep{nishikawa2022vbg}. We followed the experimental setup of \citet{deleu2022daggflownet}, where data is generated from a randomly generated ground-truth Bayesian network $G^{*}$, sampled using an Erd\H{o}s--R\'{e}nyi scheme with on average 1 edge per node. We generated 100 observations from this Bayesian Network using ancestral sampling. We repeated this process for 20 different random seeds.

A state of the soft MDP corresponds to a DAG $G$ over $d$ nodes, and the initial state is the empty graph over $d$ nodes. An action consists in adding a directed edge between two nodes, such that it is not already present in the graph, and it does not introduce a cycle, guaranteeing that all the states of the MDP are valid acyclic graphs; there is a special action indicating whether we want to terminate and transition to $s_{f}$. Since all the states are valid DAGs, this means that all the states of the MDP are terminating, and we can use $r(G_{t}, G_{t+1}) = \gE(G_{t}) - \gE(G_{t+1})$ as the intermediate reward, with the appropriate energy function $\gE(G) = \log P(G_{0}, \gD) - \log P(G, \gD)$. \citet{deleu2022daggflownet} showed that the difference in energies can be efficiently computed using the delta-score \citep{friedman2003ordermcmc}.

\subsection{Phylogenetic tree generation}
\label{app:details-phylogfn}
We consider the environment introduced by \citet{zhou2024phylogfn}, where phylogenetic trees used for the analysis of the evolution of a group of $d$ species are generated, according to a parsimonious criterion. Indeed, trees encoding few mutations are favored as they are more likely to represent realistic relationships. Given a tree $T$, whose nodes are the species of interest, the target distribution is given by
\begin{equation}
    P(T) \propto \exp(-M(T\mid \mY) / C),
\end{equation}
where $C = 4$ is a fixed constant, and $M(T\mid \mY)$ is the total number of mutations (also known as the \emph{parsimony score}), based on the biological sequences $\mY$ associated with each species; note that for convenience, we treat the energy function as being $\gE(T) = M(T\mid \mY)/C$ (with $\alpha = 1$). We used 6 out of the 8 datasets considered by \citet{zhou2024phylogfn}; the statistics of the datasets are recalled in \cref{tab:datasets-phylogfn} for completeness.

\begin{table}[ht]
    \centering
    \caption{Statistics of the datasets used in the phylogenetic tree generation task. ``Length'' represents the length of the biological sequence (\emph{e.g.}, the DNA sequence) of each species. See \citep{zhou2024phylogfn} for details and references about these datasets. The number of species represents the number of nodes in the tree, and is a measure of complexity of the task.}
    \begin{tabular}{lll}
    \toprule
    Dataset & \# Species ($d$) & Length\\
    \midrule
    DS1 & 27 & 1949\\
    DS2 & 29 & 2520\\
    DS3 & 36 & 1812\\
    DS4 & 41 & 1137\\
    DS5 & 50 & 378\\
    DS6 & 50 & 1133\\
    \bottomrule
    \end{tabular}
    \label{tab:datasets-phylogfn}
\end{table}

A state of the soft MDP is a collection of trees over a partition of all the species (the leaves of the trees are species), where the initial state corresponds to $d$ trees with a single node (leaf), one for each species in the group. An action consists in picking two trees, and merging them by adding a root. The process terminates when there is only one tree left in this collection, meaning that all complete trajectories have the same length $d-1$. The size of the sample space is $(2d - 3)!!$ (for $d \geq 2$).

To decompose the energy function into $\gE(T) = \sum_{t=0}^{d-2}\gE(s_{t} \rightarrow s_{t+1})$, where $T$ is the terminating state of a complete trajectory $(s_{0}, s_{1}, \ldots, s_{d-1}, s_{f})$, we can use the observation made by \citet{zhou2024phylogfn} that the total number of mutations $M(T\mid \mY)$ can be decomposed as the sum of (1) the number of mutations at the root of the tree, and (2) the total number of mutations in the left and right subtrees. Therefore, we can use the number of mutations at the new root of the tree constructed during the transition $s_{t} \rightarrow s_{t+1}$ as the intermediate energy $\gE(s_{t}\rightarrow s_{t+1})$ (appropriately rescaled by $C$), which can be computed using the Fitch algorithm \citep{zhou2024phylogfn}.

In addition to \cref{fig:phylogfn}, we also provide a complete view of the correlation between the terminating state log-probabilities and the returns for all algorithms and all datasets in \cref{fig:phylo_grid1_3,fig:phylo_grid4_6}. Each point corresponds to a tree sampled using the terminating state distribution found by the corresponding algorithm.

\begin{figure}[htbp]
    \centering
    \vspace*{-2em}
    \includegraphics[width=.8\textwidth]{figures/grid_DS1-3.pdf}
    \caption{Correlation plots for all algorithms. Rows from top to bottom: Trajectory Balance (TB), Path Consistency Learning (PCL), Detailed Balance (DB), Soft Q-Learning (SQL) and SAC. Columns show DS1 to DS3 from left to right.}
    \label{fig:phylo_grid1_3}
\end{figure}

\begin{figure}[htbp]
    \centering
    \vspace*{-1em}
    \includegraphics[width=0.8\textwidth]{figures/grid_DS4-6.pdf}
    \caption{Same as \cref{fig:phylo_grid1_3}, with columns showing DS4, DS5, and DS6 from left to right.}
    \label{fig:phylo_grid4_6}
\end{figure}

\end{document}