% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
% \usepackage{xr} 
% \externaldocument{pan_56}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Stochastic Generative Flow Networks \\(Supplementary material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<penny.ling.pan@gmail.com>}{Ling Pan\thanks{Equal contribution.}}}
\author[1,2]{Dinghuai Zhang$^{*}$}
\author[1,2]{Moksh Jain}
\author[3]{Longbo Huang}
\author[1,2,4]{Yoshua Bengio}
% Add affiliations after the authors
\affil[1]{%
    Mila - Qu\'ebec AI Institute
}
\affil[2]{%
    Universit\'e de Montr\'eal
}
\affil[3]{%
    Tsinghua University
}
\affil[4]{%
    CIFAR AI Chair
}
  
\begin{document}
  
\onecolumn %% Remove this if a two-column layout is desired for the supplement
\maketitle


\appendix

\section{Experimental Details}

\subsection{GridWorld} \label{app:grid}
The reward function for GridWorld is defined as in Eq.~\eqref{eq:grid_r} following \citet{bengio2021flow}, where $R_0=0.001$, $R_1=0.5$, and $R_2=2.0$.
\begin{equation}
R(x)=R_0+R_1 \prod_i \mathbb{I}\left(0.25<\left|x_i / H-0.5\right|\right)+R_2 \prod_i \mathbb{I}\left(0.3<\left|x_i / H-0.5\right|<0.4\right)
\label{eq:grid_r}
\end{equation}
The GFlowNet model is a feedforward network that consists of two hidden layers with $256$ hidden units and LeakyReLU activation. States are represented using one-hot embeddings. As for the environment model in Stochastic GFlowNet, it is also a feedforward network consisting of two hidden layers with $256$ hidden units and LeakyReLU activation.
All models are trained for $20000$ iterations, and we use $16$ parallel rollouts in the environment at each iteration (which are then stored in the experience replay buffer). The GFlowNet model is updated based on the rollouts, and we train it with the Adam~\citep{kingma2014adam} optimizer using a learning rate of $0.001$ (the learning rate for $Z$ in TB is $0.1$).
The environment model is trained on data sampled from the experience replay buffer with a batch size of $16$, using the Adam optimizer with a learning rate of $0.0001$.
MCMC and PPO use the same configuration as in \citet{bengio2021flow}.

\subsection{Bit Sequences} \label{app:bit}
We follow the same setup for the bit sequence generation task as in \citet{malkin2022trajectory}. The GFlowNet model is a Transformer~\citep{vaswani2017attention} that consists of $3$ hidden layers with $64$ hidden units and uses $8$ attention heads. 
The exploration strategy is $\epsilon$-greedy with $\epsilon=0.0005$, while the sampling temperature is set to $1$. 
It uses a reward exponent of $3$.
The learning rate for training the GFlowNet model is $5 \times 10^{-3}$, with a batch size of $16$.
As for the environment model in Stochastic GFlowNet, we use a feedforward network consisting of two hidden layers with $2048$ hidden units and ReLU activation, which is trained using the Adam optimizer with a learning rate of $5 \times 10^{-4}$.
It is trained using data sampled from the experience replay buffer with a batch size of $128$.
We train all models for $50000$ iterations, using $16$ parallel rollouts in the environment.
MCMC, A2C, and SAC adopt the same configuration as in \citet{malkin2022trajectory}.

\subsection{TFBind-8} \label{app:tfb}
For the TFBind-8 generation task, we follow the same setup as in \citet{jain2022biological}.
The vocabulary consists of $4$ nucleobases, and the trajectory length is $8$.
% We follow the same setup for the antimicrobial peptide generation task as in \citet{malkin2022trajectory}.
The GFlowNet model is a feedforward network that consists of $2$ hidden layers with $2048$ hidden units and ReLU activation.
The exploration strategy is $\epsilon$-greedy with $\epsilon=0.001$, while the reward exponent is $3$.
The learning rate for training the GFlowNet model is $10^{-4}$, with a batch size of $32$.
As for the environment model, we use a feedforward network consisting of two hidden layers with $2048$ hidden units and ReLU activation, which is trained using the Adam optimizer with a learning rate of $10^{-5}$.
It is trained using data sampled from the experience replay buffer with a batch size of $16$.
We train all models for $5000$ iterations.
MCMC, A2C, and SAC baselines follow the same configuration as in \citet{jain2022biological}.

\iffalse
Comparison results in terms of the top-$100$ reward (median) are demonstrated in Figure~\ref{fig:tfb}, where Stochastic GFlowNet significantly outperforms baselines.
\begin{figure}[!h]
\centering
\includegraphics[width=0.8\linewidth]{./figs/tfbind8_reward_median.pdf}
\caption{Results of the top-$100$ reward (median) in the TF Bind 8 generation task.}
\label{fig:tfb}
\end{figure}
\fi

\subsection{Antimicrobial Peptide Generation} \label{app:amp}
We follow the same setup for the antimicrobial peptide generation task as in \citet{malkin2022trajectory}.
The GFlowNet model is a Transformer~\citep{vaswani2017attention} that consists of $3$ hidden layers with $64$ hidden units and uses $8$ attention heads. 
The exploration strategy is $\epsilon$-greedy with $\epsilon=0.01$, while the sampling temperature is set to $1$. 
It uses a reward exponent of $3$.
The learning rate for training the GFlowNet model is $0.001$, with a batch size of $16$.
As for the environment model, we use a feedforward network consisting of two hidden layers with $128$ hidden units and ReLU activation, which is trained using the Adam optimizer with a learning rate of $0.0005$.
It is trained using data sampled from the experience replay buffer with a batch size of $128$.
We train all models for $20000$ iterations, using $16$ parallel rollouts in the environment.

% \section{Additional Related Work}
% \citet{malkin2022gfnhvi} and \citet{zimmermann2022variational} reveal that GFlowNets implement a generalized version of standard variational inference, with objectives different from the traditional forward and reverse KL divergence, which support learning from offline data. From a similar probabilistic perspective, 

% counterexample of S-GFN may fail

\bibliography{pan_56}

\end{document}
