% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\input{math_commands.tex}
\usepackage{float}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{uai2023-template}

\def\d{\frac{\partial}{\partial \theta}}
\def\N{\mathcal{N}}
\usepackage{xcolor}
\newcommand{\todo}[1]{\textcolor{blue}{#1}}
\usepackage{multirow}
\usepackage{pifont}% http://ctan.org/pkg/pifont
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
\usepackage[capitalize,noabbrev]{cleveref}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% change from star to dagger
    \makeatletter
\def\@fnsymbol#1{\ensuremath{\ifcase#1\or \dagger\or \ddagger\or
   \mathsection\or \mathparagraph\or \|\or **\or \dagger\dagger
   \or \ddagger\ddagger \else\@ctrerr\fi}}
    \makeatother
\title{Molecule Design by Latent Space Energy-Based Modeling and Gradual Distribution Shifting \\(Supplementary Materials)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Deqian Kong\thanks{Equal contribution}}
\author[2]{Bo Pang$^\dagger$}
\author[3]{Tian Han}
\author[1]{Ying Nian Wu}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistics\\
    University of California, Los Angeles
    % Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Salesforce Research
}
\affil[3]{%
    Department of Computer Science\\ 
    Stevens Institute of Technology
  }
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

\section{Details about Model and Learning}
\label{sec:a0}

Our model is of the form $p_\alpha(z)p_\beta(x|z) p_\gamma(y|z)$. The marginal distribution of $(x, y)$ is 
\begin{align*}
    p_\theta(x, y) = \int p_\theta(x, y, z) dz 
    = \int p_\alpha(z) p_\beta(x|z) p_\gamma(y|z) dz. 
\end{align*}
We use $p_\theta(x, y)$ to approximate the data distribution of $(x, y)$. 

For the data distribution of $(x, y)$, $y$ is a deterministic function of $x$. However, a machine learning method usually cannot learn the deterministic function exactly. Instead, we can only learn a probabilistic model $p_\theta(y|x)$. Our model $p_\theta(x, y)$ seeks to approximate the data distribution $p(x, y)$ by maximum likelihood. A learnable and flexible prior model $p_\alpha(z)$ helps to make the approximation more accurate than a fixed prior model such as that in a VAE. 

Let the training data be $\{(x_i, y_i), i = 1, \dots, n\}$. The log-likelihood function is $L( \theta) = \sum_{i=1}^{n} \log p_\theta(x_i, y_i)$. The learning gradient is $L'( \theta) = \sum_{i=1}^{n} \nabla_\theta \log p_\theta(x_i, y_i)$. 
In the following, we provide details for calculating $\nabla_\theta \log p_\theta(x, y)$ for a single generic training example $(x, y)$ (where we drop the subscript $i$ for notational simplicity).
\begin{align*} 
   \nabla_\theta  \log p_\theta(x, y) &= \frac{1}{p_\theta(x, y)} \nabla_\theta p_\theta(x, y)\\
    &= \frac{1}{p_\theta(x, y)} \int \nabla_\theta p_\theta(x, y, z) dz\\
    &= \frac{1}{p_\theta(x, y)} \int p_\theta(x, y, z) \nabla_\theta \log p_\theta(x, y, z) dz\\
    &=  \int \frac{p_\theta(x, y, z)}{p_\theta(x, y)} \nabla_\theta \log p_\theta(x, y, z) dz\\
    &=  \int p_\theta(z \mid x, y) \nabla_\theta \log p_\theta(x, y, z) dz\\
   & = \E_{p_\theta(z|x, y)} \left[ \nabla_\theta \log p_\theta(x, y, z) \right] \\
     &= \E_{p_\theta(z|x, y)} \left[ \nabla_\theta (\log p_\alpha(z) + \log p_\beta(x|z) + \log p_\gamma(y|z)) \right].
\end{align*}

For the prior model, 
\begin{align*}
    \nabla_\alpha \log p_\alpha(z) &= \nabla_\alpha f_\alpha(z) - \nabla_\alpha \log Z(\alpha) \\
    &= \nabla_\alpha f_\alpha(z) -\frac{1}{Z(\alpha)} \nabla_\alpha  Z(\alpha) \\
     &= \nabla_\alpha f_\alpha(z) -\frac{1}{Z(\alpha)}  \int  \nabla_\alpha \exp(f_\alpha(z)) p_0(z) dz \\
      &= \nabla_\alpha f_\alpha(z) -  \int  \nabla_\alpha f_\alpha(z) \frac{1}{Z(\alpha)} \exp(f_\alpha(z)) p_0(z) dz \\
    &=  \nabla_\alpha f_\alpha(z) - \E_{p_\alpha(z)}[ \nabla_\alpha f_\alpha(z)]. 
\end{align*}
Thus the learning gradient for $\alpha$ given an example $(x, y)$ is
\begin{align} 
  \delta_\alpha(x, y) =   \nabla_\alpha \log p_\theta(x, y) = \E_{p_\theta(z|x, y)}[\nabla_\alpha f_\alpha(z)] - \E_{p_\alpha(z)} [\nabla_\alpha f_\alpha(z)]. \label{eq:alpha}
\end{align}
The above equation has an empirical Bayes nature. $p_\theta(z|x, y)$ is based on the empirical observation $(x, y)$, while $p_\alpha$ is the prior model. 
%$\alpha$ is updated based on the difference between $z$ inferred from empirical observation $(x, y)$, and $z$ sampled from the current prior. 
For the generation model,
\begin{align} 
\label{eq:beta}
\begin{split}
  \delta_\beta(x, y) =  \nabla_\beta \log p_\theta(x, y)  = \E_{p_\theta(z|x, y)} [\nabla_\beta \log p_{\beta}(x|z)].
\end{split}
\end{align} 
Similarly, for the regression model, 
\begin{align} 
\label{eq:gamma}
\begin{split}
  \delta_\gamma(x, y) =  \nabla_\gamma \log p_\theta(x, y)  = \E_{p_\theta(z|x, y)} [\nabla_\gamma \log p_{\gamma}(y|z)].
\end{split}
\end{align} 
Estimating the expectations in the above equations requires Monte Carlo sampling of the prior model $p_\alpha(z)$ and the posterior distribution $p_\theta(z|x,y)$. If we can draw fair samples from the two distributions, and use these Monte Carlo samples to approximate the expectations, then the gradient ascent algorithm based on the Monte Carlo samples is the stochastic gradient ascent algorithm or the stochastic approximation algorithm of \citet{robbins1951stochastic}, who established the convergence of such an algorithm to a local maximum of the log-likelihood. 

For MCMC sampling using Langevin dynamics, the finite-step (short-run) Langevin dynamics may introduce bias in Monte Carlo sampling. This bias was analyzed by \citet{pang2020learning}. The resulting algorithm is an approximate maximum likelihood learning algorithm. 

% \section{Baselines}
% We briefly introduce the previous baselines that we have compared in our experiments.
% \paragraph{JT-VAE}\cite{jin2018junction}: JT-VAE is a graph-based molecule generation method that uses junction tree and molecule graph as inputs. It first optimize the latent vector based on the property predictor and then generate molecules using VAE decoder.

% \paragraph{MolDQN}\citep{}:

% \section{Unconditional Generation}
% \label{sec:a2}
% Three types of encoding systems are used to encode molecules in prior work: SMILES~\citep{weininger1988smiles}, SELFIES~\citep{krenn2020self}, and graph. SMILES and SELFIES linearize a molecule graph into a string of characters. Most previous models using SMILES struggle to generate molecules with high validity, which is the percentage of molecules that satisfy the chemical valency rule. Thus graph representations become popular since explicit valency constraints can be imposed. However, perfect validity in this approach does not imply that the model captures the chemical rules since it is achieved with external constraints. Recently, SELFIES is developed where every SELFIES string corresponds to a valid molecule due to the nature of the encoding system. Thus, validity for generated SMILES strings is a good indicator on how well a learned model captures the basic chemical rules implicitly. Besides validity, we also compare models on uniqueness (the percentage of unique molecules in all generated samples) and novelty (the percentage of generated molecules that are not in the training set).  

% Following previous work, we randomly sample $10$k molecules for ZINC and $30$k for MOSES, and compare on the three aforementioned metrics. Generations results on ZINC and MOSES are shown in Table~\ref{tab:gen_on_zinc} and Table~\ref{tab:gen_on_moses} respectively. \texttt{str-smi} denotes string-based SMILES representations and \texttt{str-sfi} denotes string-based SELFIES representations. In Table~\ref{tab:gen_on_zinc}, we show generation results for both SMILES and SELFIES. SMILES does not have a validity constraint during generation. However, our model still achieves $95.5\%$ validity, which outperforms other SMILES-based methods and is also comparable to those with valency check. This result demonstrates that our model can capture those valency rules effectively and implicitly.
% % Equipped with always-valid SELFIES strings, MolEBM achieves $100\%$ validity in both ZINC and MOSES benchmark.
% Samples from our model also achieve perfect uniqueness and novelty.
% % The expressiveness of EBM and the always-valid SELFIES strings have laid the foundation for our next property design tasks.

% \begin{table}[H]
% \small
% % \caption{Unconditional generation on ZINC. $^\star$ denotes valency check. }
% % \label{tab:gen_on_zinc}
% \begin{center}
% \begin{tabular}{lcccc}
%     % \toprule
%     \textbf{Model} & \textbf{Representation} & \textbf{Validity} & \textbf{Novelty} & \textbf{Uniqueness}\\
%     \midrule
%     JT-VAE~\citep{jin2018junction}    & Graph & 1.000$^\star$  & 1.000 & 1.000\\
%     GCPN~\citep{you2018graph}     & Graph & 1.000$^\star$  & 1.000 & 1.000\\
%     GraphNVP~\citep{madhawa2019graphnvp}  & Graph &  0.426 & 1.000 & 0.948 \\
%     GraphAF~\citep{shi2020graphaf}       & Graph & 1.000$^\star$ & 1.000 & 0.991 \\
%     GraphDF~\citep{luo2021graphdf}      & Graph    & {1.000}$^\star$   & 1.000 & 1.000\\
%     \midrule
%     ChemVAE~\citep{gomez2018automatic}     & \texttt{str-smi}    & 0.170 & 0.980 & 0.310\\
%     GrammarVAE~\citep{kusner2017grammar}  & \texttt{str-smi}    & 0.310 & 1.000 & 0.108\\
%     SDVAE~\citep{dai2018syntax}        & \texttt{str-smi}    & 0.435  & - & -\\
%     \textbf{Ours}  & \texttt{str-smi}    & 0.955  & {1.000} & {1.000}\\
%     \textbf{Ours}  & \texttt{str-sfi}    & {1.000}  & {1.000} & {1.000}
% \end{tabular}
% \end{center}
% \caption{\small Unconditional generation on ZINC. $^\star$ denotes valency check. }
% \label{tab:gen_on_zinc}
% \end{table}

% \begin{table}[H]
% \small
% % \caption{Unconditional generation on MOSES. $^\star$ denotes valency check.\dq{results taken from}}
% % \label{tab:gen_on_moses}
% \begin{center}
% \begin{tabular}{lclcc}
% \textbf{Model}                 & \textbf{Representation} & \textbf{Validity} & \textbf{Novelty} & \textbf{Uniqueness} \\ \hline
% JT-VAE~\citep{jin2018junction} & Graph                   & 1.000$^\star$     & 0.914            & 1.000               \\
% GraphAF~\citep{shi2020graphaf} & Graph                   & 1.000$^\star$     & 1.000            & 0.991               \\
% GraphDF~\citep{luo2021graphdf} & Graph                   & {1.000}$^\star$   & 1.000            & 1.000               \\
% LIMO~\citep{eckmann2022limo}                           & \texttt{str-sfi}        & 1.000             & 1.000            & 0.976               \\
% \textbf{Ours}                & \texttt{str-sfi}        & {1.000}           & {1.000}          & {1.000} 
% \end{tabular}
% \end{center}
% \caption{\small Unconditional generation on MOSES. $^\star$ denotes valency check. Results obtained from \citep{polykovskiy2020molecular,eckmann2022limo}.}
% \label{tab:gen_on_moses}
% \end{table}

% Then we randomly sample $10$k molecules from the learned latent space EBM and calculate their logP, QED and SA scores using RDKit. We compare these property densities with molecule property densities in test split.  Figure~\ref{fig:uncond} shows that the  densities of the property score of the learned model match those of the data quite well, implying that our model is able to capture the regularities in the data space. 

% \begin{figure}[H]
%     \centering
%     \includegraphics[width=\textwidth]{figure/uncond.png}
%     \caption{Property score distributions.}
%     \label{fig:uncond}
% \end{figure}


% In our experiments, we use short-run MCMC \citep{nijkamp2019learning} (i.e. the length of Markov chain $K=20$ and step size $s=0.1$) for all experiments.  Figure~\ref{fig:langevin} shows that with the increasing length of Markov chains, the molecules change accordingly, showing that  Markov chain doesn't get stuck in the local mode.
% \begin{figure}[H]
%     \centering
%     \includegraphics[width=\textwidth]{figure/langevin.png}
%     \caption{Sampled molecules with the different length of Markov chain.}
%     \label{fig:langevin}
% \end{figure}

\section{Training Time}
\label{sec:a3}

Training the joint distribution of molecules and their properties takes around $4$ hours with $25$ iterations on a single Nvidia Titan XP GPU with batch size $2048$. For non-biological single-objective property optimization, it takes around $20$ minutes to do $30$ distribution shifting (SGDS) iterations. If we use SGDS without warm start, it takes around half an hour. For biological binding affinity maximization, the optimization time mainly depends on the number of queries to AutoDock-GPU. We do $30$ and $20$ SGDS iterations for the single-objective and multi-objective tasks, respectively, which cost $10$ hours and $8$ hours with warm start, and $14$ hours and $10$ hours without warm start. For biological property optimization tasks, we use two Nvidia Titan XP GPUs, one for running our code and the other for running AutoDock-GPU. \cref{tab:time} compares our generation time with that of previous methods.

\begin{table}[H]
\small
\begin{center}
\begin{tabular}{lcc}
\textbf{Model}                 & Penalized-logP/QED  &Single binding affinity\\ \hline
JT-VAE &24  &$-$  \\
GCPN & 8   &6                \\
MolDQN & 24  &6                 \\
GraphDF & 8   &12               \\
Mars   & 12  &6                   \\
LIMO   & 1  & 1                       \\
SGDS without warm start& $4.5$ & $18$ \\
SGDS with warm start& $4.3$ & $14$ \\
\end{tabular}
\end{center}
\caption{\small Comparison of molecule generation time (in hours). Baseline results are taken from \citet{eckmann2022limo}.}
\label{tab:time}
\end{table}

Even though we use MCMC sampling-based methods, our training speed is affordable compared to existing methods. That is because our designed latent space EBM is low-dimensional (i.e., $\dim(z) = 100$) and we use short-run MCMC (i.e., with a fixed number of $20$ steps) in our experiments. The major bottleneck of the training speed is the time spent querying the property computation engines.

% \section{Additional Experiments on GuacaMol Benchmarks}

% % We further evaluate our SGDS on several distribution learning and goal-directed optimization tasks in GuacaMol benchmark~\citep{guacamol}. 

% % To be specific, for distribution learning, we use the validity, uniqueness and novelty to evaluate our model. The results are shown in Table~\ref{tab:gua_dl}. Comparing to existing methods, our methods trained using SELFIES representations achieves highest scores among all three tasks.

% We evaluate multiple property optimization tasks (MPO) in GuacaMol, which are to maximize a weighted sum of several molecule properties using a single scoring function. Those tasks are defined differently from the previous settings in multi-objective binding affinity optimization. To make fair comparisons, we use the single score function as other baselines. \cref{fig:mpo} shows our competitive results on MPO benchmarks. We believe our method can be further improved by using multiple property regression models. We leave it to future investigation. 

% \begin{figure}[H]
%     \centering
%     \includegraphics[width=.4\textwidth]{figure/dot.pdf}
%     \caption{Multiple property optimization (MPO). Baseline results taken from~\citep{brown2019guacamol}.}
%     \label{fig:mpo}
% \end{figure} 

% \begin{table}[H]
% \small
% % \caption{Unconditional generation on MOSES. $^\star$ denotes valency check.\dq{results taken from}}
% % \label{tab:gen_on_moses}
% \begin{center}
% \begin{tabular}{lcccccc}
% \text{Benchmark}                 & \text{AAE} & \text{Graph MCTS} & \text{Random Sampler} & \text{SMILES LSTM} & \text{VAE} & \textbf{Ours}\\ \hline
% Validity & 0.822                   &\bf 1.000 &\bf 1.000     & \bf 1.000    & 0.959   &\bf 1.000            \\
% Uniqueness & \bf 1.000                   &\bf 1.000 & 0.997     & \bf 1.000    & 0.999   &\bf 1.000            \\
% Novelty & 0.998                   &0.994 & 0.000     & 0.912    & 0.971   &\bf 0.999 
% \end{tabular}
% \end{center}
% \caption{\small Distribution learning results on GuacaMol benchmarks~\citep{guacamol}.}
% \label{tab:gua_dl}
% \end{table}

% \subsection{Goal-directed Benchmarks}
% \begin{table}[H]
% \small
% % \caption{Unconditional generation on MOSES. $^\star$ denotes valency check.\dq{results taken from}}
% % \label{tab:gen_on_moses}
% \begin{center}
% \begin{tabular}{lcccccc}
% \text{Benchmark}                 & \text{Best of Dataset} & \text{SMILES GA} & \text{Graph MCTS} & \text{Graph GA} & \text{SMILES LSTM} & \textbf{MolEBM}\\ \hline
% Osimertinib MPO & 0.839                   &0.886 & 0.784     &\bf 0.953    & 0.907   &\textit{0.933}          \\
% Fexofenadine MPO & 0.817                  &0.931 & 0.695     &\bf 0.998    & 0.959   &\textit{0.971}           \\
% Ranolazine MPO & 0.792                   &0.881 & 0.616     &\textit{ 0.920 }   & 0.855   & \bf 0.924 \\
% % Perindopril MPO & 0.579                   &0.661 & 0.385     & 0.792    & 0.808   &  \\
% % Amlodipine MPO & 0.696                   &0.722 & 0.533     &\bf 0.894    &\bf 0.894   & \textit{0.748} \\
% Sitagliptin MPO & 0.509                   &0.689 & 0.458     & \bf 0.891    & 0.545   & \textit{0.829}
% \end{tabular}
% \end{center}
% \caption{\small Goal-directed optimization results on GuacaMol benchmarks~\citep{guacamol}. Top-2 results are highlighted as \textbf{bold} and \textit{italic} respectively.}
% \label{tab:gen_on_gua}
% \end{table}





% https://www.overleaf.com/project/632a984ed3f53196b24a582c
% \newpage

\section{Generated Samples}
\subsection{Biological Property Optimization}
Figure~\ref{fig:ba0} and Figure~\ref{fig:ba1} show generated molecules with high binding affinities towards ESR1 and ACAA1 respectively in single-objective property design experiments. 

Figure~\ref{fig:mba0} and Figure~\ref{fig:mba1} show generated molecules with high binding affinities towards ESR1 and ACAA1 respectively in multi-objective property design.

Compared to previous state-of-the-art methods, SGDS is able to produce many more high-quality molecules than just the top-3 molecules, because after gradual distribution shifting the joint distribution is located in the region supported by molecules with high binding affinities.

%In single-objective design, we find that few generated molecules may be of less practical use due to undesired properties. This observation is in accordance with \citep{eckmann2022limo}, which is the case when the single-objective optimization is not sufficient. Thus we need multi-objective binding affinities design settings because in contrast to non-biological properties, binding affinities are hard to compute and optimize. 

%In multi-objective design settings, we find that those issues mentioned above can be partially addressed by optimizing binding affinities, QED and SA at the same time.

Meanwhile, compared to previous generative model-based methods, we use Langevin dynamics to infer the posterior distribution $p(z|x,y_1,\dots, y_n)$ without having to design different encoders for different combinations of properties.

\begin{figure}[H]
    \centering
    \includegraphics[width=.7\textwidth]{figure/esr1.pdf}
    \caption{\small Generated molecules in {single-objective} esr1 binding affinity maximization experiments with corresponding $\mathrm{K_D}(\downarrow)$ in nmol/L.}
    \label{fig:ba0}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=.8\textwidth]{figure/acaa1.pdf}
    \caption{\small Generated molecules in {single-objective} acaa1 binding affinity maximization experiments with corresponding $\mathrm{K_D}(\downarrow)$ in nmol/L.}
    \label{fig:ba1}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=.75\textwidth]{figure/multiesr1.pdf}
    \caption{\small Generated molecules in {multi-objective} esr1 binding affinity maximization experiments with corresponding $\mathrm{K_D}(\downarrow)$ in nmol/L, SA$(\downarrow)$ and QED$(\uparrow)$ respectively.}
    \label{fig:mba0}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=.75\textwidth]{figure/multiacaa1.pdf}
    \caption{\small Generated molecules in {multi-objective} acaa1 binding affinity maximization experiments with corresponding $\mathrm{K_D}(\downarrow)$ in nmol/L, SA$(\downarrow)$ and QED$(\uparrow)$ respectively.}
    \label{fig:mba1}
\end{figure}

\subsection{P-logP and QED Optimization}
\begin{figure}[H]
    \centering
    \includegraphics[width=.4\textwidth]{figure/top_qed.png}
    \caption{\small Top-3 molecules in single-objective QED maximization.}
    \label{fig:qed_single}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.6\textwidth]{figure/top_plogp.png}
    \caption{\small Top-3 molecules in single-objective p-logP maximization.}
    \label{fig:plogp_single}
\end{figure}

\newpage
\section{Illustration of Sampling with Gradual Distribution Shifting (SGDS)}
\label{sec:a1}

\cref{fig:sgds,fig:sgds_esr,fig:sgds_acaa} show property densities of sampled molecules of the distribution shifting process in single-objective penalized logP, esr1 and acaa1 optimization respectively. SGDS is implemented with warm start. We can see the model distribution is gradually shifting towards the region supported by molecules with high property values. To better visualize the shifting process, we plot the docking scores rather than $\mathrm{K_D}$. The increase in docking scores corresponds to the exponential decrease in $\mathrm{K_D}$.
\begin{figure}[H]
    \centering
    \includegraphics[width=0.4\textwidth]{figure/plogp2.pdf}
    \caption{Illustration of SGDS in a single-objective penalized logP optimization experiment.}
    \label{fig:sgds}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.4\textwidth]{figure/sgds_esr.pdf}
    \caption{Illustration of SGDS in a single-objective esr1 optimization experiment.}
    \label{fig:sgds_esr}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.4\textwidth]{figure/sgds_acaa.pdf}
    \caption{Illustration of SGDS in a single-objective acaa1 optimization experiment.}
    \label{fig:sgds_acaa}
\end{figure}

% \section{Reproducibility}

% Our code and saved checkpoints can be found 
% \href{https://drive.google.com/drive/folders/1UQcXrLWo20wuBocCIEIq7RRm1p2-bb8H?usp=sharing}{here}~~\footnote{\href{https://drive.google.com/drive/folders/1UQcXrLWo20wuBocCIEIq7RRm1p2-bb8H?usp=sharing}{https://drive.google.com/drive/folders/1UQcXrLWo20wuBocCIEIq7RRm1p2-bb8H?usp=sharing}}.

\bibliography{molecule_design}

\end{document}
