\documentclass[accepted]{uai2023}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib}
    \bibliographystyle{abbrvnat}
\usepackage{booktabs}
\usepackage{tikz}

\usepackage{mathtools}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{amsfonts}

\usepackage{graphicx}
\usepackage{bm}
\usepackage{comment}
\usepackage{enumitem}

\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{sidecap}

\usepackage{mhchem} % Chem equation

\newcommand*\mycommand[1]{\texttt{\emph{#1}}}
\def\tcb{\textcolor{blue}}
\def\tcr{\textcolor{red}}
\def\tcg{\textcolor{green}}
\def\tcm{\textcolor{magenta}}

\def\tcrr{\textcolor{brown}}

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

\newcommand\soo[1]{\textcolor{orange}{#1}}
\newcommand\joonseok[1]{\textcolor{blue}{#1}}
\newcommand\seunghoon[1]{\textcolor{pink}{#1}}
\newcommand{\sw}[1]{\textcolor{green} {#1}}
\newcommand\hongkee[1]{\textcolor{cyan}{#1}}
\newcommand\jinhwan[1]{\textcolor{purple}{#1}}


\title{Towards Physically Reliable Molecular Representation Learning (Supplementary Materials)}

\author[1]{Seunghoon Yi}
\author[2]{Youngwoo Cho}
\author[1]{Jinhwan Sul}
\author[1]{Seung Woo Ko}
\author[3]{\\Soo Kyung Kim}
\author[2]{Jaegul Choo}
\author[2]{Hongkee Yoon$^*$}
\author[1,4]{\href{mailto:<joonseok@snu.ac.kr>?Subject=Your UAI 2023 paper}{Joonseok Lee\thanks{Corresponding authors}}{}}
% Add affiliations after the authors
\affil[1]{%
    %Graduate School of Data Science\\
    Seoul National University\\
    Seoul, Korea
}
\affil[2]{%
    %Graduate School of AI\\
    Korea Advanced Institute of Science and Technology\\
    Daejeon, Korea
}
\affil[3]{%
    Palo Alto Research Center\\
    Stanford Research Institute\\
    Palo Alto, CA, USA
  }
\affil[4]{%
    Google Research\\
    Mountain View, CA, USA
  }


\begin{document}
\maketitle

\appendix

\pagenumbering{roman}
\renewcommand\thetable{\Roman{table}}
\renewcommand\thefigure{\Roman{figure}}
\setcounter{table}{0}
\setcounter{figure}{0}

%\section*{Appendix}

%------------------------------------
\section{Implementation Details}
\label{sec:exp:impl}

We try $L \in \{4, 6, 8\}$ to stack Molecule Attention Blocks after the embedding layer. We set the embedding size $d = 256$, which is the same as $\text{(number of heads)} \times n_b$.
Here, $n_b$ is the same as the dimension of the query, key, and value in the attention block. For activation, we use the LeakyReLU~\citep{nair_rectified_2010,sun_deeply_2014} function after $f_\text{mol}$ and ELU~\citep{clevert2015fast} after $f_\text{bond}$.
To enforce the positive base and exponents in the parameterized LJP and to avoid numerical errors, we add $1+\epsilon$ to $\beta_3$ and $\beta_4$, where $\epsilon$ is set to be $10^{-3}$.
We set the cutoff threshold $\tau = 5$\AA, and the number of RBFs $n_b = 16$. We use a single linear layer for $f_\text{atom}$ and $f_\text{bond}$, and a two-layer MLP for the MAM task. Specifically, the MLP outputs the estimated likelihood score for 64 atoms for each masked input token.
For the overall objective function, we choose weights as $\lambda_\text{force} = 0.3$,  $\lambda_\text{mask} = 0.7$, and $\lambda_\text{bound} = 1$.
The $\beta_{z_i,k}$ and $\mu_{z_i,k}$ are initialized to $(2n_b^{-1}(1-\exp(-\tau)))^{-2}$ and uniformly within $[0, 1]$, respectively.

For training, we use a learning rate of $5\times 10^{-4}$ with Adam optimizer~\citep{kingma2014adam}.
We warm up for 10 epochs, linearly increasing the learning rate, and then decay the learning rate with a ratio of 0.6 and a patience of 24.
The minimum learning rate is set to $10^{-7}$.
We train the model for up to 900 epochs.

For the transfer learning experiment on Transition1x, we pretrain a model with $L = 6$ on the QM9 dataset. The cutoff threshold is set to $\tau = 7.5$\AA, while the other hyperparameters are set the same as above.


%------------------------------------
\section{Additional Ablation Study}
\label{sec:exp:ablation_appendix}

We conduct an additional ablation study with a varied number of layers.
Tab.~\ref{tab:SSL_ablation} shows that the $\mathbf{A}$-mask we introduce in Fig. 1
%~\ref{main:fig:Main_model}
indeed helps in most cases. Also, we observe that using more MABs up to 8 tends to improve the overall performance.

\begin{table}[t]
    \centering
    {\scriptsize
    \begin{tabular}{l|rr|rr|rr}
        \toprule
        Layers & \multicolumn{2}{c|}{4 (Base)} & \multicolumn{2}{c|}{6 (Large)} & \multicolumn{2}{c}{8 (Huge)} \\ \midrule
        Method & {MAE$_\text{E}$} & MAE$_\text{F}$ & {MAE$_\text{E}$} & MAE$_\text{F}$ & {MAE$_\text{E}$} & MAE$_\text{F}$ \\ \midrule
        Base & 11.86 & 0.91 & 11.83 & 0.77 & 11.33 & 0.72 \\ 
        + \texttt{[CLS]} & {11.70} & 0.78 & {9.03} & 0.90 & {9.70} & 0.78 \\ 
        + $\mathbf{A}$-mask & {9.89} & 0.98 & {9.55} & 1.33 & {9.33} & 0.88 \\ 
        + MAM & {10.77} & 1.43 & {9.38} & 1.27 & {8.35} & 1.28 \\ \bottomrule
    \end{tabular}}
    \caption{Ablation study on SSL methods with different numbers of layers}
    \label{tab:SSL_ablation}
\end{table}


We also search the mask ratio of our MAM task in Tab.~\ref{tab:ablation_mask_ratio}. We observe that using a mask ratio of 0.3 is clearly better than others in terms of both energy prediction and a reasonable PES.

\begin{table}[h]
    \centering
    {\scriptsize
    \begin{tabular}{c|ccc}
        \toprule
        Masking ratio & MAE$_\text{E}$ & MAE$_\text{F}$ & $\Delta P$\\
        \midrule
        0.1 & 16.18 & 0.0056 & 0.028 \\
        0.15 & 15.82 & 0.0060 & 0.028 \\
        0.2 & 16.77 & 0.0057 & 0.029 \\
        0.3 & \textbf{15.16} & \textbf{0.0050} & \textbf{0.025} \\
        0.5 & 17.73 & 0.0066 & 0.032 \\
        \bottomrule
    \end{tabular}}
    \caption{Ablation study on masking ratio}
    \label{tab:ablation_mask_ratio}
\end{table}

\begin{figure}[h]
	\centering
	\includegraphics[width=0.95\linewidth]{Fig_MAM_ratio.png}
	\caption{Additional structural optimization results by different MAM masking ratios.}
	\label{fig:append}
\end{figure}

\iffalse
\section{Qualitative Analysis}
%\label{sec:exp:mam_analysis}

\textbf{Physics-driven Modeling and Regularization.}
We design our model to predict the parameters of a physics-inspired equation (Sec. 3.4).
%~\ref{sec:method:physics}
In Eq. (5),
%~(\ref{eq:ljp}), 
if both $\beta_2$ and $\beta_3$ are a finite number greater than 0, this implies that the equation is fitted to the distance. In particular, having $\beta_3 \approx 6$ indicates that it has similar behavior to the LJ potential.
Since we have two freedoms of Coulomb's terms and LJP-like terms, there is no reason to converge to a single $\beta_3$; based on training, $\beta_3$ seen to be distributed between 4 and 16, which is close to the 6 of LJ potential.
\fi

%------------------------------------
\section{Additional Examples}

\textbf{Reaction barrier estimation.} 
We evaluate the entire Transition1x reaction barrier estimation task by calculating the reaction barriers and comparing them with the ground truth across 225 reaction paths.
Our method shows reasonable results on 212 of them, with a mean absolute error (MAE) of less than 0.2 eV.
These results are presented in Fig.~\ref{fig:tr1xmore}.

\begin{figure}
	\centering
	\includegraphics[width=0.9\linewidth]{Fig_Tr1x_more.png}
	\caption{
		Estimated reaction barrier along the reaction pathways of the Transition1x dataset. The ground truth barriers are on the $x$-axis, and those estimated by our model are on the $y$-axis, in eV scale.
  }
	\label{fig:tr1xmore}
\end{figure}



\label{sec:exp:examples}
\textbf{Structure optimization.}
We report additional structural optimization results of random molecules in the QM9 dataset in Fig.~\ref{fig:example_structure_opt}. We observe that our model and TorchMDNet (ET) mostly preserve the optimal structure, while other baselines significantly destroy structures.
In addition, we present relaxation results from 102 molecules in Figs.~\ref{fig:example1}--\ref{fig:example9}. We list results from other baselines and the GT structure (Ref.). Blank entries indicate failed results.


\begin{figure*}
	\centering
	\includegraphics[width=0.98\linewidth]{Appendix_fig1_.pdf}
	\caption{Additional structural optimization results by ours and baselines.}
	\label{fig:example_structure_opt}
\end{figure*}


\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test0.png}
	\includegraphics[width=0.65\linewidth]{test1.png}
	\caption{Additional structural optimization results (1/9)}
	\label{fig:example1}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test2.png}
	\includegraphics[width=0.65\linewidth]{test3.png}
	\caption{Additional structural optimization results (2/9)}
	\label{fig:example2}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test4.png}
	\includegraphics[width=0.65\linewidth]{test5.png}
	\caption{Additional structural optimization results (3/9)}
	\label{fig:example3}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test6.png}
	\includegraphics[width=0.65\linewidth]{test7.png}
	\caption{Additional structural optimization results (4/9)}
	\label{fig:example4}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test8.png}
	\includegraphics[width=0.65\linewidth]{test9.png}
	\caption{Additional structural optimization results (5/9)}
	\label{fig:example5}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test10.png}
	\includegraphics[width=0.65\linewidth]{test11.png}
	\caption{Additional structural optimization results (6/9)}
	\label{fig:example6}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test12.png}
	\includegraphics[width=0.65\linewidth]{test13.png}
	\caption{Additional structural optimization results (7/9)}
	\label{fig:example7}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.65\linewidth]{test14.png}
	\includegraphics[width=0.65\linewidth]{test15.png}
	\caption{Additional structural optimization results (8/9)}
	\label{fig:example8}
\end{figure*}

\begin{figure*}
	\centering
	\includegraphics[width=0.8\linewidth]{test16.png}
	\caption{Additional structural optimization results (9/9)}
	\label{fig:example9}
\end{figure*}


% References
\bibliography{yi_95}

\end{document}