% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} 
%% In your camera-ready you should use the 'accepted' parameter. This shows the authors and how an accepted paper will look like. The footer is 'Acccepted for X'. In the final version, the proceedings chairs will add the page numbers for PMLR and the final footer will be 'Proceedings of X'.
%
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}


\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams


% Candidate comment style for algorithm2e: black typewriter text at \footnotesize.
% Not in effect while the \SetCommentSty line below stays commented out.
\newcommand\mycommfont[1]{\footnotesize\ttfamily\textcolor{black}{#1}}
% \SetCommentSty{mycommfont}

% Theorem-like environments: definitions are numbered within each section;
% remarks and propositions each use their own document-wide counter.
\newtheorem{definition}{Definition}[section]
\newtheorem{remark}{Remark}
\newtheorem{proposition}{Proposition}

\usepackage{amssymb}
\usepackage{physics,amsmath}
\usepackage[algoruled, lined, ruled, resetcount,linesnumbered]{algorithm2e}
\usepackage{multicol}
\usepackage{etoolbox}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{xspace}
\usepackage{placeins}

\usepackage{cleveref}

% ---- Shorthand macros --------------------------------------------------
% All text-producing macros below end in \xspace, so a space is inserted
% after them in running text unless punctuation follows.

% \eat{...}: swallows its argument and typesets nothing (used to disable text).
\newcommand{\eat}[1]{}
% Expands to the word ``corruptions''.
\newcommand{\corruptions}{corruptions\xspace}
% Model names: typewriter base name, a math-mode minus sign, and a variant
% suffix; the ...s forms append a plural ``s''.
% Per the results-table caption, the ``a'' variant is the model trained on
% adversarially generated training data and the ``r'' variant is trained via
% joint maximization of standard and robust likelihoods (stated there for SPNs;
% presumably the CN variants are analogous -- confirm).
\newcommand{\spna}{\texttt{SPN}$-$\texttt{a}\xspace}
\newcommand{\spnas}{\texttt{SPN}$-$\texttt{a}s\xspace}
\newcommand{\cna}{\texttt{CN}$-$\texttt{a}\xspace}
\newcommand{\cnas}{\texttt{CN}$-$\texttt{a}s\xspace}
\newcommand{\spnr}{\texttt{SPN}$-$\texttt{r}\xspace}
\newcommand{\spnrs}{\texttt{SPN}$-$\texttt{r}s\xspace}
\newcommand{\cnr}{\texttt{CN}$-$\texttt{r}\xspace}
\newcommand{\cnrs}{\texttt{CN}$-$\texttt{r}s\xspace}
% Base model names (singular and plural).
\newcommand{\spn}{\texttt{SPN}\xspace}
\newcommand{\cn}{\texttt{CN}\xspace}
\newcommand{\spns}{\texttt{SPN}s\xspace}
\newcommand{\cns}{\texttt{CN}s\xspace}
% Test-set symbols; each macro enters math mode itself, so use them in text mode.
% Per the results-table caption: original test data, adversarially perturbed
% test data, and randomly perturbed test data, respectively.
\newcommand{\test}{$\mathcal{T}$\xspace}
\newcommand{\testa}{$\mathcal{T}_a$\xspace}
\newcommand{\testr}{$\mathcal{T}_r$\xspace}

% Make plain \cite behave as natbib's parenthetical \citep.
% The alias deliberately declares no arguments: \citep itself then parses
% whatever follows, so optional pre-/post-notes (\cite[see][p.~3]{key}) and
% the starred form keep working, which a fixed one-argument wrapper would break.
\renewcommand{\cite}{\citep}

% Upright ``arg max'' / ``arg min'' operators. The starred declaration places
% sub-/superscripts as limits below/above the operator in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

% etoolbox hook: every tabular starts by switching to \smaller (described in the
% ``Provided macros'' note below as the original LaTeX \footnotesize).
\AtBeginEnvironment{tabular}{\smaller}

% Added for camera ready version
\usepackage{xcolor}
% Author annotation: \rohith{text} typesets its argument in red via \textcolor.
\newcommand*{\rohith}{\textcolor{red}}


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} typesets ``b sep a'': the two mandatory arguments in reverse
% order, joined by the optional separator (default ``-'').
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Robust Learning of Tractable Probabilistic Models}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
% 
% Important: in case of equal contributions, we strongly recommend to NOT show it in this part of the paper, but rather describe it in the appropriate section at the end of the paper "Author Contribution", where you have more space to describe how each author contributed.
%
% Add authors
% Remember to use the order convention "First/Given name" "Last/Family name", e.g. John Smith, Hanako Yamada, Marco Rossi, Wei Zhang

\author[1]{\href{mailto:<rohith.peddi@utdallas.edu>?Subject=Your UAI 2022 paper}{Rohith Peddi}{}}
\author[1]{Tahrima Rahman}
\author[1]{Vibhav Gogate}
% Add affiliations after the authors
\affil[1]{%
    The University of Texas at Dallas
}

\begin{document}


\maketitle

\begin{abstract}
Tractable probabilistic models (TPMs) compactly represent a joint probability distribution over a large number of random variables and admit polynomial time computation of (1) exact likelihoods; (2) marginal probability distributions over a small subset of variables given evidence; and (3) in some cases most probable explanations over all non-observed variables given observations. In this paper, we leverage these tractability properties to solve the  \textit{robust} maximum likelihood parameter estimation task in TPMs under the assumption that a TPM structure and complete training data is provided as input. Specifically, we show that TPMs learned by optimizing the likelihood perform poorly when data is subject to adversarial attacks/noise/perturbations/corruption and we can address this issue by optimizing robust likelihood. To this end, we develop an efficient approach for constructing uncertainty sets that model data corruption in TPMs and derive an efficient gradient-based local search method for learning TPMs that are robust against these uncertainty sets. We empirically demonstrate the efficacy of our proposed approach on a collection of benchmark datasets.
\end{abstract}

\section{Introduction}\label{sec:intro}

The last decade has witnessed rapid advances in deep generative models that effectively capture probability distributions over high dimensional data such as Autoregressive models (ARNs) \cite{LarochelleandMurray2011}, Normalizing flows \cite{papamakariosetal2019}, Variational Autoencoders (VAEs) \cite{KnigmaandWelling2014}, Diffusion based models \cite{Dicksteinetal2015}, and Generative Adversarial Networks (GANs). Despite their striking success in learning representations over high dimensional data, these models are severely limited in their inference capabilities, and can only answer very few inference queries in polynomial time.

Simultaneously, the field of tractable probabilistic models (TPMs), which encompasses probabilistic models that guarantee efficient computation of probabilistic inference queries, has witnessed significant traction. A unified framework called Probabilistic Circuits (PCs), which includes tractable models such as Sum-Product Networks (SPNs) \cite{PoonAndDomingos2011}, Arithmetic Circuits (ACs), Cutset Networks (CNets) \cite{Rahmanetal2014}, and Probabilistic Sentential Decision Diagrams (PSDDs) \cite{Kisaetal2014}, has been developed. With memory-efficient computation variants of probabilistic circuits such as Einsum Networks \cite{Peharzetal2020}, the expressivity of these models has significantly increased.

Although the robustness of probabilistic models has been assessed in the context of deep generative models, it has never been evaluated in the context of tractable probabilistic models. Therefore, in this paper, we analyze the robustness of tractable models in a generative setting through the lens of robust optimization.\footnote{Code: \url{https://github.com/utd-star-ai-ml/ro_tpm_uai_2022}} Tractable models learn to approximate the data generating distribution via maximum likelihood estimation of the model's parameters. Maximum likelihood estimation demands that the data be free from \corruptions. However, in the real world, data is subject to \corruptions from a wide variety of sources such as measurement errors, adversaries, and noise. The goal of this paper is to learn tractable models that are immunized against these \corruptions.

Robust Optimization (RO) \cite{Ben-Taletal2009} is a learning paradigm that captures data uncertainty without using probability distributions. The problems considered here are max-min variants of learning problems formulated using stochastic optimization. These max-min formulations have roots in Game theory and can be perceived as a game between an adversary who affects the available data by inducing corruption and an optimizer who reacts to this worst-case selection of the data. In this approach, we assume the presence of point-wise adversaries whose corruptions can be confined in deterministic uncertainty sets and 
estimate the best solution for the worst-case realization of the data. Thus RO is more conservative than stochastic optimization and establishes a sense of \textit{being on the safe side.}

%Robust optimization is more conservative than its stochastic optimization variant and establishes a sense of \textit{being on the safe side.}

In general, robust optimization variants of tractable stochastic optimization problems may not be tractable. The choice of uncertainty sets (1) plays a vital role in determining the tractability of the problem, (2) provides the designer with the flexibility to choose a trade-off between robustness and performance, and (3) determines the similarity of the solutions obtained for a stochastic optimization problem and its robust optimization variant. Any prior knowledge about the stochastic nature of the uncertainty in data can help choose the uncertainty sets. We note that robust optimization aims to estimate fixed solutions that ensure feasibility independent of data corruptions and is different from sensitivity analysis which is typically used as a post-optimization tool. 


\paragraph{Contributions.}
In this paper, we propose two approaches for efficiently learning tractable models immunized against measurement errors, adversarial perturbations and noise.
\begin{itemize}
    \item In the first approach, we formulate the learning objective using the robust optimization framework. Here, we maximize the likelihood of data subject to all \corruptions belonging to a constrained uncertainty set. We propose an iterative algorithm that estimates parameters of the robust model by maximizing the worst-case likelihood of the perturbed data generated by an adversary.
    \item In our second approach, we propose a regularizer to the maximum likelihood estimation problem that adds a \textit{nearest neighbor bias} to the learning algorithm. We see this as an effective amalgamation of two orthogonal views of capturing the training distribution and \textit{staying on the safe side} by optimizing for worst-case perturbation of the training distribution.
    \item Empirically, we evaluate the proposed approaches on twenty benchmark datasets for the density estimation task on tractable models without latent variables (Cutset Networks) and tractable models with latent variables (SPNs). Our results clearly demonstrate the striking vulnerability of maximum likelihood estimation to \corruptions. They also show that our proposed approach yields TPMs that have significantly higher test set log-likelihood scores on corrupted data than TPMs learned by maximizing likelihood. %We notice the striking vulnerability of the maximum likelihood estimation to \corruptions and advocate  adoption of models learned using our approach, which shows significant improvement on the standard models.
\end{itemize}

To the best of our knowledge, this is the first work on learning robust tractable models.\footnote{\citet{mauaetal2018} proposed Credal SPNs as an attempt to robustify SPNs, where they allow the parameters of the sum-product network to vary in a closed convex set. However, these models are not scalable and, unlike SPNs, do not admit efficient computation of likelihoods and marginal probability distributions given observations.}

\section{Notation \& Background}
We denote a dataset using the upper case letter $X$ and individual samples using a small case letter $x$.  A dataset $X$ is an ensemble of individual data samples $x_{i},i=1,\ldots,n$, i.e., $X = [x_{1}, x_{2}, \ldots, x_{n}]^{T}$. For simplicity of exposition, we focus on binary datasets, where each sample $x_i$ is a $d$-dimensional $0/1$ vector, namely $x_{i} \in \{0, 1\}^{d}$. 

We denote \corruptions of individual samples $x_{i}$ using $\Delta x_{i}$ where $\Delta x_{i}$ is a $d$-dimensional $0/1$ vector (or mask), $1$ indicates that the particular dimension is corrupted and $0$ indicates that it is not.  Given a dataset $X$, we denote an ensemble of \corruptions (or masks) by $\Delta X$, where each $x_i \in X$ is associated with a mask $\Delta x_i \in \Delta X$, namely $\Delta X = [\Delta x_{1}, \Delta x_{2}, \ldots, \Delta x_{n}]^{T}$. We denote an \textsc{xor} operation over two binary vectors using $\oplus$. We denote the corrupted dataset by $X\oplus\Delta X=[x_1\oplus \Delta x_1,\ldots,x_n \oplus \Delta x_n]^T$. We denote the probability mass function parameterized by $\theta$ at $x$ using $f(\theta; x)$ and the log-likelihood for the dataset $X$ by $LL(\theta; X)$.

\subsection{Generative Tractable Models}

Generative tractable probabilistic models (TPMs) such as thin junction trees \cite{BachandJordon2001}, bounded-treewidth Bayesian networks \cite{ElidanandGould2008}, arithmetic circuits \cite{Shenetal2016}, cutset networks \cite{Rahmanetal2014}, mixtures of cutset networks \cite{RahmanAndGogate2016}, and sum-product networks \cite{PoonAndDomingos2011} compactly represent large multi-dimensional probability distributions while ensuring that several inference and estimation tasks can be solved in time and space that scales polynomially (and often linearly) with the size of the model. TPMs may either have latent variables or they may not. Latent variables typically improve the goodness-of-fit of the models as measured by test-set log-likelihood scores while sacrificing tractability for some inference tasks such as most probable explanation.  In general, inference tasks such as computing the log-likelihood, estimating marginal distribution over a subset of variables given evidence are tractable on the aforementioned TPMs while the most probable explanation task is polynomial only on TPMs having no latent variables.

In a standard setting, for generative parameter learning of tractable probabilistic models (TPMs) we seek to estimate parameters $\theta$ that maximize the log-likelihood function.
\begin{equation} \label{eq: mle}
    \max_{\theta} \sum_{i=1}^{n} \log \left(f\left(\theta ; x_{i} \right)\right)
\end{equation}
In subsequent sections, we focus on two types of tractable probabilistic models, one having latent variables and the second having no latent variables. We chose cutset networks (CNs) \cite{Rahmanetal2014} as our choice for tractable models without latent variables and SPNs \cite{PoonAndDomingos2011} for tractable models with latent variables, but our results can be easily applied to other tractable models. In CNs, the log-likelihood function is concave and the maximum-likelihood estimate can be computed in closed form. On SPNs, the log-likelihood function is not concave and one has to use iterative algorithms such as gradient ascent and soft/hard expectation-maximization (EM) or their stochastic versions to find parameters that correspond to a local maxima of the log-likelihood function.

We leverage the fact that in tractable models such as CNs and SPNs, given a dataset $X$ or a corrupted dataset $X \oplus \Delta X$, each parameter $\theta_i \in \theta$ can be expressed as a conditional probability and the \textit{gradient of the log-likelihood w.r.t. $\theta_i$ can be computed in polynomial time} \citep[cf.][]{Peharzetal2017,Darwiche2009}.

\subsection{Robust Maximum Likelihood Estimators}

We assume the presence of \corruptions $\Delta x_{i}$ which differentiate the true unobserved samples $x_{i}^{\text{true}}$ from the observed samples $x_{i}^{\text{obs}}$ and motivate learning through the lens of the robust optimization paradigm. Specifically, we operate under the assumption that $x_{i}^{\text{obs}} = x_{i}^{\text{true}} \oplus \Delta x_{i}$, i.e., the observed samples $x_{i}^{\text{obs}}$ are masked variants of the true samples $x_{i}^{\text{true}}$, and seek to estimate parameters $\theta$ that maximize the probability mass of the true samples
\begin{equation*}
    \prod_{i=1}^{n} f\left(\theta ; x_{i}^{\text{true}}\right) \equiv \prod_{i=1}^{n} f\left(\theta ; x_{i}^{\text{obs}} \oplus \Delta x_{i}\right)
\end{equation*}
or equivalently maximize the log-likelihood function
\begin{equation*}
LL\left(\theta ; X^{\text{obs}} \oplus \Delta X \right) \equiv \log \left(\prod_{i=1}^{n} f\left(\theta ; x_{i}^{\text{obs}} \oplus \Delta x_{i}\right)\right)
\end{equation*}
\citet{BertsimasAndNohdani2019} have shown that, depending on the modelling choice for the \corruptions $\Delta x_{i}$ and the knowledge available about them, we obtain two types of estimators.

\begin{itemize}
    \item \textbf{Adversarially Robust Estimators} (AREs) are obtained when we consider the \corruptions reside in a deterministic uncertainty set and no further knowledge about the \corruptions is available.
    \item \textbf{Distributionally Robust Estimators} are obtained when \corruptions can be considered as random variables with known support.
\end{itemize}

\section{Approach}

\subsection{Uncertainty Sets}

At a high level, an uncertainty set defines a boundary or region (of assignments) that is close to each observed data point $x_i^{\text{obs}}$ such that the true data point $x_i^{\text{true}}$ can be any one of the assignments in this region.
We assume no prior knowledge about the \corruptions and model them to reside in a deterministic uncertainty set. Specifically, we model \corruptions $\Delta x_{i}$ to reside in an uncertainty set constrained on $L_{1}$ or, equivalently, Hamming distance (since we assume binary data) and express the uncertainty set, denoted by $\mathcal{U}_h$, as
\begin{align*}
    \mathcal{U}_h = \{\Delta X &= [\Delta x_{1}, \ldots, \Delta x_{n}]^{T} \mid \norm{\Delta x_{i}}_{1} \leq h, \\ i &= 1, \ldots, n; \text{ $h$: Hamming distance threshold}\}
\end{align*}

We define the strength of an adversary based on the choice of uncertainty set used to corrupt the data, i.e., an adversary which can produce \corruptions from an uncertainty set defined by $h =5$ is stronger in capacity than an adversary which can produce \corruptions from an uncertainty set defined by $h =3$. 

\subsection{Adversarially robust estimators}

Roughly speaking, we define \textit{robust log-likelihood} as the log-likelihood score of the model under the worst-case realization of the data. In a robust setting, we seek to estimate parameters $\theta$ using robust maximum likelihood estimators which assume the presence of \corruptions $\Delta X$ in the data and maximize the likelihood of the true samples $X \oplus \Delta X$. In the real world, we are oblivious to these \corruptions and assume their presence in the uncertainty set $\mathcal{U}_h$. Therefore, we seek to estimate $\theta$ that maximizes the log-likelihood against the worst-case realization of the data obtained when perturbed with \corruptions $\Delta X$ in $\mathcal{U}_h$.

Formally, the robust parameter estimation task is given by
\begin{equation} \label{eq: ARE}
    \max_{\theta} \min_{\Delta X \in \mathcal{U}_h} \sum_{i=1}^{n} \log \left(f\left(\theta ; x_{i}^{\text{obs}} \oplus \Delta x_{i}\right)\right)
\end{equation}

In the above robust optimization problem (Eq.~\eqref{eq: ARE}), the size of the uncertainty set, which in turn depends on $h$ and $d$, determines our desire to stay on the safe side. As we increase the size of $\mathcal{U}_h$, we expect a drop in the log-likelihood score; however, we immunize
our model against all corruptions from this enlarged set. We note that we solve the original maximum likelihood estimation problem in Eq.~\eqref{eq: mle} when $h=0$.

Although the max-min problem given in Eq.~\eqref{eq: ARE} is significantly harder in general than the traditional maximum-likelihood estimation task, it turns out that the objective (given by the inner minimization) remains concave for cutset networks having no latent variables.\footnote{Note that we are performing robust parameter estimation and assume that the structure of the tractable model is provided as input to our algorithm.} This follows from the fact that the log-likelihood function is concave in $\theta$ for every fixed $\Delta X$, and the pointwise minimum of a family of concave functions is also concave. Formally,
\begin{proposition}
In CNs (having no latent variables), the optimization problem given in Eq. \eqref{eq: ARE} is concave.\footnote{Note that although the objective is concave, it is not smooth and therefore we have to use a sub-gradient method.}
\end{proposition}
Thus, in CNs, since the gradient of the log-likelihood w.r.t. the parameters $\theta$ can be computed in linear time in the size of the data, the robust parameter estimation task can be solved efficiently using a sub-gradient method if the inner minimization task can be solved (optimally and) efficiently. The latter is possible when $h$ is bounded by a constant.

Unfortunately, since the log-likelihood function for SPNs is non-concave, the objective remains non-concave. For such problems, \citet{Danskin1966} has shown that if the inner minimization problem can be solved optimally, then there always exists a directional derivative that can be used to update the parameters and reach a local optimum.

Formally, we can show that:

\begin{proposition} \label{prop:2}\cite{Danskin1966}
Let \[\Delta X^*(\theta) = \argmin_{\Delta X \in \mathcal{U}_h} \sum_{i=1}^{n} \log \left(f\left(\theta ; x_{i}^{\text{obs}} \oplus \Delta x_{i}\right)\right)\] then \begin{align}\nonumber\left .\nabla_{\theta} \min_{\Delta X \in \mathcal{U}_h} \sum_{i=1}^{n} \log \left(f\left(\theta ; x_{i}^{\text{obs}} \oplus \Delta x_{i}\right)\right )\right |_{\theta=\theta_t}=\\ \nonumber \left . \nabla_{\theta} \sum_{i=1}^{n} \log \left(f\left(\theta ; x_{i}^{\text{obs}} \oplus \Delta x_{i}^*(\theta_t)\right)\right )\right|_{\theta=\theta_t}
\end{align}
\end{proposition}

In other words, if we can find a solution $\Delta X^*(\theta_t)$ to the inner minimization problem, then the gradient of the objective at $\theta=\theta_t$ equals the gradient of the log-likelihood of the dataset $X \oplus \Delta X^*(\theta_t)$. In SPNs, as we mentioned earlier, this gradient can be computed efficiently in time that scales linearly with the size of the model \cite{Peharzetal2017}.

The above discussion yields Algorithm~\ref{alg:algorithm}, where we iteratively solve the inner minimization problem to estimate \corruptions $\Delta X$ from $\mathcal{U}_h$ and use the obtained \corruptions to perturb the dataset, which is then used to update the parameters $\theta$ of the model.

\begin{algorithm}
\caption{Robust Maximum Likelihood Estimation}\label{alg:algorithm}
\KwIn{ Binary dataset $X$, a tractable model structure having parameters $\theta$ and Hamming distance threshold $h \in \mathbb{Z}_{\geq 0}$}
\KwOut{An assignment to  $\theta$}
\SetKwBlock{Beginn}{beginn}{ende}
\Begin {
    Randomly initialize all $\theta_i \in \theta$\\ 
    \Repeat{convergence}{
        \tcp{Solve Inner Minimization}
        Find new set of corruptions $\Delta X$ from uncertainty set constrained by $h$ using the current parameters $\theta$\\ 
        \tcp{Outer Maximization}
        \For{k steps}{
            Use one step of stochastic gradient ascent or EM  to update parameters $\theta$ using  $X \oplus \Delta X$ (see Proposition \ref{prop:2})
        }
    }
    \Return $\theta$
}
\end{algorithm}



\paragraph{Practical considerations.}  
Efficient estimation of adversarially robust estimators (AREs) for tractable models with latent variables depends on the efficiency and practicality of the algorithm used in finding the solution for the inner minimization problem. When exhaustive search over the space of all $\binom{d}{h}$ possible corruptions is employed in finding the optimum for the inner minimization problem, we incur a computational cost of $\mathcal{O}(d^h \times S)$\footnote{\label{foot:static}For tractable representations that use a static ordering of variables such as algebraic decision diagrams (ADDs) and ordered binary decision diagrams (OBDDs) we can find the optimum for the inner minimization problem in time that scales polynomially with $h$, $d$ and $S$. But for dynamically ordered tractable representations such as SPNs and CNs, the time complexity of solving the inner minimization problem is exponential in $h$.} (where $S$ is the size of the model). Thus, in theory, when $h$ is bounded by a constant, the optimum can be computed in polynomial time. However, exhaustive search is not practically feasible for large models (e.g., when $d>100$ and $h>3$). Therefore, we use a greedy local search algorithm \eat{(see alg. \ref{alg:greedylocalsearch})} having time complexity $\mathcal{O}(d\times h \times S)$ to search for a neighbor having the smallest log-likelihood. Since the gradient can be computed in time that scales linearly with the size of the model, when local search is employed, the overall time complexity of each iteration is reduced from $\mathcal{O}(d^h\times S \times k)$ to $\mathcal{O}(d\times h \times S \times k)$.
%\rohith{END}

Using Danskin's theorem (see Proposition~\ref{prop:2}), it is straightforward to show that, when the inner minimization problem is solved optimally, Algorithm~\ref{alg:algorithm} converges to a local optimum of Eq.~\eqref{eq: ARE} for SPNs and CNs. In CNs, this local optimum is also a global optimum.

\eat{
\begin{algorithm}
\caption{Greedy Local Search}\label{alg:greedylocalsearch}
\KwIn{ A $d-$dimensional sample $x$, a tractable model structure having parameters $\theta$ and hamming distance threshold $h \in \mathcal{Z}$}
\KwOut{$\Delta x = argmin_{\Delta x \in \mathcal{U}_{h}} log (f(\theta; x \oplus \Delta x))$}
\SetKwBlock{Beginn}{beginn}{ende}
\Begin {
    $x' = x$\\ 
    \For{$i=1$ to $h$}{
        $best = log f(\theta; x')$\\
        $x\_{new} = x'$\\
        \For{$j=1$ to $d$}{
            $x"$ = Flip the value in the $j^{th}$ position of $x'$\\
            \If{$log f(\theta; x") < best$}{
                $best = log f(\theta; x")$\\
                $x\_{new}=x"$\\
            }
        }
        $x' = x\_{new}$ \\
    }
    \Return $\Delta x = x \oplus x'$ \\
}
\end{algorithm}
}

\subsection{Regularized Maximum Likelihood Estimators}
In a robust setting, as we increase the size of the uncertainty set (see Eq. \eqref{eq: ARE}), we immunize against \corruptions from a larger set and achieve better robust likelihood scores. However, these models perform poorly on the original training and test sets. To address this issue, we propose an alternative approach where we jointly optimize for both standard and robust likelihoods, weighing the latter using a regularization constant (hyperparameter) $\lambda \geq 0$. 
\begin{align}\label{eq:RMLE}
    \nonumber
    \max_{\theta}\Bigg[ \overbrace{\Big[ \sum_{i=1}^{n} \log f\left(\theta ; x_{i}^{\text{obs}}\right) \Big]}^{\text{Standard Likelihood}} +
    \\ \lambda \times \underbrace{\Big[ \min_{\Delta X \in \mathcal{U}_h} \sum_{i=1}^{n} \log \left(f\left(\theta ; x_{i}^{\text{obs}} \oplus \Delta x_{i}\right)\right) \Big]}_{\text{Robust Likelihood}} \Bigg]
\end{align}
We can use the same algorithm (see Alg.~\ref{alg:algorithm}) to estimate the parameters $\theta$ with a minor change in the outer maximization step where, instead of the corrupted dataset $X \oplus \Delta X$, we use the augmented dataset $[X, X \oplus \Delta X]$. Roughly speaking, the optimization problem in Eq.~\eqref{eq:RMLE} is equivalent to applying a nearest neighbor regularizer to the original maximum likelihood estimation problem in Eq.~\eqref{eq: mle}. Our proposed approach is closely related to the work of \citet{Xuetal2009}, who showed that robust linear regression under an $L_{\infty}$ ball is equivalent to Lasso regression.



\section{Experiments}

% SPN LOG-LIKELIHOOD TABLE
\begin{table*}[t]
\begin{center} 
\caption{\label{tab:spn-ll-paper} Generative performance: Test set log-likelihood scores of models having latent variables. $h\in\{1,3,5\}$: Hamming distance thresholds. \spn: SPN trained on the original training data, \spna: SPN trained on the adversarially generated training data by \spn, \spnr: SPN trained via joint maximization of standard and robust likelihoods. \test: original test data, \testa: adversarially perturbed \test by \spn, \testr: randomly perturbed \test by \spn.}
\begin{tabular}{ |cc|ccc|ccc|ccc| }
\hline 
\multicolumn{1}{|c|}{ \multirow{2}{*}{DATASET} } & \multicolumn{1}{|c|}{ \multirow{2}{*}{$h$} } & \multicolumn{3}{c|}{\test} & \multicolumn{3}{c|}{\testa} & \multicolumn{3}{c|}{\testr} \\
\cline{3-11} 
 & \multicolumn{1}{|c|}{}  & \multicolumn{1}{c|}{\spn}  & \multicolumn{1}{c|}{\spna} & \multicolumn{1}{c|}{\spnr}  & \multicolumn{1}{c|}{\spn}  & \multicolumn{1}{c|}{\spna} & \multicolumn{1}{c|}{\spnr} & \multicolumn{1}{c|}{\spn}  & \multicolumn{1}{c|}{\spna} & \multicolumn{1}{c|}{\spnr} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{Plants}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -13.56}} &  \multicolumn{1}{c|}{ -14.18}  &  \multicolumn{1}{c|}{ -13.81}  &  \multicolumn{1}{c|}{ -22.38}  &  \multicolumn{1}{c|}{ \textbf{ -18.0}}  &  \multicolumn{1}{c|}{ -18.31}  &  \multicolumn{1}{c|}{ -22.06}  &  \multicolumn{1}{c|}{ \textbf{ -17.98}}  &  \multicolumn{1}{c|}{ -18.17} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -16.08}  &  \multicolumn{1}{c|}{ -14.61}  &  \multicolumn{1}{c|}{ -39.17}  &  \multicolumn{1}{c|}{ \textbf{-23.89}}  &  \multicolumn{1}{c|}{ -24.26}  &  \multicolumn{1}{c|}{ -30.9}  &  \multicolumn{1}{c|}{ \textbf{-22.91}}  &  \multicolumn{1}{c|}{ -23.19} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -17.88}  &  \multicolumn{1}{c|}{ -14.67}  &  \multicolumn{1}{c|}{ -54.85}  &  \multicolumn{1}{c|}{ \textbf{-28.2}}  &  \multicolumn{1}{c|}{ -29.69}  &  \multicolumn{1}{c|}{ -38.33}  &  \multicolumn{1}{c|}{ \textbf{-26.6}}  &  \multicolumn{1}{c|}{ -27.25} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ \textbf{-13.56}}  &  \multicolumn{1}{c|}{ -16.05}  &  \multicolumn{1}{c|}{ -14.36}  &  \multicolumn{1}{c|}{ -38.8}  &  \multicolumn{1}{c|}{ \textbf{-23.36}}  &  \multicolumn{1}{c|}{ -24.09}  &  \multicolumn{1}{c|}{ -30.43}  &  \multicolumn{1}{c|}{ \textbf{-22.5}}  &  \multicolumn{1}{c|}{ -22.87} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{Netflix}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -56.84}} &  \multicolumn{1}{c|}{ -57.62}  &  \multicolumn{1}{c|}{ -57.17}  &  \multicolumn{1}{c|}{ -61.0}  &  \multicolumn{1}{c|}{ \textbf{ -59.58}}  &  \multicolumn{1}{c|}{ -59.92}  &  \multicolumn{1}{c|}{ -60.18}  &  \multicolumn{1}{c|}{ \textbf{ -59.53}}  &  \multicolumn{1}{c|}{ -59.56} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -59.43}  &  \multicolumn{1}{c|}{ -57.72}  &  \multicolumn{1}{c|}{ -67.14}  &  \multicolumn{1}{c|}{ \textbf{-65.01}}  &  \multicolumn{1}{c|}{ -65.11}  &  \multicolumn{1}{c|}{ -61.4}  &  \multicolumn{1}{c|}{ -62.11}  &  \multicolumn{1}{c|}{ -61.45} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -60.88}  &  \multicolumn{1}{c|}{ -58.17}  &  \multicolumn{1}{c|}{ -72.06}  &  \multicolumn{1}{c|}{ -69.49}  &  \multicolumn{1}{c|}{ \textbf{-67.69}}  &  \multicolumn{1}{c|}{ -62.82}  &  \multicolumn{1}{c|}{ -64.24}  &  \multicolumn{1}{c|}{ \textbf{-62.13}} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ \textbf{-56.84}}  &  \multicolumn{1}{c|}{ -59.31}  &  \multicolumn{1}{c|}{ -57.69}  &  \multicolumn{1}{c|}{ -66.73}  &  \multicolumn{1}{c|}{ -64.69}  &  \multicolumn{1}{c|}{ \textbf{-64.24}}  &  \multicolumn{1}{c|}{ -61.47}  &  \multicolumn{1}{c|}{ -61.96}  &  \multicolumn{1}{c|}{ \textbf{-61.05}} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{DNA}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -97.36}} &  \multicolumn{1}{c|}{ -97.55}  &  \multicolumn{1}{c|}{ -97.69}  &  \multicolumn{1}{c|}{ -101.94}  &  \multicolumn{1}{c|}{ \textbf{ -99.47}}  &  \multicolumn{1}{c|}{ -99.91}  &  \multicolumn{1}{c|}{ -101.18}  &  \multicolumn{1}{c|}{ \textbf{ -99.35}}  &  \multicolumn{1}{c|}{ -99.73} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -97.73}  &  \multicolumn{1}{c|}{ -97.67}  &  \multicolumn{1}{c|}{ -107.32}  &  \multicolumn{1}{c|}{ \textbf{-102.07}}  &  \multicolumn{1}{c|}{ -103.07}  &  \multicolumn{1}{c|}{ -102.61}  &  \multicolumn{1}{c|}{ \textbf{-100.45}}  &  \multicolumn{1}{c|}{ -100.88} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -98.16}  &  \multicolumn{1}{c|}{ -97.6}  &  \multicolumn{1}{c|}{ -111.38}  &  \multicolumn{1}{c|}{ \textbf{-104.77}}  &  \multicolumn{1}{c|}{ -105.8}  &  \multicolumn{1}{c|}{ -104.06}  &  \multicolumn{1}{c|}{ \textbf{-101.83}}  &  \multicolumn{1}{c|}{ -101.97} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ \textbf{-97.36}}  &  \multicolumn{1}{c|}{ -97.81}  &  \multicolumn{1}{c|}{ -97.65}  &  \multicolumn{1}{c|}{ -106.88}  &  \multicolumn{1}{c|}{ \textbf{-102.1}}  &  \multicolumn{1}{c|}{ -102.93}  &  \multicolumn{1}{c|}{ -102.62}  &  \multicolumn{1}{c|}{ \textbf{-100.54}}  &  \multicolumn{1}{c|}{ -100.86} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{Movie}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -53.37}} &  \multicolumn{1}{c|}{ -54.21}  &  \multicolumn{1}{c|}{ -54.16}  &  \multicolumn{1}{c|}{ -80.05}  &  \multicolumn{1}{c|}{ \textbf{ -67.03}}  &  \multicolumn{1}{c|}{ -71.03}  &  \multicolumn{1}{c|}{ -80.02}  &  \multicolumn{1}{c|}{ \textbf{ -67.25}}  &  \multicolumn{1}{c|}{ -71.03} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -56.94}  &  \multicolumn{1}{c|}{ -55.35}  &  \multicolumn{1}{c|}{ -132.0}  &  \multicolumn{1}{c|}{ \textbf{-85.68}}  &  \multicolumn{1}{c|}{ -96.0}  &  \multicolumn{1}{c|}{ -104.96}  &  \multicolumn{1}{c|}{ \textbf{-81.66}}  &  \multicolumn{1}{c|}{ -89.86} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -59.57}  &  \multicolumn{1}{c|}{ -55.64}  &  \multicolumn{1}{c|}{ -182.1}  &  \multicolumn{1}{c|}{ \textbf{-100.87}}  &  \multicolumn{1}{c|}{ -122.6}  &  \multicolumn{1}{c|}{ -123.16}  &  \multicolumn{1}{c|}{ \textbf{-94.2}}  &  \multicolumn{1}{c|}{ -107.59} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ \textbf{-53.37}}  &  \multicolumn{1}{c|}{ -56.91}  &  \multicolumn{1}{c|}{ -55.05}  &  \multicolumn{1}{c|}{ -131.38}  &  \multicolumn{1}{c|}{ \textbf{-84.53}}  &  \multicolumn{1}{c|}{ -96.54}  &  \multicolumn{1}{c|}{ -102.71}  &  \multicolumn{1}{c|}{ \textbf{-81.04}}  &  \multicolumn{1}{c|}{ -89.49} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{BBC}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -260.03}} &  \multicolumn{1}{c|}{ \textbf{ -256.59}}  &  \multicolumn{1}{c|}{ -272.71}  &  \multicolumn{1}{c|}{ -272.79}  &  \multicolumn{1}{c|}{ \textbf{ -263.73}}  &  \multicolumn{1}{c|}{ -284.05}  &  \multicolumn{1}{c|}{ -272.06}  &  \multicolumn{1}{c|}{ \textbf{ -263.32}}  &  \multicolumn{1}{c|}{ -283.35} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ \textbf{-256.49}}  &  \multicolumn{1}{c|}{ -263.04}  &  \multicolumn{1}{c|}{ -297.09}  &  \multicolumn{1}{c|}{ \textbf{-274.57}}  &  \multicolumn{1}{c|}{ -286.38}  &  \multicolumn{1}{c|}{ -280.07}  &  \multicolumn{1}{c|}{ \textbf{-268.65}}  &  \multicolumn{1}{c|}{ -279.18} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ \textbf{-256.41}}  &  \multicolumn{1}{c|}{ -261.46}  &  \multicolumn{1}{c|}{ -320.12}  &  \multicolumn{1}{c|}{ \textbf{-282.87}}  &  \multicolumn{1}{c|}{ -295.06}  &  \multicolumn{1}{c|}{ -287.86}  &  \multicolumn{1}{c|}{ \textbf{-274.43}}  &  \multicolumn{1}{c|}{ -282.58} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ -260.03}  &  \multicolumn{1}{c|}{ \textbf{-256.5}}  &  \multicolumn{1}{c|}{ -265.74}  &  \multicolumn{1}{c|}{ -296.67}  &  \multicolumn{1}{c|}{ \textbf{-273.72}}  &  \multicolumn{1}{c|}{ -288.5}  &  \multicolumn{1}{c|}{ -280.0}  &  \multicolumn{1}{c|}{ \textbf{-268.8}}  &  \multicolumn{1}{c|}{ -281.7} \\
\hline 
\end{tabular}
\end{center} 
\end{table*}

%  CN LOG-LIKELIHOOD TABLE
\begin{table*}[]
\begin{center}
\caption{\label{tab:cn-ll}
Generative performance: Test set log-likelihood scores of cutset networks or models without latent variables. $h\in\{1,3,5\}$: Hamming distance thresholds. \cn : Cutset networks trained on original training data, \cna: CNs learned from adversarially generated training data by \cns, \cnr: trained via joint maximization of standard and robust likelihoods. \test: original test data, \testa: adversarially perturbed \test by \cn, \testr: randomly perturbed \test by \cn.}

\begin{tabular}{|cc|ccc|ccc|ccc|}
\hline
\multicolumn{1}{|c|}{\multirow{2}{*}{Dataset}} & \multirow{2}{*}{$h$} & \multicolumn{3}{c|}{\test}                                                                                    & \multicolumn{3}{c|}{\testa}                                                                                          & \multicolumn{3}{c|}{\testr}                                                                                          \\ \cline{3-11} 
\multicolumn{1}{|c|}{}                         &                             & \multicolumn{1}{c|}{\cn}                         & \multicolumn{1}{c|}{\cna}              & \cnr              & \multicolumn{1}{c|}{\cn}                & \multicolumn{1}{c|}{\cna}                       & \cnr                      & \multicolumn{1}{c|}{\cn}                & \multicolumn{1}{c|}{\cna}                       & \cnr                       \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{Plants}}  & 1                           & \multicolumn{1}{c|}{\multirow{3}{*}{-13.50}}   & \multicolumn{1}{c|}{-13.61}           & -13.56           & \multicolumn{1}{c|}{-35.16}           & \multicolumn{1}{c|}{\textbf{-29.94}}           & -30.68                    & \multicolumn{1}{c|}{-25.43}           & \multicolumn{1}{c|}{\textbf{-23.43}}           & -23.81                    \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-13.72}           & -13.62           & \multicolumn{1}{c|}{-58.00}           & \multicolumn{1}{c|}{\textbf{-48.74}}           & -49.66                    & \multicolumn{1}{c|}{-38.97}           & \multicolumn{1}{c|}{\textbf{-34.88}}           & -35.27                    \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-13.82}           & -13.63           & \multicolumn{1}{c|}{-72.16}           & \multicolumn{1}{c|}{\textbf{-58.08}}           & -61.65                    & \multicolumn{1}{c|}{-49.94}           & \multicolumn{1}{c|}{\textbf{-42.80}}           & -44.80                    \\ \hline
\multicolumn{2}{|c|}{\text{Avg.}}                                          & \multicolumn{1}{c|}{\text{\textbf{-13.50}}}  & \multicolumn{1}{c|}{\text{-13.72}}  & \text{-13.60}  & \multicolumn{1}{c|}{\text{-55.11}}  & \multicolumn{1}{c|}{\text{\textbf{-45.59}}}  & \text{-47.33}           & \multicolumn{1}{c|}{\text{-38.11}}  & \multicolumn{1}{c|}{\text{\textbf{-33.70}}}  & \text{-34.63}           \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{Netflix}} & 1                           & \multicolumn{1}{c|}{\multirow{3}{*}{-58.71}}   & \multicolumn{1}{c|}{-59.96}           & -58.97           & \multicolumn{1}{c|}{-66.26}           & \multicolumn{1}{c|}{\textbf{-62.77}}           & -63.59                    & \multicolumn{1}{c|}{-62.91}           & \multicolumn{1}{c|}{-62.00}                    & \textbf{-61.92}           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-61.07}           & -59.67           & \multicolumn{1}{c|}{-75.09}           & \multicolumn{1}{c|}{\textbf{-65.83}}           & -67.21                    & \multicolumn{1}{c|}{-66.56}           & \multicolumn{1}{c|}{\textbf{-64.10}}           & -64.12                    \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-62.35}           & -59.91           & \multicolumn{1}{c|}{-81.19}           & \multicolumn{1}{c|}{\textbf{-67.38}}           & -69.92                    & \multicolumn{1}{c|}{-69.04}           & \multicolumn{1}{c|}{\textbf{-65.43}}           & -65.58                    \\ \hline
\multicolumn{2}{|c|}{{Avg.}}                                          & \multicolumn{1}{c|}{{\textbf{-58.71}}}  & \multicolumn{1}{c|}{{-61.13}}  & {-59.52}  & \multicolumn{1}{c|}{{-74.18}}  & \multicolumn{1}{c|}{{\textbf{-65.33}}}  & {-66.91}           & \multicolumn{1}{c|}{{-66.17}}  & \multicolumn{1}{c|}{{\textbf{-63.84}}}  & {-63.87}           \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{DNA}}     & 1                           & \multicolumn{1}{c|}{\multirow{3}{*}{-87.60}}   & \multicolumn{1}{c|}{-87.82}           & -87.70           & \multicolumn{1}{c|}{-95.74}           & \multicolumn{1}{c|}{\textbf{-93.52}}           & -93.88                    & \multicolumn{1}{c|}{-94.37}           & \multicolumn{1}{c|}{\textbf{-93.08}}           & -93.36                    \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-89.74}           & -88.62           & \multicolumn{1}{c|}{-109.12}          & \multicolumn{1}{c|}{\textbf{-99.34}}           & -101.06                   & \multicolumn{1}{c|}{-103.41}          & \multicolumn{1}{c|}{\textbf{-97.78}}           & -98.45                    \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-90.71}           & -89.19           & \multicolumn{1}{c|}{-121.95}          & \multicolumn{1}{c|}{\textbf{-104.54}}          & -107.37                   & \multicolumn{1}{c|}{-110.50}          & \multicolumn{1}{c|}{\textbf{-100.94}}          & -101.89                   \\ \hline
\multicolumn{2}{|c|}{{Avg.}}                                          & \multicolumn{1}{c|}{{\textbf{-87.60}}}  & \multicolumn{1}{c|}{{-89.42}}  & {-88.50}  & \multicolumn{1}{c|}{{-108.94}} & \multicolumn{1}{c|}{{\textbf{-99.13}}}  & {-100.77}          & \multicolumn{1}{c|}{{-102.76}} & \multicolumn{1}{c|}{{\textbf{-97.27}}}  & {-97.90}           \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{Each Movie}}   & 1                           & \multicolumn{1}{c|}{\multirow{3}{*}{-58.20}}   & \multicolumn{1}{c|}{-58.52}           & -58.21           & \multicolumn{1}{c|}{-124.66}          & \multicolumn{1}{c|}{\textbf{-117.42}}          & -119.15                   & \multicolumn{1}{c|}{-86.10}           & \multicolumn{1}{c|}{\textbf{-83.96}}           & -84.53                    \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-58.70}           & -58.37           & \multicolumn{1}{c|}{-184.36}          & \multicolumn{1}{c|}{\textbf{-174.85}}          & -176.03                   & \multicolumn{1}{c|}{-112.96}          & \multicolumn{1}{c|}{\textbf{-109.01}}          & -109.62                   \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-58.76}           & -58.77           & \multicolumn{1}{c|}{-233.61}          & \multicolumn{1}{c|}{-222.43}                   & \textbf{-214.66}          & \multicolumn{1}{c|}{-131.36}          & \multicolumn{1}{c|}{-126.49}                   & \textbf{-125.46}          \\ \hline
\multicolumn{2}{|c|}{{Avg.}}                                          & \multicolumn{1}{c|}{{\textbf{-58.20}}}  & \multicolumn{1}{c|}{{-58.66}}  & {-58.45}  & \multicolumn{1}{c|}{{-180.88}} & \multicolumn{1}{c|}{{-171.57}}          & {\textbf{-169.95}} & \multicolumn{1}{c|}{{-110.14}} & \multicolumn{1}{c|}{{\textbf{-106.49}}} & {-106.54}          \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{BBC}}     & 1                           & \multicolumn{1}{c|}{\multirow{3}{*}{-261.86}}  & \multicolumn{1}{c|}{-261.97}          & -261.89          & \multicolumn{1}{c|}{-271.99}          & \multicolumn{1}{c|}{\textbf{-269.79}}          & -270.12                   & \multicolumn{1}{c|}{-269.98}          & \multicolumn{1}{c|}{\textbf{-268.97}}          & -269.21                   \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-262.61}          & -262.36          & \multicolumn{1}{c|}{-288.77}          & \multicolumn{1}{c|}{\textbf{-278.96}}          & -280.94                   & \multicolumn{1}{c|}{-277.79}          & \multicolumn{1}{c|}{\textbf{-275.59}}          & -275.80                   \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                           & \multicolumn{1}{c|}{}                          & \multicolumn{1}{c|}{-264.97}          & -262.72          & \multicolumn{1}{c|}{-304.09}          & \multicolumn{1}{c|}{\textbf{-285.92}}          & -290.28                   & \multicolumn{1}{c|}{-285.14}          & \multicolumn{1}{c|}{\textbf{-282.64}}          & -282.69                   \\ \hline
\multicolumn{2}{|c|}{{Avg.}}                                          & \multicolumn{1}{c|}{{\textbf{-261.86}}} & \multicolumn{1}{c|}{{-263.18}} & {-262.32} & \multicolumn{1}{c|}{{-288.28}} & \multicolumn{1}{c|}{{\textbf{-278.22}}} & {-280.45}          & \multicolumn{1}{c|}{{-277.64}} & \multicolumn{1}{c|}{{\textbf{-275.73}}} & {-275.90}          \\ \hline
\end{tabular}
\end{center}
\end{table*}


In this section, we evaluate the impact of our proposed parameter estimation method on both the generative and predictive performance of TPMs as well as their robustness to adversarial attacks and random noise. Our evaluation uses two popular classes of TPMs: sum-product networks (SPNs) \citep{PoonAndDomingos2011} and cutset networks (CNs) \citep{Rahmanetal2014}. As mentioned earlier, we chose these two TPMs as representatives for the following two classes of TPMs: (1) TPMs having latent variables (SPNs) on which only marginal inference is tractable and (2) TPMs having no latent variables (CNs) on which both posterior marginal distributions and most probable explanations can be computed in polynomial time.


Given data, we learned both the structures and parameters of cutset networks without any latent variables using the LearnCNet algorithm proposed by \citet{Rahmanetal2014}. For each dataset, we initially learned a large-depth cutset network and then performed bottom-up reduced-error pruning using the validation set to improve its generalization accuracy. Our experiments on SPNs were performed using two open-source implementations: EiNETs \citep{Peharzetal2020} and RAT-SPNs \citep{Peharzetal2019}. For RAT-SPNs, we used the following structural parameters for all datasets: depth $D=3$, number of replicas $R=50$, number of sum nodes $C=10$, number of input distributions $I=10$. EiNETs use stochastic EM for estimation of parameters that maximize the likelihood of the data. We use the default parameters for online EM frequency and online EM step size (as mentioned in the authors' GitHub page\footnote{https://github.com/cambridge-mlg/EinsumNetworks}). RAT-SPNs were trained using the DeeProb-kit\footnote{https://github.com/deeprob-org/deeprob-kit} library, where the parameters are learnt using stochastic gradient descent with a learning rate of $10^{-2}$. In our experiments, we found that the performance of SPNs trained using EiNETs and RAT-SPNs is comparable across all the evaluation criteria, but we noticed that learning and inference are much faster with EiNETs. All our experiments for SPNs and CNs were performed on a machine equipped with an NVIDIA A40 GPU and a 2.4 GHz Xeon 8-core processor.

For each dataset, we learned three types of SPNs and CNs: 1) {\texttt{SPN}} and {\texttt{CN}} learned by maximizing the standard data log-likelihood, 2) \spna and \cna learned by maximizing robust likelihood (see Eq.~\eqref{eq: ARE}) of the training data, and finally 3) \spnr and \cnr obtained by joint maximization of standard and robust likelihoods (see Eq.~\eqref{eq:RMLE}). We performed our experiments using $\lambda = 1$. Note that the structure of all SPNs (and CNs) is learned from the original training data. The three SPNs (and CNs) differ from each other in how the parameters are learned; in other words, the structure is constant across all models. We experimented with three values, \{$1$, $3$, $5$\}, for the Hamming distance threshold $h$. Models of types (2) and (3) were learnt on uncertainty sets $\mathcal{U}_h$ of varying size based on these Hamming distance thresholds. These sets govern the size of allowable \corruptions in the data.

We evaluated our method on 20 benchmark datasets that have been used in several experimental evaluations of TPMs \citep{LowdAndDavis2010}. For each dataset and each $h$, we generated two additional test sets. The first test set, which we call the fully adversarial test set and denote by $\mathcal{T}_a$, was generated from $\mathcal{T}$ as follows. We begin with an empty $\mathcal{T}_a$. Then, for each test example in $\mathcal{T}$, we use \textit{greedy local search} to find a neighbor of the example that is at most $h$ Hamming distance away and has the smallest log-likelihood score w.r.t.\ either the {\texttt{SPN}} or {\texttt{CN}}, and add it to $\mathcal{T}_a$. The second test set, which we call the randomly perturbed test set and denote by $\mathcal{T}_r$, was generated from $\mathcal{T}$ as follows. We begin with an empty $\mathcal{T}_r$. Then, for each test example in $\mathcal{T}$, we select a neighbor from $100$ \textit{randomly generated neighbors} such that each neighbor is at most $h$ Hamming distance away from the example and the selected neighbor has the smallest log-likelihood score w.r.t.\ either the {\texttt{SPN}} or {\texttt{CN}}, and add it to $\mathcal{T}_r$.

We evaluate both the generative and predictive performances of all three types of models under various corruption scenarios. To the best of our knowledge, this is the first empirical study on the robustness of expressive TPMs.

\subsection{Robust Generative Performance}

To evaluate the generative performance and robustness of the learned models, we compare their log-likelihood scores on the three test sets described above (\test, \testr, \testa) for $h\in\{1,3,5\}$. Scores on the set \test indicate the model's \textit{goodness-of-fit} to the underlying data generating distribution, and larger scores imply a better fit. On the other hand, scores on the sets \testa and \testr are representative of a model's robustness to adversarial and random perturbations. Higher scores imply that the model is resilient to small perturbations to the samples in \test. Tables~\ref{tab:spn-ll-paper} and~\ref{tab:cn-ll} report the average log-likelihood scores of SPNs and CNs, respectively, obtained on \test, \testa and \testr. For ease of readability, we only report results on five datasets with increasing dimensionality. A comprehensive set of results is provided in the supplement.

We observe that although \spns and \cns have slightly higher scores on \test as compared to their robust counterparts (\{\spna, \spnr\} and \{\cna, \cnr\}, respectively), they have significantly lower scores on the corrupted sets \testa and \testr. Both SPNs and CNs trained using our proposed approaches consistently exhibit superior robust test-set log-likelihood scores as compared with standard SPNs and CNs.

\textbf{Impact of increasing $h$:} We observe that as we increase $h$, the performance of both \spna and \spnr degrades on the original test set \test, but the performance of \spnr degrades at a slower rate than \spna. In particular, there is an order of magnitude difference in the likelihood scores of \spna and \spnr for $h=5$. For cutset networks, we see the same picture; as we increase $h$, the performance of \cnr degrades at a slower rate than \cna on \test. 

Comparing between SPNs and CNs, we see that as we increase $h$, the performance of adversarial and regularized CNs  degrades at a much slower rate on \test as compared with SPNs. This slow (and more graceful) degradation is likely due to the fact that CNs are more biased and have fewer parameters than SPNs; as a result CNs are less sensitive to changes in the training data. 

On the adversarial and random test sets, namely on \testa and \testr respectively, we observe that increasing $h$ significantly degrades the performance of \spns and \cns which are trained on the original training set. For instance, there are several orders of magnitude difference between the log-likelihood scores on \testa (and \testr) for $h=5$ and $h=1$. On the other hand, as compared with \spns (and \cns), the rate of decrease in log-likelihoods (as we increase $h$) is much smaller for \spna and \spnr (\cna and \cnr). 

%\rohith{START}
\textbf{Choice of $h$:}  We motivate our choice of uncertainty sets $h \in \{1,3,5\}$ from two viewpoints: an experimental view and an observational view. In our experiments, we noticed that for the density estimation task, a competent adversary can easily find samples in uncertainty sets $h\in \{1,3,5\}$ which can bring down the log-likelihood scores by 2--3 fold, and for the image completion task, an adversary can easily find samples in $h=5$ which can completely change the output of the completed image (e.g., changing from 4 to a 9 or 3 to an 8 as shown in Figure~\ref{fig:qualitative}). For uncertainty sets $h \ge 5$, we observe that samples obtained may no longer be part of the true underlying distribution (i.e., the samples are out-of-distribution). For example, on the MNIST dataset, the difference between (0, 8), (3, 8), (4, 9), (2, 3) etc.\ is $\le 3$ pixels. Similarly, the benchmark datasets used in the density estimation task are curated from user click stream, page visits and preferences data; here, samples from $h \ge 5$ can completely alter the estimated distribution.
%\rohith{END}

% SPN - CONDITIONAL LOG-LIKELIHOOD
\begin{table*}[t]
\begin{center} 
\caption{\label{tab:spn-cll-0.5} Predictive performance: Conditional log-likelihood scores given 50\% evidence for models having latent variables (SPNs). $h\in\{1,3,5\}$: Hamming distance thresholds. \spn : SPN trained on the original training data, \spna: SPN trained on the adversarially generated training data by \spn, \spnr: SPN trained via joint maximization of standard and robust likelihoods. \test: original test data, \testa: adversarially perturbed \test by \spn, \testr: randomly perturbed \test by \spn.}
\begin{tabular}{ |cc|ccc|ccc|ccc| }
\hline 
\multicolumn{1}{|c|}{ \multirow{2}{*}{DATASET} } & \multicolumn{1}{|c|}{ \multirow{2}{*}{$h$} } & \multicolumn{3}{c|}{\test} & \multicolumn{3}{c|}{\testa} & \multicolumn{3}{c|}{\testr} \\
\cline{3-11} 
 & \multicolumn{1}{|c|}{}  & \multicolumn{1}{c|}{\spn}  & \multicolumn{1}{c|}{\spna} & \multicolumn{1}{c|}{\spnr}  & \multicolumn{1}{c|}{\spn}  & \multicolumn{1}{c|}{\spna} & \multicolumn{1}{c|}{\spnr} & \multicolumn{1}{c|}{\spn}  & \multicolumn{1}{c|}{\spna} & \multicolumn{1}{c|}{\spnr} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{Plants}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -5.64}} &  \multicolumn{1}{c|}{ -5.94}  &  \multicolumn{1}{c|}{ -5.73}  &  \multicolumn{1}{c|}{ -9.59}  &  \multicolumn{1}{c|}{ \textbf{ -7.57}}  &  \multicolumn{1}{c|}{ -7.95}  &  \multicolumn{1}{c|}{ -9.46}  &  \multicolumn{1}{c|}{ -7.8}  &  \multicolumn{1}{c|}{ \textbf{ -7.67}} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -7.07}  &  \multicolumn{1}{c|}{ -6.21}  &  \multicolumn{1}{c|}{ -16.97}  &  \multicolumn{1}{c|}{ \textbf{-10.01}}  &  \multicolumn{1}{c|}{ -10.04}  &  \multicolumn{1}{c|}{ -14.3}  &  \multicolumn{1}{c|}{ \textbf{-10.49}}  &  \multicolumn{1}{c|}{ -10.73} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -7.96}  &  \multicolumn{1}{c|}{ -6.13}  &  \multicolumn{1}{c|}{ -25.6}  &  \multicolumn{1}{c|}{ \textbf{-12.27}}  &  \multicolumn{1}{c|}{ -12.82}  &  \multicolumn{1}{c|}{ -18.96}  &  \multicolumn{1}{c|}{ \textbf{-12.23}}  &  \multicolumn{1}{c|}{ -12.72} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ \textbf{-5.64}}  &  \multicolumn{1}{c|}{ -6.99}  &  \multicolumn{1}{c|}{ -6.02}  &  \multicolumn{1}{c|}{ -17.39}  &  \multicolumn{1}{c|}{ \textbf{-9.95}}  &  \multicolumn{1}{c|}{ -10.27}  &  \multicolumn{1}{c|}{ -14.24}  &  \multicolumn{1}{c|}{ \textbf{-10.17}}  &  \multicolumn{1}{c|}{ -10.37} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{Netflix}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -28.02}} &  \multicolumn{1}{c|}{ -28.52}  &  \multicolumn{1}{c|}{ -28.17}  &  \multicolumn{1}{c|}{ -29.94}  &  \multicolumn{1}{c|}{ \textbf{ -29.05}}  &  \multicolumn{1}{c|}{ -29.07}  &  \multicolumn{1}{c|}{ -29.7}  &  \multicolumn{1}{c|}{ -29.28}  &  \multicolumn{1}{c|}{ \textbf{ -29.1}} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -29.48}  &  \multicolumn{1}{c|}{ -28.51}  &  \multicolumn{1}{c|}{ -32.57}  &  \multicolumn{1}{c|}{ -31.26}  &  \multicolumn{1}{c|}{ \textbf{-30.86}}  &  \multicolumn{1}{c|}{ -30.37}  &  \multicolumn{1}{c|}{ -30.54}  &  \multicolumn{1}{c|}{ \textbf{-29.87}} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -30.53}  &  \multicolumn{1}{c|}{ -28.85}  &  \multicolumn{1}{c|}{ -34.92}  &  \multicolumn{1}{c|}{ -33.06}  &  \multicolumn{1}{c|}{ \textbf{-32.25}}  &  \multicolumn{1}{c|}{ -30.88}  &  \multicolumn{1}{c|}{ -31.79}  &  \multicolumn{1}{c|}{ \textbf{-30.64}} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ \textbf{-28.02}}  &  \multicolumn{1}{c|}{ -29.51}  &  \multicolumn{1}{c|}{ -28.51}  &  \multicolumn{1}{c|}{ -32.48}  &  \multicolumn{1}{c|}{ -31.12}  &  \multicolumn{1}{c|}{ \textbf{-30.73}}  &  \multicolumn{1}{c|}{ -30.32}  &  \multicolumn{1}{c|}{ -30.54}  &  \multicolumn{1}{c|}{ \textbf{-29.87}} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{DNA}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -49.58}} &  \multicolumn{1}{c|}{ \textbf{ -49.52}}  &  \multicolumn{1}{c|}{ -49.7}  &  \multicolumn{1}{c|}{ -52.15}  &  \multicolumn{1}{c|}{ \textbf{ -50.31}}  &  \multicolumn{1}{c|}{ -50.66}  &  \multicolumn{1}{c|}{ -51.81}  &  \multicolumn{1}{c|}{ \textbf{ -50.33}}  &  \multicolumn{1}{c|}{ -50.65} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ \textbf{-49.46}}  &  \multicolumn{1}{c|}{ -49.72}  &  \multicolumn{1}{c|}{ -55.4}  &  \multicolumn{1}{c|}{ \textbf{-51.56}}  &  \multicolumn{1}{c|}{ -52.16}  &  \multicolumn{1}{c|}{ -52.45}  &  \multicolumn{1}{c|}{ \textbf{-50.7}}  &  \multicolumn{1}{c|}{ -51.1} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ \textbf{-49.41}}  &  \multicolumn{1}{c|}{ -49.47}  &  \multicolumn{1}{c|}{ -57.85}  &  \multicolumn{1}{c|}{ \textbf{-52.76}}  &  \multicolumn{1}{c|}{ -53.58}  &  \multicolumn{1}{c|}{ -53.13}  &  \multicolumn{1}{c|}{ \textbf{-51.34}}  &  \multicolumn{1}{c|}{ -51.44} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ -49.58}  &  \multicolumn{1}{c|}{ \textbf{-49.46}}  &  \multicolumn{1}{c|}{ -49.63}  &  \multicolumn{1}{c|}{ -55.13}  &  \multicolumn{1}{c|}{ \textbf{-51.54}}  &  \multicolumn{1}{c|}{ -52.13}  &  \multicolumn{1}{c|}{ -52.46}  &  \multicolumn{1}{c|}{ \textbf{-50.79}}  &  \multicolumn{1}{c|}{ -51.06} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{Movie}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -22.35}} &  \multicolumn{1}{c|}{ -22.82}  &  \multicolumn{1}{c|}{ -22.63}  &  \multicolumn{1}{c|}{ -25.47}  &  \multicolumn{1}{c|}{ \textbf{ -25.45}}  &  \multicolumn{1}{c|}{ -30.59}  &  \multicolumn{1}{c|}{ -39.62}  &  \multicolumn{1}{c|}{ \textbf{ -31.23}}  &  \multicolumn{1}{c|}{ -35.59} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -24.05}  &  \multicolumn{1}{c|}{ -23.41}  &  \multicolumn{1}{c|}{ -34.9}  &  \multicolumn{1}{c|}{ \textbf{-30.18}}  &  \multicolumn{1}{c|}{ -41.66}  &  \multicolumn{1}{c|}{ -54.06}  &  \multicolumn{1}{c|}{ \textbf{-38.95}}  &  \multicolumn{1}{c|}{ -45.94} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ -25.99}  &  \multicolumn{1}{c|}{ -23.32}  &  \multicolumn{1}{c|}{ -50.75}  &  \multicolumn{1}{c|}{ \textbf{-36.12}}  &  \multicolumn{1}{c|}{ -54.6}  &  \multicolumn{1}{c|}{ -63.07}  &  \multicolumn{1}{c|}{ \textbf{-45.85}}  &  \multicolumn{1}{c|}{ -57.15} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ \textbf{-22.35}}  &  \multicolumn{1}{c|}{ -24.29}  &  \multicolumn{1}{c|}{ -23.12}  &  \multicolumn{1}{c|}{ -37.04}  &  \multicolumn{1}{c|}{ \textbf{-30.58}}  &  \multicolumn{1}{c|}{ -42.28}  &  \multicolumn{1}{c|}{ -52.25}  &  \multicolumn{1}{c|}{ \textbf{-38.68}}  &  \multicolumn{1}{c|}{ -46.23} \\
\hline 
\multicolumn{1}{|c|}{ \multirow{3}{*}{BBC}} & 1 & \multicolumn{1}{|c|}{ \multirow{3}{*}{ -91.2}} &  \multicolumn{1}{c|}{ \textbf{ -88.78}}  &  \multicolumn{1}{c|}{ -99.07}  &  \multicolumn{1}{c|}{ -102.32}  &  \multicolumn{1}{c|}{ \textbf{ -94.95}}  &  \multicolumn{1}{c|}{ -108.79}  &  \multicolumn{1}{c|}{ -101.48}  &  \multicolumn{1}{c|}{ \textbf{ -94.38}}  &  \multicolumn{1}{c|}{ -107.99} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 3  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ \textbf{-88.83}}  &  \multicolumn{1}{c|}{ -92.51}  &  \multicolumn{1}{c|}{ -123.22}  &  \multicolumn{1}{c|}{ \textbf{-104.17}}  &  \multicolumn{1}{c|}{ -112.48}  &  \multicolumn{1}{c|}{ -107.0}  &  \multicolumn{1}{c|}{ \textbf{-97.57}}  &  \multicolumn{1}{c|}{ -104.67} \\
\cline{2-2} \cline{4-11} 
 \multicolumn{1}{|c|}{} & 5  & \multicolumn{1}{c|}{}   &  \multicolumn{1}{c|}{ \textbf{-88.83}}  &  \multicolumn{1}{c|}{ -91.2}  &  \multicolumn{1}{c|}{ -143.8}  &  \multicolumn{1}{c|}{ \textbf{-111.82}}  &  \multicolumn{1}{c|}{ -119.87}  &  \multicolumn{1}{c|}{ -112.08}  &  \multicolumn{1}{c|}{ \textbf{-100.89}}  &  \multicolumn{1}{c|}{ -105.64} \\
\hline 
\multicolumn{2}{|c|}{Avg.}  &  \multicolumn{1}{c|}{ -91.2}  &  \multicolumn{1}{c|}{ \textbf{-88.81}}  &  \multicolumn{1}{c|}{ -94.26}  &  \multicolumn{1}{c|}{ -123.11}  &  \multicolumn{1}{c|}{ \textbf{-103.65}}  &  \multicolumn{1}{c|}{ -113.71}  &  \multicolumn{1}{c|}{ -106.85}  &  \multicolumn{1}{c|}{ \textbf{-97.61}}  &  \multicolumn{1}{c|}{ -106.1} \\
\hline 
\end{tabular}
\end{center} 
\end{table*}

% CN - CONDITIONAL LOG-LIKELIHOOD

\begin{table*}[]
\begin{center}
\caption{\label{tab:cn-cll-0.5}Predictive performance: Conditional log-likelihood scores given 50\% evidence for models having no latent variables (CNs). $h\in\{1,3,5\}$: Hamming distance thresholds. \cn : Cutset networks trained on original training data, \cna: CNs learned from adversarially generated training data by \cns, \cnr: trained via joint maximization of standard and robust likelihoods. \test: original test data, \testa: adversarially perturbed \test by \cn, \testr: randomly perturbed \test by \cn.}
\begin{tabular}{|cc|ccc|ccc|ccc|}
\hline
\multicolumn{1}{|c|}{\multirow{2}{*}{Dataset}} & \multirow{2}{*}{$h$} & \multicolumn{3}{c|}{\test}                                                                 & \multicolumn{3}{c|}{\testa}                                                               & \multicolumn{3}{c|}{\testr}                                                               \\ \cline{3-11} 
\multicolumn{1}{|c|}{}                         &                    & \multicolumn{1}{c|}{\cn}                        & \multicolumn{1}{c|}{\cna}    & \cnr    & \multicolumn{1}{c|}{\cn}       & \multicolumn{1}{c|}{\cna}             & \cnr             & \multicolumn{1}{c|}{\cn}       & \multicolumn{1}{c|}{\cna}             & \cnr             \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{Plants}}  & 1                  & \multicolumn{1}{c|}{\multirow{3}{*}{-9.61}}   & \multicolumn{1}{c|}{-9.68}   & -9.64   & \multicolumn{1}{c|}{-31.26}  & \multicolumn{1}{c|}{\textbf{-25.90}}  & -26.65           & \multicolumn{1}{c|}{-20.83}  & \multicolumn{1}{c|}{\textbf{-18.87}}  & -19.23           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-9.80}   & -9.70   & \multicolumn{1}{c|}{-50.67}  & \multicolumn{1}{c|}{\textbf{-41.50}}  & -42.37           & \multicolumn{1}{c|}{-34.00}  & \multicolumn{1}{c|}{\textbf{-29.51}}  & -29.91           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-9.89}   & -9.72   & \multicolumn{1}{c|}{-61.56}  & \multicolumn{1}{c|}{\textbf{-47.99}}  & -51.22           & \multicolumn{1}{c|}{-42.68}  & \multicolumn{1}{c|}{\textbf{-35.87}}  & -37.68           \\ \hline
\multicolumn{2}{|c|}{Avg.}                                          & \multicolumn{1}{c|}{\textbf{-9.61}}           & \multicolumn{1}{c|}{-9.79}   & -9.69   & \multicolumn{1}{c|}{-47.83}  & \multicolumn{1}{c|}{\textbf{-38.46}}  & -40.08           & \multicolumn{1}{c|}{-32.50}  & \multicolumn{1}{c|}{\textbf{-28.08}}  & -28.94           \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{Netflix}} & 1                  & \multicolumn{1}{c|}{\multirow{3}{*}{-45.29}}  & \multicolumn{1}{c|}{-46.12}  & -45.39  & \multicolumn{1}{c|}{-52.51}  & \multicolumn{1}{c|}{\textbf{-48.87}}  & -49.88           & \multicolumn{1}{c|}{-49.28}  & \multicolumn{1}{c|}{\textbf{-47.95}}  & -48.15           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-47.03}  & -45.96  & \multicolumn{1}{c|}{-60.77}  & \multicolumn{1}{c|}{\textbf{-51.35}}  & -52.93           & \multicolumn{1}{c|}{-52.16}  & \multicolumn{1}{c|}{\textbf{-49.46}}  & -49.71           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-48.01}  & -46.13  & \multicolumn{1}{c|}{-65.87}  & \multicolumn{1}{c|}{\textbf{-52.34}}  & -55.00           & \multicolumn{1}{c|}{-54.12}  & \multicolumn{1}{c|}{\textbf{-50.58}}  & -50.97           \\ \hline
\multicolumn{2}{|c|}{Avg.}                                          & \multicolumn{1}{c|}{\textbf{-45.29}}          & \multicolumn{1}{c|}{-47.05}  & -45.83  & \multicolumn{1}{c|}{-59.72}  & \multicolumn{1}{c|}{\textbf{-50.85}}  & -52.60           & \multicolumn{1}{c|}{-51.85}  & \multicolumn{1}{c|}{\textbf{-49.33}}  & -49.61           \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{DNA}}     & 1                  & \multicolumn{1}{c|}{\multirow{3}{*}{-67.88}}  & \multicolumn{1}{c|}{-68.71}  & -68.20  & \multicolumn{1}{c|}{-75.64}  & \multicolumn{1}{c|}{\textbf{-72.17}}  & -72.86           & \multicolumn{1}{c|}{-73.68}  & \multicolumn{1}{c|}{\textbf{-72.17}}  & -72.42           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-69.79}  & -68.80  & \multicolumn{1}{c|}{-86.64}  & \multicolumn{1}{c|}{\textbf{-77.70}}  & -79.34           & \multicolumn{1}{c|}{-80.89}  & \multicolumn{1}{c|}{\textbf{-76.29}}  & -76.81           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-70.55}  & -69.24  & \multicolumn{1}{c|}{-97.68}  & \multicolumn{1}{c|}{\textbf{-82.01}}  & -84.64           & \multicolumn{1}{c|}{-87.61}  & \multicolumn{1}{c|}{\textbf{-79.06}}  & -79.96           \\ \hline
\multicolumn{2}{|c|}{Avg.}                                          & \multicolumn{1}{c|}{\textbf{-67.88}}          & \multicolumn{1}{c|}{-69.68}  & -68.75  & \multicolumn{1}{c|}{-86.65}  & \multicolumn{1}{c|}{\textbf{-77.29}}  & -78.95           & \multicolumn{1}{c|}{-80.73}  & \multicolumn{1}{c|}{\textbf{-75.84}}  & -76.40           \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{Movie}}   & 1                  & \multicolumn{1}{c|}{\multirow{3}{*}{-41.36}}  & \multicolumn{1}{c|}{-41.50}  & -41.35  & \multicolumn{1}{c|}{-107.57} & \multicolumn{1}{c|}{\textbf{-100.30}} & -102.19          & \multicolumn{1}{c|}{-74.19}  & \multicolumn{1}{c|}{\textbf{-71.71}}  & -72.49           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-41.55}  & -41.38  & \multicolumn{1}{c|}{-159.81} & \multicolumn{1}{c|}{\textbf{-147.69}} & -149.37          & \multicolumn{1}{c|}{-96.51}  & \multicolumn{1}{c|}{\textbf{-92.43}}  & -93.12           \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-41.65}  & -41.62  & \multicolumn{1}{c|}{-217.39} & \multicolumn{1}{c|}{-203.58}          & \textbf{-197.10} & \multicolumn{1}{c|}{-112.39} & \multicolumn{1}{c|}{-107.70}          & \textbf{-105.93} \\ \hline
\multicolumn{2}{|c|}{Avg.}                                          & \multicolumn{1}{c|}{\textbf{-41.36}}          & \multicolumn{1}{c|}{-41.57}  & -41.45  & \multicolumn{1}{c|}{-161.59} & \multicolumn{1}{c|}{-150.52}          & \textbf{-149.55} & \multicolumn{1}{c|}{-94.36}  & \multicolumn{1}{c|}{-90.61}           & \textbf{-90.51}  \\ \hline
\multicolumn{1}{|c|}{\multirow{3}{*}{BBC}}     & 1                  & \multicolumn{1}{c|}{\multirow{3}{*}{-186.91}} & \multicolumn{1}{c|}{-187.00} & -186.95 & \multicolumn{1}{c|}{-193.91} & \multicolumn{1}{c|}{\textbf{-193.24}} & -193.32          & \multicolumn{1}{c|}{-193.95} & \multicolumn{1}{c|}{\textbf{-193.50}} & -193.61          \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 3                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-187.38} & -187.23 & \multicolumn{1}{c|}{-204.63} & \multicolumn{1}{c|}{\textbf{-199.52}} & -200.42          & \multicolumn{1}{c|}{-200.05} & \multicolumn{1}{c|}{-199.00}          & \textbf{-198.99} \\ \cline{2-2} \cline{4-11} 
\multicolumn{1}{|c|}{}                         & 5                  & \multicolumn{1}{c|}{}                         & \multicolumn{1}{c|}{-188.82} & -187.43 & \multicolumn{1}{c|}{-214.46} & \multicolumn{1}{c|}{\textbf{-204.51}} & -206.93          & \multicolumn{1}{c|}{-205.69} & \multicolumn{1}{c|}{\textbf{-204.12}} & -204.31          \\ \hline
\multicolumn{2}{|c|}{Avg.}                                          & \multicolumn{1}{c|}{\textbf{-186.91}}         & \multicolumn{1}{c|}{-187.73} & -187.20 & \multicolumn{1}{c|}{-204.33} & \multicolumn{1}{c|}{\textbf{-199.09}} & -200.22          & \multicolumn{1}{c|}{-199.90} & \multicolumn{1}{c|}{\textbf{-198.87}} & -198.97          \\ \hline
\end{tabular}
\end{center}
\end{table*}

\subsection{Robust Predictive Performance}

We used conditional log-likelihood (CLL) scores to evaluate the predictive performance. Given query variables $q$ and evidence variables $e$, the CLL score of a data point $x$ equals $\log f(x^q \mid x^e)$. We compare the average CLL scores of all models on \test, \testa and \testr. We randomly selected different percentages of variables as query variables and set the remaining variables as evidence variables. The uncertainty sets are now computed over the evidence variables using greedy local search for Hamming distances $\{1,3,5\}$.

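\begin{figure}[ht]
\caption{\label{fig:qualitative}Qualitative image completion results on randomly chosen images from the binarized MNIST dataset. Rows from top to bottom: original corrupted images, covered images, reconstructions by SPNs, and reconstructions by robust SPNs.}
\includegraphics[scale = 0.37]{qualitative-example.png}
\centering
\end{figure}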

Tables~\ref{tab:spn-cll-0.5} and~\ref{tab:cn-cll-0.5} report the CLL scores obtained by the various SPNs and CNs where half of the variables were set as query variables and the remaining as evidence variables. We observe a similar trend: \{\spna, \spnr\} and \{\cna, \cnr\} have better CLL scores compared to \spn and \cn, respectively, on \testa and \testr. These results demonstrate that our proposed method yields robust predictions.

\paragraph{Robust image completion:} Fig.~\ref{fig:qualitative} shows qualitative results on the image completion task for randomly chosen images from the binarized MNIST dataset~\cite{mnist}. The first row shows the original corrupted images, the second row shows covered images (the top and left halves are covered in the first two and the last two columns respectively), and the third and fourth rows show reconstructions based on SPNs and robust SPNs respectively. We clearly observe that on corrupted data, robust SPNs yield better quality completions as compared to the original SPNs.

In summary, we notice that models trained on \eqref{eq: ARE} produce better robust (conditional) log-likelihood scores than standard models but suffer in standard (conditional) log-likelihood scores. In contrast, models trained on \eqref{eq:RMLE} have robust (conditional) log-likelihood scores comparable to those of models trained on \eqref{eq: ARE}, and they also have better standard (conditional) log-likelihood scores, which are comparable to those of standard models.


\section{Conclusion and Future Work}

In this paper, we presented an algorithm for learning robust Tractable Probabilistic Models (TPMs) when subjected to noise/perturbations/corruptions. At a high level, we formulate the robust learning problem as a max-min variant of the standard maximum likelihood estimation task where an adversary plays the role of a minimizer, affecting the training data by adding point-wise \corruptions from a deterministic uncertainty set and the optimizer plays the role of a maximizer, learning parameters that maximize the likelihood for the worst-case realization of the data. We develop a gradient-based local search technique for solving this max-min problem and show that because TPMs admit polynomial-time gradient computations, our algorithm converges to either a local or a global optimum and runs in polynomial time. Via a large experimental evaluation on standard benchmark datasets, we showed that our proposed methods perform reliably well, both in terms of generative and predictive evaluation measures, when the data is corrupted.

%\rohith{START}
%\section{Future work}
\textbf{Future work:} In this work, we focused on learning robust estimators using point-wise adversaries whose corruptions are confined to deterministic uncertainty sets; in the future, we wish to explore learning distributionally robust estimators using stronger adversaries that can move the entire observed distribution within probabilistic uncertainty sets constructed based on discrepancy measures such as the Wasserstein distance, $\phi$-divergence, etc. We also wish to explore theoretical bounds for robust generalization.
%\rohith{END}





\begin{acknowledgements} 
We thank the anonymous reviewers for their insightful comments which helped us significantly improve an earlier draft of this paper. Specifically, we would like to thank the anonymous reviewer who helped us formulate footnote~\ref{foot:static}. This work was supported in part by the DARPA Explainable Artificial Intelligence (XAI) Program under contract number N66001-17-2-4032, by the DARPA Perceptually-enabled Task Guidance (PTG) Program under contract number HR00112220005 and by the National Science Foundation CAREER award IIS-1652835.
\end{acknowledgements}

\bibliography{peddi_671}

\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
\providecommand{\upGamma}{\Gamma}
\providecommand{\uppi}{\pi}


\end{document}
