\documentclass[accepted]{uai2022} % for initial submission
% \documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\usepackage{amsmath}
\usepackage{bm}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\usepackage{mathrsfs}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage[ruled]{algorithm2e}
\usepackage{diagbox}
\usepackage{multirow}
\usepackage{tikz}

\SetKwFor{While}{while}{}{end while}%
\SetKwRepeat{Do}{do}{while}
\SetKw{KwGoTo}{go to}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Bias Aware Probabilistic Boolean Matrix Factorization}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{\href{mailto:<wan82@purdue.edu>?Subject=Your UAI 2022 paper}{Changlin Wan}{}}
\author[1]{Pengtao Dang}
\author[3]{Tong Zhao}
\author[1]{Yong Zang}
\author[1]{Chi Zhang}
\author[1]{\href{mailto:<shacao@iu.edu>?Subject=Your UAI 2022 paper}{Sha Cao}{}}

% Add affiliations after the authors
\affil[1]{%
    Indiana University, Indianapolis, Indiana, United States
}
\affil[2]{%
    Purdue University, West Lafayette, Indiana, United States
}
\affil[3]{%
Amazon, Seattle, Washington, United States
  }
  
  \begin{document}
\maketitle

\begin{abstract}
Boolean matrix factorization (BMF) is a combinatorial problem arising from a wide range of applications including recommendation system, collaborative filtering, and dimensionality reduction. Currently, the noise model of existing BMF methods is often assumed to be homoscedastic; however, in real world data scenarios, the deviations of observed data from their true values are almost surely diverse due to stochastic noises, making  each data point not equally suitable for fitting a model. In this case, it is not ideal to treat all data points as equally distributed. Motivated by such observations, we introduce a probabilistic BMF model that recognizes the object- and feature-wise bias distribution respectively, called bias aware BMF (BABF).
To the best of our knowledge, BABF is the first approach for Boolean decomposition with consideration of the feature-wise and object-wise bias in binary data. We conducted experiments on datasets with different levels of background noise, bias level, and sizes of the signal patterns, to test the effectiveness of our method in various scenarios. We demonstrated that our model outperforms the state-of-the-art factorization methods in both accuracy and efficiency in recovering the original datasets, and the inferred bias level is highly 
significantly correlated with true existing bias in both simulated and real world datasets. 
\end{abstract}

\section{Introduction}
A Boolean matrix is one type of data representation with binary entries that originates from a wide range of applications including recommendation systems, network analysis, collaborative filtering, and biological gene expression \citep{miettinen2020recent,balasubramaniam2018people,kocayusufoglu2018summarizing,zhao2020understanding,liang2020bem}. The goal of \textbf{Boolean matrix factorization (BMF)} is to discover hidden patterns from binary data by finding a pair of low-rank binary matrices (\(X\in \{0,1\}^{m\times k}\), \(Y\in \{0,1\}^{k\times n}\)) (Figure \ref{fig:intro}A,B,C) whose Boolean product approximates the original input matrix (\(A\in \{0,1\}^{m\times n}\)), i.e.,
\[A\sim X\otimes Y, \quad A_{ij}\sim \vee_{l=1}^k X_{il}\wedge Y_{lj}.\]
\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{intro.png}
    \caption{BMF with homoscedastic noise model (A-C) and bias aware BMF with column- and row-specific bias (D-H). H illustrates a biased data case in purchase history data.}
    \label{fig:intro}
\end{figure}
It is known that, under standard linear algebra, the input binary matrix \(A\) can be of high rank owing to spike columns or rows, which prevents the application of established methods like SVD and PCA \citep{wall2003singular}; by applying BMF instead, in the best case, one can reduce the rank of the original matrix to its log level \citep{monson1995survey}. Such a low-rank decomposition can capture the local \textbf{dependency} between subsets of objects (rows of \(A\)) and subsets of features (columns of \(A\)). Specifically, each rank-1 submatrix resulting from the decomposition into \(X, Y\), i.e., \(X_{:l}\otimes Y_{l:}\), indicates a group of objects (i.e., nonzero entries in \(X_{:l}\)) sharing the same behavior on a set of features (i.e., nonzero entries in \(Y_{l:}\)). Here we denote the overall pattern matrix as \(Z:=X\otimes Y\).
For the background error distribution, existing BMF methods tend to assume a homoscedastic error distribution, i.e., a universal flipping error with a flipping rate of \(p_f=p(A_{ij}=0 | Z_{ij}=1)=p(A_{ij}=1|Z_{ij}=0)\). In other words, the objective of BMF is to find a decomposition of \(A\) such that \[A=(Z + E) \bmod 2 \quad \text{s.t.}\quad Z=X\otimes Y,\; p(E_{ij}=1)=p_f,\] where \(Z, E\) minimize a certain cost function \(\tau(Z,A)=|E|=|A\ominus (X\otimes Y)|\) (Figure \ref{fig:intro}A,B,C). Here, \(\mathrm{mod}\ 2\) represents the modulo operation with a modulus of 2, and \(|\cdot|\) represents a certain norm measure defined by the cost function \(\tau(\cdot)\).

Unfortunately, the assumption of homoscedastic error distribution is often violated when applied to complex real-world data, where individual objects or features may have their own specific bias patterns that result in a \textbf{heteroscedastic error distribution}. Existing BMF methods fail to account for such object- or feature-specific bias, which could severely impact our ability to identify the true pattern \(Z\), as the error matrix \(E\) may display row- or column-specific bias \citep{wan2020denoising}. Take online transaction records as an example. The observed transaction records of customers (rows) and items (columns) are constituted by three components: pattern, bias and flipping error (Figure \ref{fig:intro}D), meaning that aside from stochastic error, to determine whether or not a customer would purchase a certain item, one should not only look at the purchase pattern that he/she belongs to (Figure \ref{fig:intro}E), but also his/her innate personal purchase preferences and the popularity of the item (Figure \ref{fig:intro}G). For example, a super-buyer, or someone with impulsive buying habits, is very likely to make a purchase regardless of the properties of the items; while a super-item, or a popular item, is also likely to be purchased by users with different characteristics (Figure \ref{fig:intro}B,H).

To mend the gap in binary data analysis, we propose \textbf{BABF} (\textbf{B}ias \textbf{A}ware \textbf{B}oolean matrix \textbf{F}actorization), the first tool to derive the latent binary pattern (\(Z\)), in the presence of individual row-wise and column-wise bias (Figure \ref{fig:intro}D-H), denoted as two real-valued probability vectors \(\bm{\mu},\,\bm{\nu}\), with \({\bm\mu}_i\in \left[0,1\right]\forall i\in \{1,...,m\}\) and \({\bm\nu}_j\in \left[0,1\right]\forall j\in \{1,...,n\}\). These two vectors represent processes that are object- and feature-specific, and are independent from the pattern generation process, or the homoscedastic background error. In other words, they capture the individual bias generation process that can't be captured by the existing model. 


In this work, our contribution is three-fold:
\begin{itemize}
    \item BABF is the first method that considers a heteroscedastic error model resulting from object- and feature-specific bias, which is more suitable for modeling real world data.
    \item BABF is a highly efficient algorithm in capturing the low rank structures in binary matrix in the presence of individual bias, and showed robust performance in deriving the true patterns across different data scenarios.
    \item As a byproduct of pattern discovery, BABF-derived individual bias patterns are highly consistent with the true bias pattern in simulated data and reasonable in real world data, which may lead to practical interpretations depending on different application scenarios.

\end{itemize}
%1) BABF is the first method that considers a heteroscedastic error model resulted from object- and feature-specific bias, which is more suitable for modeling real world data; 2) BABF is a highly efficient algorithm in capturing the low rank structures in binary matrix in the presence of individual bias, and showed robust performance in deriving the true patterns across different data scenarios; 3) As a byproduct of pattern discovery, BABF-derived individual bias patterns are highly consistent with the true bias pattern in simulated data and reasonable in real world data, which may lead to practical interpretations depending on different application scenarios.

\section{Problem formulation}
In this section, we formally address our objective to derive the latent Boolean patterns while considering the individual row- and column-wise bias in a probabilistic framework. We first introduce the notations used across this paper, then review the existing probabilistic BMF framework of \citet{ravanbakhsh2016boolean,rukat2017bayesian}, and then present our bias-aware BMF model, BABF\footnote{Code is available at \url{https://github.com/clwan/BABF}.}.

\subsection{Notation}

Matrix, vector and scalar values are denoted by uppercase (\(A\)), bold lowercase (\(\mathbf{a}\)) and lowercase (\(a\)) characters, respectively. The superscript represents the dimension of the object (e.g. \(A^{m\times n}\)), while the subscript indicates the element indices (e.g. \(i\)-th row: \(A_{i:}\), \(j\)-th column: \(A_{:j}\), and \(ij\)-th element: \(A_{ij}\)). \(|\cdot|\) represents a certain type of norm measure. Under Boolean arithmetic, the \textit{and}, \textit{or}, and \textit{not} operations are denoted by \(\wedge,\, \vee\), and \(\neg\). Subsequently, the Boolean element-wise sum and subtraction are defined as \(X\oplus Y=X\vee Y\) and \(X\ominus Y=(X\wedge \neg Y)\vee (\neg X\wedge Y)\). The Boolean matrix product is defined as \(Z=X\otimes Y\), where \(Z_{ij}=\vee_{l=1}^k X_{il}\wedge Y_{lj}\).


\subsection{Existing homoscedastic BMF model}

Following \citet{ravanbakhsh2016boolean,rukat2017bayesian},  each observed entry in a matrix \(A\), i.e. \(A_{ij}\in \{0,1\}\), is assumed to be generated from the latent pattern \(Z_{ij}\) with a homoscedastic error model with universal flipping probability \(p_f\), where the likelihood function is defined as

\[p(A_{ij}|Z_{ij})=\begin{cases}
1-p_f, & \text{if } A_{ij}=Z_{ij},\\
p_f, & \text{if } A_{ij}\neq Z_{ij},
\end{cases}\]
\[p(A|Z)=\prod_{i,j}p(A_{ij}|Z_{ij})\]

As \(Z=X\otimes Y\), individual Bernoulli prior is applied on every element of \(X\) and \(Y\), i.e.,
\[p(X)=\prod_{i,l}p(X_{il}),\quad p(Y)=\prod_{l,j}p(Y_{lj}).\]
Under this formulation, BMF is equivalent to a \textit{maximum a posteriori (\textbf{MAP})} inference problem of \(X\) and \(Y\) that maximizes the following posterior:
%\[\begin{split}
%&log(p(X,Y|A))\propto log(p(X)p(Y)p(Z|X,Y)p(A|Z))\\&
%=\sum_{il}^{m\times k}log(p(X_{il}))+\sum_{lj}^{k\times n}log(p(Y_{lj}))\\&
%+\sum_{ij}^{m\times n}log(p(Z_{ij}|\vee_{l=1}^k X_{il}\wedge Y_{lj}))\\&
%+\sum_{ij}^{m\times n}log(p(A_{ij}|Z_{ij}))
%\end{split}\]
\[p(X,Y|A)\propto p(X)p(Y)p(Z|X,Y)p(A|Z)\]
Following \citet{ravanbakhsh2016boolean}, we assume identical Bernoulli prior on \(X,Y\), represented by factor \(h\), e.g., \(h(X_{il})=log(p(X_{il})),\, h(Y_{lj})=log(p(Y_{lj}))\). Here, \(p(Z|X,Y)\) encodes the hard constraint that ensures the equality of the Boolean product, i.e., \(Z=X\otimes Y\). By introducing an auxiliary tensor \(W\in \{0,1\}^{m\times n\times k}\), where \(W_{ijl}=X_{il}\wedge Y_{lj}, \;Z_{ij}=\vee_{l=1}^{k}W_{ijl}\), this hard constraint is dispersed onto each element in \(W\), and can be reformulated as an identity constraint as \[p(W_{ijl}|X_{il},Y_{lj})=\mathcal{I}(W_{ijl}=X_{il}\wedge Y_{lj})\]  
where for \(\mathcal{I}\), we have \(\mathcal{I}(\text{true})=1\) and \(\mathcal{I}(\text{false})=0\). Obviously, if \(W_{ijl}\neq X_{il}\wedge Y_{lj}\), the factor \(f(W_{ijl},X_{il},Y_{lj})=\log(p(W_{ijl}|X_{il},Y_{lj}))\) evaluates to \(-\infty\). Finally, the factor \(g(\{W_{ijl}\},\forall l\in \{1,...,k\})=\log(p(A_{ij}|Z_{ij}))\) assesses the likelihood of the observed variable \(A_{ij}\) given the latent pattern \(Z_{ij}\). Overall, we have the factor graph representation of the log-posterior \(\log(p(X,Y|A))\) (Figure \ref{fig:model}A, adapted from \citet{ravanbakhsh2016boolean}) as
\[\begin{split}
&\log(p(X,Y|A))=\sum_{il}h(X_{il})+\sum_{lj}h(Y_{lj})\\&
+\sum_{ijl}f(W_{ijl},X_{il},Y_{lj})
+\sum_{ij}g(\{W_{ijl}\}_l)
\end{split}\]

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{model.png}
    \caption{The factor graph representation of BMF and bias-aware BMF. Note that panel A is adapted from \citet{ravanbakhsh2016boolean}.}
    \label{fig:model}
\end{figure}

Owing to the NP-hard complexity of BMF \citep{stockmeyer1975set,gillis2018complexity}, it is intractable to infer the MAP of the log-likelihood. Alternatively, focusing on the marginal-MAP often yields good empirical success \citep{ravanbakhsh2016boolean,rukat2017bayesian}, e.g.,
\[\begin{split}
 &\argmax_{X_{il}}log(p(X_{il}|A))=\\&
 \argmax_{X_{il}}\sum_{X_{i'l'}\backslash X_{il},Y_{l'j'}}log(p(X_{i'l'},Y_{l'j'}|A))  
\end{split}\]
Max-sum belief propagation and Gibbs sampling have been reported to achieve good performance under such a strategy \citep{ravanbakhsh2016boolean,rukat2017bayesian}. 

\subsection{Proposed bias aware BMF model}
The probabilistic BMF model presented above provides a good framework for us to account for the feature- and object-wise bias. Compared with the homoscedastic setting, the core advancement of our work is to consider the observed data as generated from a process that is more realistic: aside from stochastic error, or the homoscedastic error distribution as in \citep{ravanbakhsh2016boolean,rukat2017bayesian}, we consider that the observed data is generated not only from the latent pattern \(Z=X\otimes Y\), but also from independent object/feature behavior process governed by a bias matrix \(B\in\{0,1\}^{m\times n}\), where \(B\) is determined by a row- and column-wise bias vector \(\bm{\mu}\) and \(\bm{\nu}\) in such way that \[\quad p_{B_{ij}}=p(B_{ij}=1)=\bm{\mu}_i\bm{\nu}_j\] 
%\[\begin{split}
%&\bm{{\mu}}_i\in\left[0,1\right],\quad p(A_{ij'}=1)=\bm{\mu}_{i},\forall %j'\in\{1,...,n\}\\&
%\bm{\nu}_j\in\left[0,1\right],\quad p(A_{i'j}=1)=\bm{\nu}_{j},\forall %i'\in\{1,...,m\}
%\end{split}\]

And the generation process of \(A\) is hence
\[A=B\oplus ((Z+E)\,mod\,2).\]


The new likelihood of each observations can be characterized in the following four scenarios:
\[\begin{split}
&p(A_{ij}=1|Z_{ij}=0)=1-(1-p_f)(1-\bm{\mu}_i\bm{\nu}_j)\\&
p(A_{ij}=0|Z_{ij}=0)=(1-p_f)(1-\bm{\mu}_i\bm{\nu}_j)\\&
p(A_{ij}=1|Z_{ij}=1)=1-p_f(1-\bm{\mu}_i\bm{\nu}_j)\\&
p(A_{ij}=0|Z_{ij}=1)=p_f(1-\bm{\mu}_i\bm{\nu}_j)
\end{split}\]


The new posterior probability could then be written as
\[\begin{split}
p(X,Y,& \bm{\mu},\bm{\nu}|A)\propto\\&
p(X)p(Y)p(Z|X,Y)p(\bm{\mu})p(\bm{\nu})p(A|Z,\bm{\mu},\bm{\nu})
\end{split}\]
Factor graph representation of the new posterior is shown in Figure \ref{fig:model}B. Comparing to the existing probabilistic BMF model introduced in 2.2, the new factor graph involves the row- and column-wise bias vectors \(\bm{\mu},\bm{\nu}\). Given no prior knowledge of the two variables, we assume a uniform prior on \(\bm{\mu},\bm{\nu}\), thus factor \(b({\bm\mu}_i),b({\bm\nu}_j)\) evaluate to 0 in the graph. And the likelihood factor \(g\) is also related to \(\bm{\mu},\bm{\nu}\) in the new formulation. In the next section, we introduce BABF algorithm to derive the decomposition. 



%Same as in 2.2, the new bias aware formulation is still an NP-hard problem, and we also turn to find the marginal-MAP, which corresponds to optimally estimating individual variables, while the other variables are marginalized. In the next section, we introduce BABF algorithm to derive the decomposition. 




\section{The algorithm of BABF}
While we assume \(A\) to be generated from two sources, latent pattern \(Z\) and Bias \(B\), these two sources themselves can be considered as independent from each other. Such independence is also reflected on the factor graph (Figure \ref{fig:model}B). Though the likelihood factor \(g\) and the auxiliary variables \(W\) are involved with both \(\{X,Y\}\) and \(\{\bm{\mu},\bm{\nu}\}\), the direct message update of \(\{X,Y\}\) and \(\{\bm{\mu},\bm{\nu}\}\) are independent with each other. Conveniently, \(\{X,Y,W\}\) and \(\{\bm{\mu},\bm{\nu},W\}\) can be considered as two individual systems to be treated separately. 

\begin{algorithm}
\SetAlgoLined
\textbf{Inputs:}\(A\), \(k\), \(p_X\), \(p_Y\), \(p_f\), \(t_{all}\), \(t_{MF}\), \(t_{BI}\) \\
\textbf{BABF}:\\
\While{\(t\leq t_{all}\) and not converged messages}{
\(\bm{\mu}^{t+1},\bm{\nu}^{t+1}\leftarrow\)Bias\_infer(\(A,X^{t},Y^{t},t_{BI}\))\\
\(X^{t+1},Y^{t+1}\leftarrow\) prob\_BMF(\(A,k,p_X,p_Y,p_f,\bm{\mu}^{t+1},\bm{\nu}^{t+1},t_{MF}\))
}
\texttt{\\}

\textbf{Bias\_infer}:\\
\(Z:=X\otimes Y\)\\
\While{\(t\leq t_{BI}\) and \(error\_now< error\_all\)}{
\(error\_all:=error\_now\)\\
\(\bm{\mu}_i^{t+1}:=\frac{\sum_{j\in\{j_0|Z_{ij_0}=0\}} A_{ij}\bm{\nu}_j^t}{\sum_{j\in\{j_0|Z_{ij_0}=0\}} \bm{\nu}_j^t}, \forall i\in\{1,...,m\}\)\\
\(\bm{\nu}_j^{t+1}:=\frac{\sum_{i\in\{i_0|Z_{i_0j}=0\}} A_{ij}\bm{\mu}_i^t}{\sum_{i\in\{i_0|Z_{i_0j}=0\}} \bm{\mu}_i^t}, \forall j\in\{1,...,n\}\)\\
\(error\_now:=\sum_{(i,j)\in\{(i_0,j_0)|Z_{i_0j_0}=0\}}(A_{ij}-\bm{\mu}_i\bm{\nu}_j)^2\)
}


\texttt{\\}
\textbf{prob\_BMF:}\\
\(p(A_{ij}|Z_{ij})\leftarrow\) calculate based on \(\bm{\mu},\bm{\nu}\).\\
Initialize \(\Psi_{ijl}^0,\,\hat{\Psi}_{ijl}^0,\,\Phi_{ijl}^0,\,\hat{\Phi}_{ijl}^0,\,\Gamma_{ijl}^0,\,\hat{\Gamma}_{ijl}^0,\forall i,j,l\)\\
\While{\(t\leq t_{MF}\) and not converged messages}{
%\(\forall i,i' \in \{1,...,m\},\,j,j'\in \{1,...,n\},\,l,l'\in\{1,...,k\}\)
\(\Phi_{ijl}^{t+1}:=\max(\Gamma_{ijl}^t+\hat{\Psi}_{ijl}^t,0)-\max(\hat{\Psi}_{ijl}^t,0)\)\\
\(\Psi_{ijl}^{t+1}:=\max(\Gamma_{ijl}^t+\hat{\Phi}_{ijl}^t,0)-\max(\hat{\Phi}_{ijl}^t,0)\)\\
\(\hat{\Phi}_{ijl}^{t+1}:=\log\frac{p_X}{1-p_X}+\sum_{j'\neq j}\Phi_{ij'l}^t\)\\
\(\hat{\Psi}_{ijl}^{t+1}:=\log\frac{p_Y}{1-p_Y}+\sum_{i'\neq i}\Psi_{i'jl}^t\)\\
\(\Gamma_{ijl}^{t+1}:=\min\bigl(\log\frac{p(A_{ij}|1)}{p(A_{ij}|0)}+\sum_{l'\neq l}\max(\hat{\Gamma}_{ijl'}^t,0),\, \max(0,-\max_{l'\neq l}\hat{\Gamma}_{ijl'}^t)\bigr)\)\\
\(\hat{\Gamma}_{ijl}^{t+1}:=\min(\hat{\Phi}_{ijl}^t+\hat{\Psi}_{ijl}^t,\hat{\Psi}_{ijl}^t,\hat{\Phi}_{ijl}^t)\)
}
%\(\forall i\in\{1,...,m\}\, j\in \{1,...,n\},\, l\in\{1,...,k\}\)
\(X_{il}=\begin{cases}
1,\quad \text{if}\; \log\frac{p_X}{1-p_X}+\sum_j\Phi_{ijl}^t>0\\
0,\quad \text{otherwise.}
\end{cases}\)\\
\(Y_{lj}=\begin{cases}
1,\quad \text{if}\; \log\frac{p_Y}{1-p_Y}+\sum_i\Psi_{ijl}^t>0\\
0,\quad \text{otherwise.}
\end{cases}\)\\
\texttt{\\}
\textbf{Outputs:} \(X,Y,\bm{\mu},\bm{\nu}\)
 \caption{BABF, Bias Aware BMF}
\end{algorithm}

Under this formulation, fitting pattern \(\{X,Y\}\) while given \(B\) is an NP-hard problem as it can be regarded as traditional BMF without the influence of \(B\), i.e., \[A\cdot(\neg B)=((Z+E)\, mod \,2)\cdot (\neg B).\]
Or probabilistically, while given \(B\), this problem could be reduced to weighted graph maximum cut, which is also NP-hard \citep{stockmeyer1975set,gillis2018complexity}. Overall, we can claim that bias-aware BMF is at least as hard as traditional BMF; therefore, it is also an NP-hard problem. To solve this problem, we still turn to finding the marginal-MAP, which corresponds to optimally estimating individual variables while the other variables are marginalized.




Here we introduce BABF in Algorithm 1. BABF has two core components, prob\_BMF and Bias\_infer, corresponding to the derivations of \(\{X,Y\}\) and \(\{\bm{\mu},\bm{\nu}\}\). Other than the input data \(A\), BABF takes the number of patterns \(k\), the Bernoulli priors of \(X,Y\) (\(p_X,p_Y\)), the flipping error \(p_f\), and the maximum numbers of iterations for the overall algorithm as well as for the core components (\(t_{all},t_{MF},t_{BI}\)) as input, and outputs the decomposition \(X,Y\) and the bias vectors \(\bm{\mu},\bm{\nu}\).

%\begin{algorithm}
%\SetAlgoLined
%\textbf{Inputs}: \(A,k,p_X,p_Y,p_f,\bm{\mu},\bm{\nu},t_{MF}\)\\
%\textbf{Initialization:}\(\Psi_{ijl}^0,\,\hat{\Psi}_{ijl}^0,\,\Phi_{ijl}^0,\,\hat{\Phi}_%{ijl}^0,\,\Gamma_{ijl}^0,\,\hat{\Gamma}_{ijl}^0,\forall i,j,l\)\\
%\textbf{prob\_BMF:}\\
%\(p(A_{ij}|Z_{ij})\leftarrow\) calculate based on \(\bm{\mu},\bm{\nu}\).\\
%\While{\(t\leq t_{MF}\) and not converged messages}{
%%\(\forall i,i' \in \{1,...,m\},\,j,j'\in \{1,...,n\},\,l,l'\in\{1,...,k\}\)
%\(\Phi_{ijl}^{t+1}:=\text{max}(\Gamma_{ijl}^t+\hat{\Psi}_{ijl}^t,0)-\text{max}(\Psi_{ijl%}^t,0)\)
%\(\Psi_{ijl}^{t+1}:=\text{max}(\Gamma_{ijl}^t+\hat{\Phi}_{ijl}^t,0)-\text{max}(\Phi_{ijl%}^t,0)\)
%\(\hat{\Phi}_{ijl}^{t+1}:=log(\frac{1-p_f}{p_f})+\sum_{j'\neq j}\Phi_{ij'l}^t\)
%\(\hat{\Psi}_{ijl}^{t+1}:=log(\frac{1-p_f}{p_f})+\sum_{i'\neq i}\Psi_{i'jl}^t\)
%\(\Gamma_{ijl}^{t+1}:=\text{min}(log(\frac{p(A_{ij}|1)}{p(A_{ij}|0)}+\sum_{l'\neq %l}\text{max}(\Gamma_{ijl'}^t)), \text{max}(0,-\max_{l'\neq l}\hat{\Gamma}_{ijl'}^t))\)
%\(\hat{\Gamma}_{ijl}^{t+1}:=\text{min}(\hat{\Phi}_{ijl}^t+\hat{\Psi}_{ijl}^t,\hat{\Psi}_%{ijl}^k,\hat{\Phi}_{ijl}^t)\)
%}
%%\(\forall i\in\{1,...,m\}\, j\in \{1,...,n\},\, l\in\{1,...,k\}\)
%\(X_{il}=\begin{cases}
%1\quad \text{if}\; log(\frac{1-p_f}{p_f})+\sum_{i=1}^m\Phi_{ijl}^t>0\\
%0\quad otherwise.
%\end{cases}\)
%\(Y_{lj}=\begin{cases}
%1\quad \text{if}\; log(\frac{1-p_f}{p_f})+\sum_{i=1}^m\Psi_{ijl}^t>0\\
%0\quad otherwise.
%\end{cases}\)

%\textbf{Return:}
%\(X,Y\)
%\caption{prob\_BMF}
%\end{algorithm}



\subsection{prob\_BMF}
When fixing bias vectors \(\bm{\mu},\bm{\nu}\), the only differences between bias aware BMF and the existing BMF model introduced in 2.2 is that each likelihood factor \(g_{ij}\) would evaluate to different probability assignments by referencing \(\bm{\mu}_{i},\bm{\nu}_j\). Following \citet{ravanbakhsh2016boolean}, we utilize the max-sum belief propagation (BP) strategy to approximate the overall likelihood in prob\_BMF. Correspondingly, the message \(\Gamma_{ijl}\) that propagates the likelihood information to auxiliary variable \(W\) would be different from \citet{ravanbakhsh2016boolean} with individualized probabilities. We introduced detailed derivations of the message passing process in the Bias aware factor graph (Figure \ref{fig:model}B) in the Appendix.   

%\begin{algorithm}
%\SetAlgoLined
%\textbf{Inputs:} \(A,X,Y,t_{BI}\)\\
%\textbf{Initialization}: \({\mu}_i,{\nu}_j,\,\forall %i\in\{1,...,m\}\;j\in\{1,...,n\}\) \(error\_all=m\times n\), %\(error\_now=error\_all-1\)\\
%\textbf{Bias\_infer}:\\
%\(Z:=X\otimes Y\)\\
%\While{\(t\leq t_{BI}\) and \(error\_now< error\_all\)}{
%\(error\_all:=error\_now\)
%\({\mu}_i^{t+1}:=\frac{\sum_{j=1,Z_{ij}=0}^n %A_{ij}{\nu}_j^t}{\sum_{j=1,Z_{ij}=0}^n {\nu}_j^t} \forall i\in\{1,...,m\}\)
%\({\nu}_j^{t+1}:=\frac{\sum_{i=1,Z_{ij}=0}^m %A_{ij}{\mu}_i^t}{\sum_{i=1,Z_{ij}=0}^m {\mu}_i^t} \forall j\in\{1,...,n\}\)
%\(error\_now:=\sum_{ij,Z_{ij}=0}(A_{ij}-{\mu}_i{\nu}_j)^2\)
%}
%\textbf{Return:} \(\bm{\mu},\bm{\nu}\)
% \caption{Bias\_infer}
%\end{algorithm}

\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{simu_b.png}
    \caption{Performance comparison on simulated data}
    \label{fig:simub}
\end{figure*}

\subsection{Bias\_infer}

The inference of the marginal-MAP of \(\bm{\mu},\bm{\nu}\) is a non-trivial task even with accurate pattern information \(Z\), as for any bias variable \(\bm{\mu}_i\), each observation related to this variable is related to a different \(\bm{\nu}_j\), and vice versa. To circumvent this computational challenge, we adopted two modifications. 1) We only consider the observations that are not covered by pattern \(Z\) for bias inference. We argue the pattern related observations have marginal contribution to the bias inference and could be omitted. 2) Instead of deriving the exact MAP, we treat this as an optimization problem, where we could utilize conventional loss functions to achieve the same objective of minimizing the difference between \(\bm{\mu},\bm{\nu}\) and the background information. Inspired by \citet{wan2020denoising}, we apply a modified mean square loss. Taking \(\bm{\mu}_i\) as an example, the loss function takes the form of
 \[\Omega_{i}=\sum_{j\in\{j_0|Z_{ij_0}=0\}}\bm{\nu}_j^t(A_{ij}-\bm{\mu}_i^t)^2\]
 The most important benefit of this modified loss is that
it ensures each probability \(\bm{\mu}_i\) would be from the interval [0,1], and it still considers the impact of \(\bm{\nu}_j\) on each observation \(A_{ij}\). Moreover, it is with high computational feasibility as the updated \(\bm{\mu}_i^{t+1}\) could be easily derived as
 \(\bm{\mu}_i^{t+1}:=\frac{\sum_{j\in\{j_0|Z_{ij_0}=0\}} A_{ij}\bm{\nu}_j^t}{\sum_{j\in\{j_0|Z_{ij_0}=0\}} \bm{\nu}_j^t}\). And similarly,
  \(\bm{\nu}_j^{t+1}:=\frac{\sum_{i\in\{i_0|Z_{i_0j}=0\}} A_{ij}\bm{\mu}_i^t}{\sum_{i\in\{i_0|Z_{i_0j}=0\}} \bm{\mu}_i^t}\). Here, we implement this strategy in Bias\_infer. Empirically, it is robust for the bias inference across different scenarios, which we will introduce in detail in the Experiments section.

\subsection{Complexity analysis}
The computational cost of BABF depends on the core modules. For each iteration, prob\_BMF will visit all variables in \(\{X,Y,W\}\), and the calculation of the message update is at constant cost. Hence, the cost of prob\_BMF is bounded by the size of latent variables, i.e. \(O(mnk)\). The consideration of mean square loss enables high computational feasibility to update the bias, therefore, in each iteration of Bias\_infer the computation is linear with data size, i.e., \(O(mn)\). Overall, the computational cost of each iteration of BABF is \(O(mnk)\).   


\section{Experiments}
We evaluate the performance of our bias aware model on both synthetic and real world datasets. We first introduce related methods for BMF and report the benchmark performance across different simulated data scenarios. We then highlight the practical use of BABF in our analysis of MovieLens and gene expression data.

%In this section we illustrate the performance of BABF, the first tool to simultaneously derive both \(\{X,Y\}\) and \(\{\bm{\mu},\bm{\nu}\}\). 

\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{background.png}
    \caption{BABF inferred bias is highly correlated with ground truth bias}
    \label{fig:bg}
\end{figure*}


\subsection{Related work}
In addition to the probabilistic methods introduced above \citep{ravanbakhsh2016boolean,rukat2017bayesian}, different heuristic methods have been developed to solve the BMF problem. Previously, \citet{wan2020denoising} systematically discussed the bias issue in BMF, but their focus is to explore the identifiability of the patterns in the presence of bias in the noise model. None of the remaining methods considers the heteroscedasticity of the error distribution. Among these methods, ASSO represents a series of work from Miettinen et al.\ \citep{miettinen2008discrete,miettinen2011model,karaev2015getting,tatti2019boolean}. ASSO first generates a pool of column bases from a row-wise correlation matrix, and iteratively searches for the best column and row bases following a pre-defined cost function. PANDA is another series of heuristic methods that embed the cost function in the search of top-\(k\) core patterns \citep{lucchese2010mining,lucchese2013unifying}. Formal Concept Analysis also showed empirical success in BMF \citep{belohlavek2015below,belohlavek2018new,belohlavek2019factorizing}. More recently, \citet{wan2020fast} proposed a fast algorithm by formulating submatrix pattern identification from a geometric perspective. \citet{kovacs2020binary} formulate BMF as an integer programming problem and utilize a column generation framework to search for the best solutions. Here, we benchmark the performance of BABF against MP \citep{ravanbakhsh2016boolean}, CG \citep{kovacs2020binary}, MEBF \citep{wan2020fast}, ASSO \citep{miettinen2008discrete} and PANDA \citep{lucchese2010mining}, and believe that this set of methods represents the state-of-the-art performance of BMF from different perspectives.






\subsection{Benchmark on simulated data}
We simulate an observed binary matrix \(A\) by the following model: \[A=B\oplus ((Z+E) \,mod\,2).\]

Here, \(B, Z, E\) represent the column-/row-wise bias matrix, pattern matrix and error matrix respectively. 
Each entry in \(B,E\in\{0,1\}^{m\times n}\) is simulated to follow a Bernoulli distribution with success probabilities \(p(B_{ij}=1)=\bm{\mu}_i\bm{\nu}_j\) and \(p(E_{ij}=1)=p_f\). The latent pattern matrix is generated by \(Z=X\otimes Y\), where \(X\in \{0,1\}^{m\times k},Y\in\{0,1\}^{k\times n}\), and entries in \(X,Y\) also follow Bernoulli distributions with success probabilities \(p(X_{il}=1)=p_X\) and \(p(Y_{lj}=1)=p_Y\). To comprehensively evaluate the methods, we generate varied data scenarios by considering different pattern numbers (\(k\in\{3,4,5\}\)) and flipping errors (\(p_f\in\{0,0.05\}\)). We also use different levels of \(p_X,p_Y\) to simulate pattern matrices of different density levels, where low density has \(p_X=p_Y=0.2\) while high density has \(p_X=p_Y=0.4\). The bias level is controlled by \(\bm{\mu},\bm{\nu}\). In the low bias case, we sample every \(\bm{\mu}_i,\,\bm{\nu}_j\) uniformly from \(\left[0.1,0.8\right]\), which yields an overall bias level of \(\bar{p}_{B_{ij}}\approx 0.2\). For the high bias case, \(\bm{\mu}_i,\,\bm{\nu}_j\) are sampled uniformly from \(\left[0.3,0.9\right]\), which results in an overall bias level of \(\bar{p}_{B_{ij}}\approx 0.36\). Altogether, we simulated 24 data scenarios. For each scenario, we set \(m=n=100\) and simulate 20 replicates.



\subsubsection{Performance on reconstruction error}

We report the benchmark results in Figure \ref{fig:simub}. We utilize default setting of the benchmarking methods in our analysis. As for BABF, we assume the prior of \(X,Y\) as Bernoulli distribution with \(p_X=p_Y=0.5\) and a flipping error of \(p_f=0.01\). The maximum iterations of \(t_{all},t_{BI},t_{MF}\) are set at 20, 5 and 50.

For each method, we compare their performance using the reconstruction error, i.e., \(|\hat{Z}-Z|\), as the evaluation metric. Here, \(|\cdot|\) represents the \(L_1\) norm, and \(\hat{Z}\) denotes the pattern matrix derived by each method. A lower reconstruction error indicates better performance. It is anticipated that heuristic approaches like CG, MEBF, ASSO and PANDA would show varied performance with respect to different data scenarios, as different bias levels would have different impacts on their underlying heuristic assumptions. The probabilistic method MP showed an overall stable performance but still struggles with high bias levels. As expected, BABF achieves the most desirable performance across different bias levels, which highlights the importance of considering individual bias. Additionally, BABF demonstrated its robustness across different data scenarios.




\subsubsection{Evaluate inferred bias}
We explore whether BABF could reliably recover the bias levels \(\bm{\mu},\bm{\nu}\). Here, we denote BABF inferred row- and column-wise bias as \(\hat{\bm{\mu}},\hat{\bm{\nu}}\). Since it is easy to find a scalar value \(r\), s.t., \(\bm{\mu}_i\cdot\bm{\nu}_j=r\hat{\bm{\mu}}_i\cdot\frac{1}{r}\hat{\bm{\nu}}_j\), we do not seek to directly compare the difference of values between \(\bm{\mu}_i, \hat{\bm{\mu}}_i\), or \(\bm{\nu}_j, \hat{\bm{\nu}}_j\), but instead analyze the correlation between the inferred bias and true bias for every input matrix \(A\), i.e., \(corr(\bm{\mu},\hat{\bm{\mu}}), corr(\bm{\nu},\hat{\bm{\nu}})\).  We report the correlation results across different data scenarios with pattern number (\(k=4\)) in Figure \ref{fig:bg}. Every scenario has 20 replications. Figure \ref{fig:bg}A,B show the row- and column-wise bias across different data scenarios. In most cases, BABF inferred bias achieved over 0.8 correlation with ground truth. Even in the worst case, the correlation is as high as 0.4. To give a more intuitive idea, we reveal the inferred bias and true bias of the first input matrix from each scenario as an example in Figure \ref{fig:bg}C,D. The high correlation suggests desirable performance of BABF to infer the individual bias associated with the objects and the features.




\subsubsection{Performance on data without bias}
Next, we wish to test how BABF performs on data without bias (\(\bm{\mu}_i=\bm{\nu}_j=0,\forall i,j\)). In other words, we would like to demonstrate that BABF works well in scenarios with or without bias. In Figure~\ref{fig:simunb}, we report the reconstruction error of the methods across 12 data scenarios, all without background bias. In general, BABF and MP showed reliable performance. In some high density cases, BABF performs slightly worse than MP, but the difference is only marginal. Overall, BABF showed robust performance across different data scenarios.


\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{simu_nb.png}
    \caption{Performance comparison on simulated data without individual bias}
    \label{fig:simunb}
\end{figure}

\subsubsection{Selection of the pattern number}
In our setting, the pattern number \(k\) is the most important hyper-parameter, as it directly determines the number of variables in the factor graph. Under the probabilistic framework, we could utilize different statistical metrics to select the optimal pattern number. Here we test three metrics, including cross validation accuracy (CV), Akaike information criterion (AIC) and Bayesian information criterion (BIC), for the model selection of \(k\). For CV, we use 90\% of the data for fitting and the remaining 10\% for testing \citep{kohavi1995study}. For AIC and BIC, we utilize the formulation in \citet{stoica2004model}. For all the methods, we evaluate the metrics on \(k\in\{2,\dots,6\}\) and select the best \(k\) following their formulation. We tested the above metrics across all 24 data scenarios and report the pattern number selection results in box plots (Figure~\ref{fig:model_select}). Here the red dashed line marks the ground truth of \(k\). Overall, CV showed consistently accurate selection of the pattern number, with only marginal deviations for a small number of cases. AIC and BIC are impacted by the size of the input data given a rather large number of variables in the model. In particular, BIC tends to select a small \(k\) for the model.

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{model_select.png}
    \caption{Model selection of pattern number \(k\)}
    \label{fig:model_select}
\end{figure}

\begin{table}
  \caption{Reconstruction error on different stopping criteria}
  \begin{adjustbox}{max width=\linewidth,center}\label{table1}
  \begin{tabular}{lcccc}
    \toprule
   \backslashbox{\(t_{BI}\)}{\(t_{MF}\)} & \textbf{10} & \textbf{25} & \textbf{50} & \textbf{100} \\
    \midrule
\textbf{5} & 56.1(242.0) & 2.2(5.5) & 2.2(5.5) & 33.0(.6) \\
\textbf{10} & 54.6(230.5) & 2.2(5.5) & 2.2(5.5) & 2.2(5.5) \\
\textbf{15} & 41.9(180.1) & 57.6(241.0) & 2.2(5.5)& 2.2(5.5)\\
    \bottomrule
  \end{tabular}
  \end{adjustbox}

\end{table}

\begin{figure*}
    \centering
    \includegraphics[width=\textwidth]{real_llh.png}
    \caption{Goodness of fitting on the decomposition for real-world data}
    \label{fig:llh}
\end{figure*}

\subsubsection{Testing on the stopping criteria}
The optimization scheme of our bias aware BMF alternately fits the bias and the pattern matrices: fitting the pattern given the bias corresponds to the algorithm component \textbf{prob\_BMF}, and fitting the bias given the pattern information corresponds to the algorithm component \textbf{Bias\_infer}. At each iteration, we provide the option to set the maximum number of runs per step for \textbf{prob\_BMF} and \textbf{Bias\_infer} (corresponding to \(t_{MF}\) and \(t_{BI}\) in Algorithm 1). In setting \(t_{MF}\) and \(t_{BI}\), our goal is to find a rather \emph{``optimal''} point that will not lead to premature overfitting of the pattern or the bias before the final convergence. We checked 12 different combinations where \(t_{MF}\in\{10,25,50,100\}\) and \(t_{BI}\in\{5,10,15\}\). For every combination, the maximum number of steps is set to 10000 to ensure convergence. We expect that \(t_{MF}\) in general needs to be higher than \(t_{BI}\), as the factor graph of \textbf{prob\_BMF} is denser than that of \textbf{Bias\_infer}. In Table~\ref{table1}, we report the mean (standard deviation) of the reconstruction error for one scenario: high density, low bias, with noise and \(k=3\). As expected, a higher \(t_{MF}\) or \(t_{BI}\) is not always better; instead, \(t_{MF}\) and \(t_{BI}\) need to be balanced to achieve a small reconstruction error. Similar results can be seen across different scenarios. In practice we set \(t_{MF}=50\) and \(t_{BI}=10\) as default.




\subsection{Analysis on real-world data}
We tested the performance of BABF on three real world datasets: movie lens data from \citet{harper2015movielens} and two biological gene expression datasets, head and neck cancer and melanoma single cell RNAseq (scRNA-seq) data from \citet{puram2017single,tirosh2016dissecting}. The choice of the datasets as well as the pre-processing procedures follow previous works \citep{rukat2017bayesian,wan2020data}. In movie lens data, we have 943 users that rated/not rated 1682 films. In head and neck, and melanoma data, we have 5902 cells that express/not express 7954 genes, and 4486 cells that express/not express 8210 genes, respectively. For each dataset, we first identify the number of patterns \(k\in\{2,\dots,20\}\) through cross validation, which yields 5 patterns in movie lens, 3 patterns in melanoma and 6 patterns in head and neck. BABF is then applied to retrieve \(\hat{X},\hat{Y}\), \(\hat{\bm{\mu}}\) and \(\hat{\bm{\nu}}\) following the specific pattern number for each dataset. We mainly focus on addressing two questions: 1. Would the consideration of individual bias benefit our interpretation of real world data? 2. Does the inferred bias carry any practical meaning?





\subsubsection{Data interpretation}

Since the underlying true patterns of real world data are not accessible, instead of comparing the decomposed pattern \(\hat{Z}\) with the input matrix \(A\), where \(A\) is constituted of not only the true pattern matrix \(Z\) but also likely the noise matrix \(E\) and the bias matrix \(B\), we use a likelihood metric. Specifically, we evaluate the goodness of model fitting as the overall likelihood, where a larger likelihood indicates a better fit to the data. We compare the likelihood of BABF with that of the probabilistic BMF method MP. Similar to \citet{ravanbakhsh2016boolean,rukat2017bayesian}, we investigate the methods' performance by only keeping a certain percentage of the observations, called the observation level, while masking the rest of the observations. At every observation level, we replicate the analysis five times and report the mean log-likelihood value. We report the likelihood results in Figure~\ref{fig:llh}A,B,C. On all three datasets, BABF showed higher overall likelihood compared with MP, which suggests that the individual bias assumption is more realistic for real-world data: movie viewers or cells could be vastly different from each other even in the same pattern group, and such bias is independent of the latent pattern. This advocates the necessity of considering individual bias in the BMF problem.

To further test the interpretability of the patterns, we examined how the patterns coincide with cell type labels in the two expression datasets, and the movie genres in the movie lens dataset, using the adjusted Rand index, where a higher value corresponds to a greater similarity \citep{rand1971objective}. Figure~\ref{fig:llh}D shows the performance of BABF and MP on the three datasets. Though both BABF and MP perform poorly on movie lens data, the decomposition from BABF showed higher similarity with the given labels in both biological datasets, which partially indicates a better decomposition by BABF compared with MP.

\subsubsection{Practical meaning of inferred bias}

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{real_bias.png}
    \caption{Interpretation analysis on inferred bias}
    \label{fig:bias}
\end{figure}

The individual bias assumption allows BABF to outperform or have comparable performance with the existing BMF methods, whether such bias is present or not. Here, we want to understand whether the inferred bias could reflect certain practical meaning. Inspired by \citet{wan2020denoising}, in movie lens data, we explore the relationship between the inferred bias of individual users and their taste in movie types. In our hypothesis, if a user only focuses on certain genres of movie, then their behavior could be mainly explained by the pattern information \(Z\), with less effect from \(B\). Here we design the \emph{focus index} to quantify such an effect. Specifically, if a user watched \(a\) movies in \(c\) categories, with \(b_i\) movies in the \(i\)-th category, i.e., \(a=b_1+\dots+b_c\), the focus index of this user is calculated as \(\mathit{focus\_index}:=\sum_{i=1}^c(\frac{b_i}{a})^2\). As anticipated, the inferred bias is negatively correlated with the focus index (Figure~\ref{fig:bias}A, \(\mathit{corr}=-0.19,\;p=3.67\times 10^{-6}\)). This significant negative correlation suggests that the inferred bias partially reflects the taste of the movie viewers.

In the case of gene expression, we focus on two groups of genes, housekeeping genes and non-housekeeping genes \citep{eisenberg2013human}. As the name suggests, housekeeping genes maintain the basic activities of the cells, so that each cell, regardless of its cell type, will express these genes. On the other hand, non-housekeeping genes are the ones that reflect cell-type specificity. For example, T cells will express T cell marker genes like CD3D and CD3E \citep{call2002organizing,wan2019ltmg}. Figure~\ref{fig:bias}B,C show the density plots of the inferred bias on housekeeping and non-housekeeping genes. As expected, housekeeping genes have a much bigger effect from bias, as their expression behavior is not related to any pattern. On the other hand, since the non-housekeeping genes reflect the specificity of the cell, their behavior is largely covered by the latent pattern, such that we observe a small bias in \(\hat{\bm{\mu}}\) on both datasets.



\section{Conclusion}
In this paper, we propose a bias aware model, BABF, which is the first algorithm to derive Boolean matrix decomposition in the presence of individual object- and feature-wise bias. Compared with other methods, BABF is a highly efficient approach, which not only results in good approximation of the true binary pattern with low reconstruction error, but also infers individual bias with high consistency with ground truth. The bias inference from BABF could lead to interesting interpretations depending on different data scenarios.


\section{Acknowledgment}
The work is supported by NSF DBI IIBR 2047631, NSF IIS 2145314, American Cancer Society RSG-22-062-01-MM, NCI 5P30CA082709-22, NIA 1P30AG072976-01, and NIH NIGMS 1R01GM131399.



\bibliography{uai2022-template}

%\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
%\providecommand{\upGamma}{\Gamma}
%\providecommand{\uppi}{\pi}
%\newpage
%\textbf{Appendix}
%\section{Message update}

%Despite the dense connections in the factor graph, max-sum belief propagation achieved admirable performance in approximating the MAP of Boolean matrix factorization \citep{ravanbakhsh2016boolean}. Here we also utilize this strategy, which not only derives the MAP of the matrix decomposition \(X\) and \(Y\), but also infers the background row- and column-wise bias \(\bm{\mu},\bm{\nu}\). Though the information of \(\bm{\mu},\bm{\nu}\) and \(X,\,Y\) communicates through the likelihood factor \(g\) and the auxiliary variable \(W\), their independence of each other results in disconnected message updates between \(\bm{\mu},\bm{\nu}\) and \(X,\,Y\). Conveniently, \(\{X,\,Y,\,W\}\) and \(\{\bm{\mu},\bm{\nu},\,W\}\) can be considered as two separate systems. In this paper we focus on the message update of \(\{\bm{\mu},\bm{\nu},\,W\}\), and adopt the algorithm in \citet{ravanbakhsh2016boolean} for \(\{X,\,Y,\,W\}\).

%\subsubsection{update X,Y,W}

%\textbf{Variables to factor message.}

%Conveniently, all the variables in \(\{X,Y,W\}\) are binary variables (\(X_{il},\,Y_{lj},\,W_{ijl}\in \{0,1\}\)). Following the notation in \citet{ravanbakhsh2016boolean}, we denote the message between factors and variables as \(\mathbf{m}\) (e.g., \(\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(X_{ij}):\{0,1\}\rightarrow \mathcal{R}\)). Max-sum BP is utilized to calculate the outgoing message, while consideration all incoming messages from neighbor factors, despite the receiving one, e.g.,
%\[\begin{split}
%  \mathbf{m}_{X_{il}\rightarrow f_{ijl}}(X_{ij})^{t+1}=&\mathbf{m}_{h_{il}\rightarrow X_{ill}}(X_{il})^t+\\&
%  \sum_{j'\neq j}\mathbf{m}_{f_{ij'l}\rightarrow X_{il}}(X_{il})^t\\& 
%\end{split}
%\]

%Our objective is to achieve the maximum likelihood, which align with the difference between the message of \(\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(X_{ij}=1)\) and \(\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(X_{ij}=0)\), i.e.,
%\[\hat{\Phi}=\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(1)-\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(0)\]
%In the case of individual variable \(X_{il}\) to the factor \(f_{ijl}\)
%\[\begin{split}
%\hat{\Phi}_{ijl}^{t+1}&=(\mathbf{m}_{h_{il}\rightarrow %X_{ill}}(1)^t+\sum_{j'\neq j}\mathbf{m}_{f_{ij'l}\rightarrow X_{il}}(1)^t)\\&
%-(\mathbf{m}_{h_{il}\rightarrow X_{ill}}(0)^t+\sum_{j'\neq j}\mathbf{m}_{f_{ij'l}\rightarrow X_{il}}(0)^t)\\&
%=log(\frac{p(X_{il}=1)}{p(X_{il}=0)})+\sum_{j'\neq j}\Phi_{ij'l}^t
%\end{split}\]
%Similarly, the message \(\hat{\Psi}\) can be derived as
%\[\hat{\Psi}_{ijl}=\log(\frac{p(Y_{lj}=1)}{p(Y_{lj}=0)})+\sum_{i'\neq i}\Psi^t_{i'jl}
%\]

%For \(W\), since each variable \(W_{ijl}\) has exact two factor neighbors \(g_{ij},\,f_{ijl}\), the message from \(W_{ijk}\) to either factors is the message from the other factor, i.e.,
%\[\textbf{m}_{W_{ijl}\rightarrow g_{ij}}(W_{ijl})=\textbf{m}_{f_{ijl}\rightarrow W_{ijl}}(W_{ijl})\]
%\[\textbf{m}_{ g_{ij} \rightarrow W_{ijl}}(W_{ijl})=\textbf{m}_{ W_{ijl}\rightarrow  f_{ijl} }(W_{ijl})\]
%We will discuss in detail of the message involve factor \(g\) in next section.

%\textbf{factor to variable message}

%For factor \(h\), it only connect to the single variable \(X_{il}\) or \(Y_{lj}\), which works as prior knowledge for the sparsity of \(X\) and \(Y\), where their information is passed through \[h_{il}(X_{il}=1)-h_{il}(X_{il}=0)=log(\frac{p(X_{il}=1)}{p(X_{il}=0)})\]
%\[h_{lj}(Y_{lj}=1)-h_{lj}(Y_{lj}=0)=log(\frac{p(Y_{lj}=1)}{p(Y_{lj}=0)})\]

%Factor \(f\) links \(X,Y\) with the auxiliary variable \(W\), that ensures \(W_{ijl}=X_{il}\wedge Y_{lj}\), i.e.,
%\[f(X_{il},Y_{lj},W_{ijl})=log(\mathcal{I}(W_{ijl}=X_{il}\wedge Y_{lj}))\]
%Notably, \(f(X_{il},Y_{lj},W_{ijl})\rightarrow -\infty\) if \(W_{ijl}\neq X_{il}\wedge Y_{lj}\). Such that it restrict the message scenarios when passing the information from \(f\) to \(X,Y\). Here, we use \(\mathbf{m}_{f_{ijl}\rightarrow X_{il}(X_{il})}\) as example, where \(\mathbf{m}_{f_{ijl}\rightarrow Y_{lj}(Y_{lj})}\) can be similarly derived. For \(X_{il}\) to equal to 1, if \(Y_{lj}=1\), restricted by \(f\), \(W_{ijl}=1\), and if \(Y_{lj}=0\), \(W_{ijl}=0\), thus,
%\[\begin{split}
%   &\mathbf{m}_{f_{ijl}\rightarrow X_{il}}(1)^{t+1}=max(\mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(1)^t+\\& \mathbf{m}_{W_{ijl}\rightarrow f_{ijl}}(1)^t,\quad \mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(0)^t+\mathbf{m}_{W_{ijl}\rightarrow f_{ijl}}(0)^t)\\& 
%\end{split}
%\]
%While if \(X_{il}=0\), \(W_{ijl}=0\) regardless the value of \(Y_{lj}\), i.e.,
%\[\begin{split}
%   &\mathbf{m}_{f_{ijl}\rightarrow X_{il}}(0)^{t+1}=max(\mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(1)^t+\\& \mathbf{m}_{W_{ijl}\rightarrow f_{ijl}}(0)^t,\quad \mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(0)^t+\mathbf{m}_{W_{ijl}\rightarrow f_{ijl}}(0)^t)\\& 
%\end{split}
%\]
%Since \(\hat{\Psi}_{ijl}=\mathbf{m}_{Y_{lj}\rightarrow f_{ijl}}(1)-\mathbf{m}_{Y_{lj}\rightarrow f_{ijl}}(0)\), and \(\Gamma_{ijl}=\mathbf{m}_{W_{ijl}\rightarrow f_{ijl}}(1)-\mathbf{m}_{W_{ijl}\rightarrow f_{ijl}}(0)\) the message from \(f\) to \(X\) can be derived as
%\[\begin{split}
%\Phi_{ijl}&=\mathbf{m}_{f_{ijl}\rightarrow X_{il}}(1)-\mathbf{m}_{f_{ijl}\rightarrow X_{il}}(0)\\&
%=max(\Gamma_{ijl}+\hat{\Psi}_{ijl},0)-max(\hat{\Psi}_{ijl},0)
%\end{split}\]
%Similarly, we have
%\[\begin{split}
%\Psi_{ijl}&=\mathbf{m}_{f_{ijl}\rightarrow Y_{il}}(1)-\mathbf{m}_{f_{ijl}\rightarrow Y_{il}}(0)\\&
%=max(\Gamma_{ijl}+\hat{\Phi}_{ijl},0)-max(\hat{\Phi}_{ijl},0)
%\end{split}\]

%Following the same strategy, while considering the message from factor \(f\) to variable \(W\), if \(W_{ijl}=1\), \(X_{il}=Y_{lj}=1\), whereas if \(W_{ijl}=0\), either \(X_{il}\) or \(Y_{lj}\) should equal to zero, i.e.,
%\[\mathbf{m}_{f_{ijl}\rightarrow W_{ijl}}(1)^{t+1}=\mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(1)^t+\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(1)^t
%\]
%\[\begin{split}
% &\mathbf{m}_{f_{ijl}\rightarrow W_{ijl}}(0)^{t+1}=max(\mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(1)^t+\\&
% \mathbf{m}_{X_{il}\rightarrow f_{ijl}}(0)^t,\mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(0)^t+\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(1)^t,\\&\mathbf{m}_{Y_{lj}\rightarrow f_{ill}}(0)^t+\mathbf{m}_{X_{il}\rightarrow f_{ijl}}(0)^t)  
%\end{split}
%\]
%Such that
%\[\begin{split}
% &\hat{\Gamma}_{ijk}=\mathbf{m}_{f_{ijl}\rightarrow W_{ijl}}(1)  -\mathbf{m}_{f_{ijl}\rightarrow W_{ijl}}(0)\\&
% =min(\hat{\Phi}_{ijl}+\hat{\Psi}_{ijl},\hat{\Phi}_{ijl},\hat{\Psi}_{ijl})
%\end{split}\]

%\subsection{update \(\mu,\nu,W\)}

%In the previous section, we have derived the messages passing between the \(X,\,Y\) and \(W\). In this section, we derive the message passing between \(\mathbf{\mu},\,\mathbf{v}\) and \(W\), where they all related to the likelihood factor \(g\). Also, different with binary variable \(W\), \(\bm{\mu}\) and \(\bm{\nu}\) are Bernoulli variable, that the simplified singleton message does not applied for their message update. We first reinstate the log likelihood function of each element (\(A_{ij}\)) that represent the factor \(g\). 
%\[\begin{split}
%&p(A_{ij}=1|Z_{ij}=0)=1-(1-p_f)(1-\bm{\mu}_i\bm{\nu}_j)\\&
%p(A_{ij}=0|Z_{ij}=0)=(1-p_f)(1-\bm{\mu}_i\bm{\nu}_j)\\&
%p(A_{ij}=1|Z_{ij}=1)=1-p_f(1-\bm{\mu}_i\bm{\nu}_j)\\&
%p(A_{ij}=0|Z_{ij}=1)=p_f(1-\bm{\mu}_i\bm{\nu}_j)
%\end{split}\]

%In the case of Bernoulli variables \(\bm{\mu}_i\), the incoming message from factor \(g\) to \(\bm{\mu}_i\) is certainly the likelihood information, 
%\[\Omega_i=log(\prod_{j=1}^np(A_{ij}))\]
%while the message from \(\bm{\mu}_i\) to \(g\) would be the MAP of the posterior distribution, i.e., 
%\[\begin{split}
%  \hat{\Omega}_i&=\argmax_{\bm{\mu}_i} log(\prod_{j=1}^np(A_{ij})p(\bm{\mu}_i))\\&
%  =\argmax_{\bm{\mu}_i}(\sum_{j=1}^nlog(p(A_{ij}))+b_i(\bm{\mu_i}) ) 
%\end{split}
%\]

%Given no knowledge on the bias before hand, here we impose a uniform prior on the Bernoulli variable, such that \(b_i(\bm{\mu}_i)=0\). In addition, the log posterior is related to 4 situations, 
%\[\begin{split}
% \Omega_i&=\sum_{j=1,A_{ij}=1, Z_{ij}=0}^n log(1-(1-p_f)(1-\bm{\mu}_i\bm{\nu}_j))\\&
%+\sum_{j=1,A_{ij}=1, Z_{ij}=1}^n log(1-p_f(1-\bm{\mu}_i\bm{\nu}_j))\\&
%+\sum_{j=1,A_{ij}=0, Z_{ij}=0}^n log((1-p_f)(1-\bm{\mu}_i\bm{\nu}_j))\\&
%+\sum_{j=1,A_{ij}=0, Z_{ij}=1}^n log(p_f(1-\bm{\mu}_i\bm{\nu}_j)   
%\end{split}
%\]

%Here we assume \(P_f\rightarrow 0\), such that \(p_f(1-\bm{\mu}_i\bm{v}_i)\rightarrow 0\), and both \(\sum_{j=1,A_{ij}=1, Z_{ij}=1}^n log(1-p_f(1-\bm{\mu}_i\bm{\nu}_j))\) and \(\sum_{j=1,A_{ij}=0, Z_{ij}=1}^n log(p_f(1-\bm{\mu}_i\bm{\nu}_j)\) can be approximate by a constant that does not contribute to the inference of \(\hat{\Omega}_i\). Also \((1-p_f)(1-\bm{\mu}_i\bm{\nu}_j)\) can be approximated by \((1-\bm{\mu}_i\bm{\nu}_j)\). It also has practical meanings, that for the inference of background bias, we only consider the values that are not covered by the latent pattern \(X,Y\). While our objective is to infer \(\bm{\mu}_i\) that better reflect the background information of \(A_{i:}\). However, it is still non-trivial to derive \(\hat{\Omega}_i\) as every observation is related to a different \(\bm{v}_j\). Instead of deriving exact MAP of likelihood, we treat this as an optimization problem, where we could utilize conventional loss function to achieve the same objective that optimize the difference between \(\bm{\mu}_i\) with \(A_{i:}\). Here, we apply a modified mean square loss, i.e., 
%\[\Omega=\sum_{j=1,Z_{ij}=0}^n\bm{v}_j(A_{ij}-\bm{\mu}_i)^2\] 
%The most important benefit of this modified loss is that it ensures the probability of each \(\bm{\mu}_i\) would be from \([0,1]\) and still consider the impact from \(\bm{v}_j\) for each observations. Conveniently, \(\hat{\Omega}_i\) is inferred from the derivative of \(\Omega\), i.e.,
%\[\hat{\Omega}=\argmin_{\bm{\mu}_i} \Omega=\frac{\sum_{j=1,Z_{ij}=0}^n A_{ij}v_j}{\sum_{j=1,Z_{ij}=0}^n v_j}\]
%Similarly, we have
%\[\Theta_j=\sum_{i=1,Z_{ij}=0}^m \bm{\mu}_i(A_{ij}-\bm{v}_j)^2\]
%\[\hat{\Theta}_j=\frac{\sum_{i=1,Z_{ij}=0}^m A_{ij}\bm{\mu}_i}{\sum_{i=1,Z_{ij}=0}^n \bm{\mu}_i}\]

%Now we have derived all messages in the likelihood despite \(\Gamma_{ijl}: \textbf{m}_{g_{ij}\rightarrow W_{ijl}}\) that passed the information from the likelihood factor to each of auxiliary variable \(W_{ijl}\). Overall, the message take the form of
%\[\begin{split}
%  \textbf{m}_{g_{ij}\rightarrow W_{ijl}}(W_{ijl})^{t+1}&=\max_{W_{ijl'},l'\neq l}(g_{ij}(Z_{ij},\bm{\mu}_{i},\bm{v}_j)\\&
%  +\sum_{l'\neq l}{\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(W_{ijl'})^t})  
%\end{split}
%\]

%When updating \(W_{ijl}\), we consider two scenarios: 1. \(Z_{ij}=\vee_{l=1}^k W_{ijl}=1\) with likelihood factor \(p(A_{ij}|Z_{ij}=1)\) and 2. \(\vee_{l=1}^k W_{ijl}=0\), \(p(A_{ij}|Z_{ij}=0)\).

%\(W_{ijl}=1\) falls into the situation of scenarios 1, that no matter the value of \(W_{ijl'}\), \(Z_{ij}=\vee W_{ijl}=1\). The message for \(W_{ijl}=1\) can be derived as
%\[\begin{split}
% \textbf{m}_{g_{ij}\rightarrow W_{ijl}}(1)&=\max_{W_{ijl'},l'\neq l}(g_{ij}(Z_{ij},\bm{\mu}_{i},\bm{v}_j)\\&
%  +\sum_{l'\neq l}{\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(W_{ijl'})^t})\\&
%  =log(p_(A_{ij}|1))\\&
%  +\sum_{l'\neq l} max(\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(1),\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0))
%\end{split}
%\]

%\(W_{ijl}=0\) could involve both cases. If \(Z_{ij}=0\), all \(W_{ijl'}=0\), i.e.,
%\[\textbf{m}_{g_{ij}\rightarrow W_{ijl}}(0)
%  =log(p_(A_{ij}|0))
%  +\sum_{l'\neq l} \textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0)
%\]
%If \(Z_{ij}=1\), at least one of \(W_{ijl'}\) equal to zero. To achieve the maximum likelihood, the \(W_{ijl'}\) with the maximum likelihood difference on 0 or 1 should be set as 1, we denote it as \(W_{ijl^*}\), where \(l^*=\argmax_{l'\neq l}(\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(1)-\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0))\), such that we have
%\[\begin{split}
% &\textbf{m}_{g_{ij}\rightarrow W_{ijl}}(0)
%  =log(p_(A_{ij}|1))\\&
%  +\sum_{l'\neq l} max(\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(1),\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0))\\&
%  - \textbf{m}_{W_{ijl*}\rightarrow g_{ij}}(0)   
%\end{split}
%\]

%Taken together,
%\[\begin{split}
% &\textbf{m}_{g_{ij}\rightarrow W_{ijl}}(0)
%  =max(log(p_(A_{ij}|0))\\&
%  +\sum_{l'\neq l} \textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0),log(p_(A_{ij}|1))\\&
%  +\sum_{l'\neq l} max(\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(1),\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0))\\&
%  - \textbf{m}_{W_{ijl*}\rightarrow g_{ij}}(0))   
%\end{split}
%\]
%Therefore
%\[\begin{split}
%  &\Gamma_{ijl}=\textbf{m}_{g_{ij}\rightarrow W_{ijl}}(1)-\textbf{m}_{g_{ij}\rightarrow W_{ijl}}(0)\\&
%=log(p_(A_{ij}|1))\\&
%  +\sum_{l'\neq l} max(\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(1),\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0))\\&
%  -max(log(p_(A_{ij}|0))\\&
%  +\sum_{l'\neq l} \textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0),log(p_(A_{ij}|1))\\&
%  +\sum_{l'\neq l} max(\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(1),\textbf{m}_{W_{ijl'}\rightarrow g_{ij}}(0))\\&
%  - \textbf{m}_{W_{ijl*}\rightarrow g_{ij}}(0))\\&
%  =min(log(\frac{p(A_{ij}|1)}{p(A_{ij}|0)})+\sum_{l'\neq l}max(0,\hat{\Gamma}_{ijl'}^t),\\&
%  max(0,-max_{l'\neq l}\hat{\Gamma}_{ijl'}^t))
%\end{split}\]

%Noted, \(p(A_{ij}=1|1)=p_Z+p_{B_{ij}}(1-p_Z)\), \(p(A_{ij}=1|0)=p_{B_{ij}}\), \(p(A_{ij}=0|1)=(1-p_{B_{ij}})(1-p_Z)\) and \(p(A_{ij}=0|0)=1-p_{B_{ij}}\).


\end{document}
