\documentclass[accepted]{uai2025} % for initial submission
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{comment}
\usepackage{amsmath}   
\usepackage{amssymb}   
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage{algpseudocode} 
\usepackage{graphicx}


\usepackage{epstopdf}
\usepackage{epsfig}

\usepackage{subcaption}
\usepackage{amsfonts}
\usepackage{authblk}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\renewcommand{\cite}{\citep}

\title{Optimal Submanifold Structure in Log-linear Models}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%

% Add authors
%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2025 paper}%{Jane~J.~von~O'L\'opez}{}}
\author[1,2]{Derun Zhou}
\author[1,2]{Mahito Sugiyama}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    National Institute of Informatics, Tokyo, Japan
}
\affil[2]{%
    The Graduate University for Advanced Studies, SOKENDAI
}
%\affil[3]{%
%    Another Affiliation\\
%    Address\\
%    …
%  }
  
\begin{document}
\maketitle

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem*{reptheorem}{Theorem}
\newtheorem*{replemma}{Lemma}
\newtheorem*{repcorollary}{Corollary}
\begin{abstract}
  % The log-linear model has received significant theoretical attention in previous decades and remains the fundamental tool for learning probability distributions and non-negative tensor decomposition over discrete variables.
  % Many-body approximation directly chooses part of many-body natural parameters to be 0 as fixed hyperparameters. However, hyperparameters are not 0 has never been discussed before. In this paper, we prove the asymptotic characteristic of the fixed many-body natural parameters from the view of information geometry. Specifically, the optimal fixed hyperparameters will converge to 0 as the size of the input non-negative tensor(or empirical distribution) increases. Moreover, our conclusions are also validated on both synthetic and real-world datasets. 
  In the modeling of discrete distributions using log-linear models, the model selection process is equivalent to imposing zero-value constraints on a subset of natural parameters, which is an established concept in information geometry.
  This \emph{zero-value constraint} has been implicitly employed, from classic Boltzmann machines to recent many-body approximations of tensors.
  However, in theory, any constant value other than zero can be used for these constraints, leading to different \emph{submanifolds} onto which the empirical distribution is projected, a possibility that has not been explored.
  Here, we investigate the asymptotic behavior of these constraint values from the perspective of information geometry.
  Specifically, we prove that the optimal value converges to zero as the size of the support of the empirical distribution increases, which corresponds to the size of the input tensors in the context of tensor decomposition.
  While our primary focus is on many-body approximation of tensors, it can serve as a basis for extending to a wide range of log-linear modeling applications.
\end{abstract}

\section{Introduction}\label{sec:intro}
Energy-based models are widely used in machine learning~\cite{lecun2006tutorial,jaynes1957information}. The exponential family, which is a class of energy-based models, covers a wide variety of classical distributions for continuous variables, such as Gaussian, exponential, and gamma distributions~\cite{mackay2003information}. Moreover, the \emph{log-linear model}, which also belongs to the exponential family, covers all positive probability distributions over a finite space~\cite{log-linear}. Recently, the log-linear model has been used to model distributions over partially ordered sets (posets), and its dually flat manifold structure has been analyzed from an information-geometric perspective~\cite{sugiyama17a}.

The log-linear model on a partial-order structure (LPS) provides an alternative approach to decomposition for positive tensors, which avoids the optimization difficulties associated with the common low-rank based decompositions by replacing the squared error loss with the Kullback-Leibler (KL) divergence~\cite{NEURIPS2018_56a3107c}. Each positive tensor is treated as a discrete distribution with a partial-order structure. It is parameterized by the natural parameters of the exponential family, and the optimization is realized as a projection onto a model submanifold constrained by a subset of these natural parameters. 
% theory is well-explored
% for positive tensors within the log-linear model, where several established constructions for flat base submanifolds have proven powerful in
% capturing the non-trivial structure of positive tensors after the projection~\cite{sugiyama2018legendre}.
Such projections can capture non-trivial structure of positive tensors; one example is the \emph{many-body tensor approximation}, which captures a hierarchy of mode interactions~\cite{NEURIPS2023_ea94957d}. The selection of mode interactions in many-body approximation can be regarded as feature selection in distribution learning.

In this paper, we focus on many-body approximation, as it is not only a key application of LPS but also includes a wide variety of graphical models such as standard and high-order Boltzmann machines~\cite{ackley1985learning,sejnowski1986higher}.
In many-body approximation, specifying the model submanifold, which can be viewed as a model selection problem or hyperparameter tuning, typically involves imposing the zero-value constraint on a subset of the natural parameters.
In Boltzmann machines, this process corresponds to selecting a graphical model, where a zero-value constraint is implicitly applied. Specifically, removing an edge between nodes, each of which represents a random variable, effectively sets the corresponding natural parameter to zero.
However, from an information geometric perspective, these constraints could, in principle, take any constant value other than zero.
Despite this flexibility, this possibility remains largely unexplored.

% It is well-known that restricted Boltzmann machines (RBMs)~\cite{ackley1985learning} and high-order Boltzmann machines (HBMs)~\cite{min2014interpretable} can learn real-world data distributions. However, they still apply only to binary variables and there are still many challenges that remain even in medium-scale datasets. LPS can not only learn more complex distributions based on tabular data directly~\cite{enouen2024complete} but also provide theoretical analysis for RBM and HBM as the fundamental distribution learning model by bias-variance decomposition~\cite{luo2019bias}.

% The popular data augmentation method such as the auto-encoder model encodes original data into a compact latent representation and then decodes it back, with both processes usually handled by black-box neural networks~\cite{kingma2013auto}. The new data augmentation algorithm based on LPS regarded the flat low-dimensional submanifold as the latent space. Through the forward and backward projection to the submanifold, an interpretable, fully white box, and energy-minimizing
% augmentation algorithm can be proposed~\cite{hu2024pseudo}.

We provide a formal description and a simple example below.
In the modeling based on the LPS, including many-body approximation and Boltzmann machines, we first select the model submanifold, also known as an $e$-flat submanifold, described as 
$
\mathcal{S}^{0}_\mathcal{B}=\left\{\mathcal{Q} \in \mathcal{S} \mid \theta_v=0 \text { for all } v \in \Omega_d^{+} \backslash \mathcal{B}\right\}$,
where $\mathcal{S}$ denotes the set of distributions, and $\theta_v$ represents the natural parameters of the LPS (exponential family). We define $\Omega_d = [I_1] \times \cdots \times [I_d]$, where $[I_k] = \{1, 2, \ldots, I_k\}$. To exclude the normalization constant, we often work in the reduced space $\Omega_d^{+} = \Omega_d \setminus \{(1, 1, \ldots, 1)\}$, and consider a subset $\mathcal{B} \subseteq \Omega_d^{+}$. The parameters in $\mathcal{B}$ are optimized by minimizing the KL divergence.

Here it is clear from the equation that this model submanifold allows not only $\mathcal{S}^{0}_{\mathcal{B}}$ with the ``$\theta_v = 0$'' constraint but also $\mathcal{S}^{c}_{\mathcal{B}}$ with the ``$\theta_v = c$'' constraint for any constant value $c$, which may help to decrease the KL error further.
% There have been few discussions on the submanifold selection issue so far, which means $\theta_v=c$ and $c$ are not restricted to 0. But it is a very important problem to be discussed, projection to submanifold $\mathcal{S}^{c}_{\mathcal{B}}$ may help to decrease the KL divergence error and the root mean squared error (RMSE).
For example, let us consider decomposing the following toy matrix:
$$\left[\begin{array}{ccccccc}833 & 1 & 2 & 4 & 7 & 4 & 8 \\ 430 & 33 & 5 & 1 & 711 & 112 & 4 \\ 39 & 6 & 29 & 2 & 9 & 3 & 121 \\ 2 & 2 & 8 & 6 & 311 & 10 & 122\end{array}\right].$$  We choose the one-body natural parameters as the decomposition basis, which means that $\theta_{1j}$ and $\theta_{i1}$ are selected as the decomposition basis. If we choose the submanifold as $
\mathcal{S}^{0}_{\mathcal{B}_1}$, where $\mathcal{B}_1$ is the index set of one-body natural parameters, the KL error is 0.46 and the RMSE is 0.56, and the projection result is a rank-1 matrix as follows: 
\begin{align*}
&\left[
\begin{array}{c}
396.5 \\
598.2 \\
96.5 \\
212.8
\end{array}
\right]
% \left[
% \begin{array}{ccccccc}
\begin{bmatrix}
1.0 & 0.03 & 0.03 & 0.01 & 0.8 & 0.1 & 0.2
\end{bmatrix}\\
% \end{array}
% \right]\\
% $$
% It can also be reconstructed as:
% $$
= &\begin{bmatrix}
396.5 & 11.9 & 11.9 & 4.0 & 317.2 & 39.7 & 79.3 \\
598.2 & 17.95 & 17.95 & 5.98 & 478.6 & 59.8 & 119.6 \\
96.5 & 2.9 & 2.9 & 0.97 & 77.2 & 9.65 & 19.3 \\
212.8 & 6.38 & 6.38 & 2.13 & 170.2 & 21.3 & 42.56    
\end{bmatrix}.
\end{align*}
In contrast, if we choose $
\mathcal{S}^{0.54}_{\mathcal{B}_1}$ as the model submanifold, the resulting KL error is 0.19 and the RMSE is only 0.24, both of which are less than half of the corresponding values for $\mathcal{S}^{0}_{\mathcal{B}_1}$.
The reconstructed matrix is as follows:
% The definition of $\mathcal{S}^{c}_{\mathcal{B}_1}$ can be referred in section ~\ref{sec:math}.
$$
\begin{bmatrix}
731.1 & 17.6 & 12.5 & 2.2 & 88.7 & 4.3 & 2.55 \\
555.05 & 22.96 & 27.99 & 8.49 & 583.33 & 48.7 & 49.49 \\
14.95 & 1.06 & 2.22 & 1.16 & 136.1 & 19.5 & 34 \\
2.9 & 0.35 & 1.27 & 1.14 & 229.9 & 56.5 & 169
\end{bmatrix}.
$$
Please note that it is no longer rank-1, while the number of free parameters is the same as in the case of $\mathcal{S}^{0}_{\mathcal{B}_1}$.
This example highlights the necessity of studying the submanifold selection problem. For a detailed explanation of this example, please refer to Appendix~\ref{td}.

To summarize, our contribution is threefold:
\begin{itemize}[leftmargin=2em]
\item We theoretically prove that, for any order many-body approximation, the optimal $e$-flat model submanifold converges to $\mathcal{S}^{0}_\mathcal{B}$ as the tensor size (the number of entries of a tensor) increases.
\item We present an optimal $e$-flat submanifold searching algorithm. This algorithm is formulated as a convex optimization, hence it always finds the globally optimal solution of a KL divergence minimization problem with linear constraint conditions. This algorithm can be used to improve the performance of small or medium-scale datasets for tensor decomposition or distribution learning for tabular data. 
\item  We provide an empirical evaluation on synthetic and real-world datasets and show the consistency between theory and experimental results.
\end{itemize}



\begin{comment}
UAI 2025 papers have to be prepared using \LaTeX.
To start writing your paper, copy \texttt{uai2025-template.tex} and replace title, authorship, and content with your own.

The UAI 2025 paper style is based on a custom \textsf{uai2025} class.
The class file sets the page geometry and visual style.\footnote{%
    The class uses the packages \textsf{adjustbox}, \textsf{environ}, \textsf{letltxmacro}, \textsf{geometry}, \textsf{footmisc}, \textsf{caption}, \textsf{textcase}, \textsf{titlesec}, \textsf{titling}, \textsf{authblk}, \textsf{enumitem}, \textsf{microtype}, \textsf{lastpage}, and \textsf{kvoptions}.
}
The class file also loads basic text fonts.\footnote{%
    Fonts loaded are \textsf{times} (roman), \textsf{helvet} (sanserif), \textsf{courier} (fixed-width), and \textsf{textcomp} (common symbols).
}
\emph{You may not modify the geometry or style in any way, for example, to squeeze out a little bit of extra space.}
(Also do not use \verb|\vspace| for this.)
Feel free to use convenience functionality of loaded packages such as \textsf{enumitem}.
The class enables hyperlinking by loading the \textsf{hyperref} package.

You are free to load any packages available in \TeX{Live}~2020 that are compatible with the UAI class.\footnote{In case this template or your submission does not compile, always first make sure your \TeX\ installation is up-to-date.}
(Mik\TeX{} and Mac\TeX{} generally contain the same packages.)
Do not load conflicting packages—you will get an error message—, as this complicates creating the proceedings.
Please avoid using obsolete commands, such as \verb|\rm|, and obsolete packages, such as \textsf{epsfig}.\footnote{%
    See \url{https://ctan.org/pkg/l2tabu}.
}

\swap[ ]{in the header of your source file.}{Feel free to include your own macros}
\end{comment}
\section{Preliminaries}
\subsection{Formulation}
\label{prelimiaries}
We start with a positive $d^{\text{th}}$-order input tensor $\mathcal{X} \in \mathbb{R}_{> 0}^{I_1 \times \cdots \times I_d}$ and normalize it as $
\hat{\mathcal{P}}_{i_1, \dots, i_d} = \mathcal{X}_{i_1, \dots, i_d} / \sum_{j_1=1}^{I_1} \ldots \sum_{j_d=1}^{I_d} \mathcal{X}_{j_1, \ldots, j_d}.$
For the remainder of this paper, we consistently work on the normalized tensor $\hat{\mathcal{P}}$ as the input tensor. 
We can treat any normalized tensor as a discrete distribution (or a probabilistic vector) with the sample space \(\Omega_d = \left[I_1\right] \times \cdots \times \left[I_d\right]\), where \(\left[I_k\right] = \{1, 2, \ldots, I_k\}\).
% and a parameter basis \(\mathcal{B} \subset \Omega_d \backslash \{(1, 1, \ldots, 1)\}\).
% Moreover, we define \(\Omega_d^{+} = \Omega_d \backslash \{(1, 1, \ldots, 1)\}\).
Hence, it is exactly modeled by the \emph{log-linear model}:
\begin{equation}
\log \mathcal{P}_{i_1,\dots,i_d}=\sum_{i_1^{\prime}=1}^{i_1} \cdots \sum_{i_d^{\prime}=1}^{i_d} \theta_{i_1^{\prime}, \ldots, i_d^{\prime}}
\label{ept}
\end{equation}
for each \((i_1,\dots,i_d) \in \Omega_d\), where each $\theta_{i_1', \dots, i_d'} \in \mathbb{R}$ corresponds to a \emph{natural parameter}.
The normalization is imposed on $\theta_{\perp}$ with $\bot = (1, \dots, 1)$ as
\begin{equation}
\theta_{\perp}=-\log \left(\sum_{\left(i_1, \ldots, i_d\right) \in \Omega_d^{+}} \exp \left(\sum_{i_1^{\prime}=1}^{i_1} \cdots \sum_{i_d^{\prime}=1}^{i_d} \theta_{i_1^{\prime}, \ldots, i_d^{\prime}}\right)\right).
\end{equation}
For an example of the \emph{log-linear model}, please refer to Appendix~\ref{elog}. Thus we often work on the space \(\Omega_d^{+} = \Omega_d \backslash \{(1, 1, \ldots, 1)\}\) by excluding the normalization constant.
In addition to natural parameters, we also have another set of parameters called expectation parameters, denoted as a vector $(\eta)_{i_1, \ldots, i_d}$. Each value of the $\eta$-parameter vector is written as follows:
\begin{equation}
\eta_{i_1, \ldots, i_d}=\sum_{i_1^{\prime}=i_1}^{I_1} \cdots \sum_{i_d^{\prime}=i_d}^{I_d} \mathcal{P}_{i_1^{\prime}, \ldots, i_d^{\prime}},
\label{etatop}
\end{equation}
and uniquely identifies a normalized positive tensor $\mathcal{P}$ by the following equation:
\begin{equation}
\mathcal{P}_{i_1, \ldots, i_d}=\sum_{\left(i_1^{\prime}, \ldots, i_d^{\prime}\right) \in \Omega_d} \mu_{i_1, \ldots, i_d}^{i_1^{\prime}, \ldots, i_d^{\prime}} \eta_{i_1^{\prime}, \ldots, i_d^{\prime}},
\label{ptoeta}
\end{equation}
where $\mu$ is the M{\"o}bius function defined inductively as
\begin{equation}
\mu_{i_1, \ldots, i_d}^{i_1^{\prime}, \ldots, i_d^{\prime}}= \begin{cases}1 &  i_k=i_k^{\prime}, \forall  k \in[d], \\ -\prod_{k=1}^d \sum_{j_k=i_k}^{i_k^{\prime}-1} \mu_{i_1, \ldots, i_d}^{j_1, \ldots j_d} &  i_k<i_k^{\prime}, \forall  k \in[d], \\ 0 & \text{otherwise.}\end{cases}
\label{mobius}
\end{equation}
An example of Equation~\eqref{ptoeta} is presented in Appendix~\ref{efmf}. The normalization condition is realized as $\eta_{1, \ldots, 1}=1$.
Both $(\theta)_{i_1, \ldots, i_d}$ and $(\eta)_{i_1, \ldots, i_d}$ serve as coordinate systems for the set of distributions.
% Each distribution $\mathcal{P}$ can also be described as
% using the $\eta$-coordinate system. 




\subsection{Legendre decomposition}
We introduce the Legendre decomposition~\cite{NEURIPS2018_56a3107c}, which decomposes a given tensor via log-linear modeling introduced in the previous subsection.
Let $\mathcal{S}$ be the set of all normalized positive tensors.
When we have an index set $\mathcal{B} \subseteq \Omega_d^{+}$ as a \emph{decomposition basis},
% \[
% \mathcal{S}=\left\{\mathcal{P} \mid 0<\mathcal{P}_{i_1,\dots,i_d}<1 \text{ and } \sum_{i_1=1}^{I_1} \cdots\sum_{i_d=1}^{I_d}\mathcal{P}_{i_1, \ldots, i_d}=1 \right\}
% \]
the corresponding submanifold $\mathcal{S}^{0}_\mathcal{B}$ is given as
$$
\mathcal{S}^{0}_\mathcal{B}=\left\{Q \in \mathcal{S} \mid \theta_{i_1, \ldots, i_d}=0 \text { for all } \left(i_1, \ldots, i_d \right) \in \Omega_d^{+} \backslash \mathcal{B}\right\}.
$$
Legendre decomposition is formulated as optimization that finds $\mathcal{P}^{\mathcal{B},0}$ in the submanifold $\mathcal{S}^{0}_\mathcal{B}$ minimizing the following KL divergence:
$$
\mathcal{P}^{\mathcal{B},0} = \underset{\mathcal{R} \in \mathcal{S}^{0}_\mathcal{B}}{\operatorname{argmin}} \, D_{\mathrm{KL}}(\hat{\mathcal{P}}, \mathcal{R}),
$$
where the KL divergence from $\hat{\mathcal{P}} \in \mathcal{S}$ to $\mathcal{R} \in \mathcal{S}$ is given as
$$
D_{\mathrm{KL}}(\hat{\mathcal{P}}, \mathcal{R})=\sum_{i_1=1}^{I_1} \ldots \sum_{i_d=1}^{I_d} \hat{\mathcal{P}}_{i_1, \ldots, i_d} \log \frac{\hat{\mathcal{P}}_{i_1, \ldots, i_d}}{\mathcal{R}_{i_1, \ldots, i_d}}.
$$
It is known that the derivative of the KL divergence is
\begin{equation}
\frac{\partial}{\partial {\theta}_{i_1, \dots, i_d}} D_{\mathrm{KL}}(\mathcal{\hat{P}}, \mathcal{R})={\eta}_{i_1, \dots, i_d} - \hat{{\eta}}_{i_1, \dots, i_d}
\label{legd}
\end{equation}
for every $(i_1, \dots, i_d) \in \mathcal{B}$, where $(\eta)_{i_1, \ldots, i_d}$ and $(\hat\eta)_{i_1, \ldots, i_d}$ are the expectation parameters of $\mathcal{R}$ and $\mathcal{\hat{P}}$, respectively. 
This equation implies that the KL divergence is minimized if and only if $\eta_{i_1,\ldots,i_d}=\hat{\eta}_{i_1,\ldots,i_d}$ for all $(i_1, \dots, i_d) \in \mathcal{B}$.
In information geometry, this optimization problem can be regarded as the $m$-projection onto the $e$-flat submanifold $\mathcal{S}^{0}_\mathcal{B}$. The tensor $\mathcal{P}^{\mathcal{B},0}$, such that \( \mathcal{P}^{\mathcal{B},0} \in \mathcal{S}^{0}_\mathcal{B} \cap \mathcal{S}^{\mathcal{B}}_{\hat{\mathcal{P}}}\), always uniquely exists, where
\begin{equation}
\mathcal{S}^{\mathcal{B}}_{\hat{\mathcal{P}}} = \left\{ Q \in \mathcal{S} \mid \eta_{i_1,\dots,i_d} = \hat{\eta}_{i_1,\dots,i_d} \text{ for all } \left({i_1,\dots,i_d}\right) \in \mathcal{B} \right\}.
\label{datamanifold}
\end{equation}
 Moreover, $\mathcal{S}^{\mathcal{B}}_{\hat{\mathcal{P}}}$ is an $m$-flat submanifold since it imposes constraints on the $\eta$ coordinate. For the definitions of $m$-flat and $e$-flat submanifolds, as well as the concept of projection theory, please refer to Appendices~\ref{candg} and~\ref{flatandproject}.

\subsection{Many-body approximation}
% \subsubsection{Definition of many body parameters}
Many-body approximation is a special case of Legendre decomposition, which emphasizes the connection to the mode interactions of tensors by explicitly incorporating them in the modeling~\cite{NEURIPS2023_ea94957d}. 
% First, we review the concept of $h$-body parameters.
For each $\theta_{i_1, \ldots, i_d}$, if there are $h$ non-one indices, we call it an $h$-body parameter.
For example, if we consider a $4^{\text{th}}$-order input tensor, $\theta_{1,2,1,1}$ is a one-body parameter, $\theta_{4,3,1,1}$ is a two-body parameter, $\theta_{1,2,4,3}$ is a three-body parameter and $\theta_{5,2,4,3}$ is a four-body parameter. 

% \subsubsection{Many-body approximation}
The definition of many-body approximation can be summarized as follows:
For a given tensor $\hat{\mathcal{P}}$, its $h$-body approximation is the optimal solution $\mathcal{P}^{\mathcal{B}_h,0}$ such that
\[
\mathcal{P}^{\mathcal{B}_h,0}= \mathop{\mathrm{argmin}}_{\mathcal{R} \in \mathcal{S}^{0}_{\mathcal{B}_h}} D_{\mathrm{KL}}(\hat{\mathcal{P}}, \mathcal{R}),
\]
where the solution space $\mathcal{S}^{0}_{\mathcal{B}_h}$ is given as $\mathcal{S}^{0}_{\mathcal{B}_h}=\left\{\mathcal{Q} \in \mathcal{S} \mid \theta_{i_1, \ldots, i_d}=0\right.$ if $\theta_{i_1, \ldots, i_d}$ is an $n$-body parameter of $\left.\mathcal{Q}\right.$ with $\left. n > h\right\}$. Therefore, the decomposition basis \( \mathcal{B}_h \) is the index set composed of all \( i \)-body parameters with \( 1 \leq i \leq h \), and the inclusion relationship \( \mathcal{B}_h \subseteq \mathcal{B}_{h+1} \) always holds. Moreover, it is important to note that \( \mathcal{P}^{\mathcal{B}_d,0} = \hat{\mathcal{P}} \).

\section{Theoretical Analysis}
\label{sec:math}
We theoretically analyze the behavior of $c \in \mathbb{R}$ for the $e$-flat model submanifold:
\[
    \mathcal{S}^{c}_{\mathcal{B}_h} = \left\{Q \in \mathcal{S} \mid \theta_v=c \text { for all } v \in \Omega_d^{+} \setminus \mathcal{B}_h\right\}
\]
in $h$-body approximation of an input tensor ${\hat{\mathcal{P}} \in \mathbb{R}_{> 0}^{I_1 \times \cdots \times I_d}}$.
Compared with $\mathcal{S}^{0}_{\mathcal{B}_h}$ that we mentioned in the previous section, here $c$ is a constant that is not limited to 0. The result of the $m$-projection of ${\hat{\mathcal{P}}}$ onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}_h}$ is still formulated as $\mathcal{P}^{\mathcal{B}_h,c}= \mathop{\mathrm{argmin}}_{\mathcal{R} \in \mathcal{S}^{c}_{\mathcal{B}_h}} D_{\mathrm{KL}}(\hat{\mathcal{P}}, \mathcal{R})$ and, according to the projection theory, it is always guaranteed that \( \mathcal{P}^{\mathcal{B}_h,c} \) not only exists but is also unique. For further details, refer to Appendix~\ref{Theoreticalremarks}.
The objective of our theoretical analysis is to find out whether there exists an $e$-flat submanifold $\mathcal{S}^{c_0}_{\mathcal{B}_h}$ and its $m$-projection result $\mathcal{P}^{\mathcal{B}_h,c_0}$ satisfying
$$D_{\mathrm{KL}}(\mathcal{\hat{P}}, \mathcal{P}^{\mathcal{B}_h,c_0}) \leq D_{\mathrm{KL}}(\mathcal{\hat{P}}, \mathcal{P}^{\mathcal{B}_h,c}) \quad \text{for all } c \in \mathbb{R}.$$
If such a $c_0$ exists, $\mathcal{S}^{c_0}_{\mathcal{B}_h}$ is the optimal low-dimensional submanifold: it attains the minimum KL divergence among all submanifolds $\mathcal{S}^{c}_{\mathcal{B}_h}$ of the same dimensionality, that is, for a fixed number $h$ of bodies.
% It also implies that, under the same number of many-body parameters, we have found the optimal many-body approximation for the input tensor. 
As we show in Figure~\ref{fig:ImpLegendre}, each $e$-flat model submanifold, $\mathcal{S}^{c}_{\mathcal{B}_h}$, $\mathcal{S}^{0}_{\mathcal{B}_h}$, $\mathcal{S}^{-c}_{\mathcal{B}_h}$, and $\mathcal{S}^{c_0}_{\mathcal{B}_h}$, has a unique intersection with the $m$-flat (data) submanifold  $\mathcal{S}^{\mathcal{B}_h}_\mathcal{\hat{P}}$, which corresponds to $\mathcal{P}^{\mathcal{B}_h,c}$, $\mathcal{P}^{\mathcal{B}_h,0}$, $\mathcal{P}^{\mathcal{B}_h,-c}$, and $\mathcal{P}^{\mathcal{B}_h,c_{0}}$, respectively.
% which means $\mathcal{S}^{c}_{\mathcal{B}_h} \cap \mathcal{S}_\mathcal{P} = \mathcal{P}^{\mathcal{B}_h,c}$. 
Please note that \( \mathcal{S}^{\mathcal{B}_h}_\mathcal{\hat{P}} \) is defined by replacing \( \mathcal{B} \) with \( \mathcal{B}_h \) in Equation~\eqref{datamanifold}.



\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{uai2025/233_copy.png}
    \caption{The $m$-projection onto different $e$-flat model submanifolds.}
    \label{fig:ImpLegendre}
\end{figure}


In the following, we theoretically prove that \( c_0 \) exists and converges to \( 0 \) as the size of an input tensor \( \hat{\mathcal{P}} \) increases. Here, the size of the tensor is defined as the total number of elements in the normalized tensor \( \hat{\mathcal{P}} \), simply given by \( \prod_{j=1}^{d} I_j \). Specifically, we primarily consider two approaches to increasing the size of the tensor.
The first approach involves increasing the values of \( I_j \) for each \( j = 1, \ldots, d \), while the second approach increases the dimensionality \( d \) of the tensor.
First, to prove the main result, we derive the closed form of $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}$, which is the result of $m$-projection of $\hat{\mathcal{P}}$ onto a special submanifold $$\mathcal{S}^{c}_{\mathcal{B}_{I_j,m_j}}=\left\{Q \in \mathcal{S} \mid \theta_v=c \text { for all } v \in \Omega_d^{+} \backslash \mathcal{B}_{I_j,m_j}\right\},$$
where
$\mathcal{B}_{I_j,m_j} = [I_1] \times \dots \times [I_{j - 1}] \times [m_j] \times [I_{j + 1}] \times \dots \times [I_d]$
% $\mathcal{B}_{I_j,m_j} = \begin{aligned}
% & {\left[I_1\right] \times \ldots \times\left[I_{j-1}\right] \times\left[I_{j, m_j}\right] \times\left[I_{j+1}\right] \times \ldots \times\left[I_d\right] } \\
% \end{aligned}$
and $[m_j] = \{1, \dots, m_j\}$ with $m_j \leq I_j $.
Figure~\ref{fig:example} shows an example of the submanifold  $\mathcal{S}^{c}_{\mathcal{B}_{{I_3,m_3}}}$ for a $3^{\text{rd}}$-order tensor. 

\begin{figure}[t]
  \centering
  \includegraphics[width=\linewidth]{uai2025/example.png}
  \caption{The submanifold  $\mathcal{S}^{c}_{\mathcal{B}_{{I_3,m_3}}}$, with  $\mathcal{B}_{{I_3,m_3}}=\left[I_1\right] \times \left[I_2\right]
  \times \left[m_3\right], \left[ m_3\right]=\left\{1, \ldots, m_3\right\}$.  }
  \label{fig:example}
\end{figure}

To obtain the closed formula of $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}$, first we show the following lemma.

\begin{lemma}
For any input tensor ${\hat{\mathcal{P}} \in \mathbb{R}_{> 0}^{I_1 \times \cdots \times I_d}}$ and its $m$-projection $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}$ onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}_{I_j,m_j}}$, we have
\label{lem:closeform}
\begin{equation}
     \sum_{i_j=m_j}^{I_{j}} \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}= \sum_{i_j=m_j}^{I_{j}} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}.
\end{equation}
\end{lemma}

The proof of Lemma~\ref{lem:closeform} can be found in Appendix~\ref{appendix:proof}. This lemma indicates that the $m$-projection onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}_{I_j,m_j}}$ preserves the partial sum over the indices $i_j \geq m_j$ of the \( j \)-th mode. 
Based on the above lemma, we give the following theorem.
\begin{theorem}
For any input tensor ${\hat{\mathcal{P}} \in \mathbb{R}_{> 0}^{I_1 \times \cdots \times I_d}}$, its $m$-projection $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}$ onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}_{I_j,m_j}}$ is given as
\label{the:closeform}
\begin{align*}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d} =  \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}
\end{align*}
for $i_j = 1, 2, \dots, m_j-1$ and
\begin{align}
% \begin{aligned}
&\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} \nonumber\\
= &\left(\sum_{k'=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k',i_{j+1},\ldots,i_d}\right) \nonumber\\
&\quad\quad\cdot \exp\left(c k \prod_{\substack{s = 1 \\
s \neq j}}^d  i_s \right)\left( \sum_{k'=0}^{I_{j}- m_j} \exp\left(ck' \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)\right)^{-1}
% \end{aligned}
\end{align}
for $k=0, \dots, I_j-m_j$.
\end{theorem}
The proof can be found in Appendix~\ref{appendix:proof}.
Theorem~\ref{the:closeform} provides the closed formula of \( \mathcal{P}^{\mathcal{B}_{I_j,m_j},c} \), hence the closed formulae of \( D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \) and \( D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0}) \) can also be obtained directly, facilitating the estimation of their bounds.
Because
\[
D_{K L}(\hat{\mathcal{P}}, \mathcal{R})=\sum_{i_1=1}^{I_1} \ldots \sum_{i_d=1}^{I_d} \hat{\mathcal{P}}_{i_1, \ldots, i_d} \log \frac{\hat{\mathcal{P}}_{i_1, \ldots, i_d}}{\mathcal{R}_{i_1, \ldots, i_d}}
\]
and the term $\sum_{i_1=1}^{I_1} \ldots \sum_{i_d=1}^{I_d} \hat{\mathcal{P}}_{i_1, \ldots, i_d} \log \hat{\mathcal{P}}_{i_1, \ldots, i_d}$ is not related to $\mathcal{R}$, we only need to consider
\[
F(\hat{\mathcal{P}}; \mathcal{R})=-\sum_{i_1=1}^{I_1} \ldots \sum_{i_d=1}^{I_d} \hat{\mathcal{P}}_{i_1 \ldots i_d} \log {\mathcal{R}_{i_1, \ldots, i_d}}.
\]

% Then we introduce some notations.
Let $s_{\min}$ be the value defined by the equation $\min(\hat{\mathcal{P}}) = 1 / (\prod_{j=1}^{d} I_j)^{s_{\min}}$, where the term \( \prod_{j=1}^{d} I_j \) is the total number of elements in the normalized tensor \( \hat{\mathcal{P}} \).
Then it is trivial that $s_{\min} \ge 1$ always holds.
Similarly, let $s_{\max}$ be the value satisfying $\max(\hat{\mathcal{P}}) = 1 / (\prod_{j=1}^{d} I_j)^{s_{\max}}$. Then we always have $0 < s_{\max} \leq 1$.
% Therefore, the minimum element of \( \hat{\mathcal{P}} \) must satisfy 
% $
% \min(\hat{\mathcal{P}}) = 1 / \left(\prod_{j=1}^{d} I_j \right)^{s_{\min}},
% $
% where \( s_{\min} \geq 1 \). Similarly, the maximum element satisfies
% $
% \max(\hat{\mathcal{P}}) = 1 / \left(\prod_{j=1}^{d} I_j \right)^{s_{\max}},
% $
% where \( 0 < s_{\max} \leq 1 \).
Furthermore, to facilitate the discussion, we assume that \( 0<a \leq \mathcal{X}_{i_1, \dots, i_d} \leq b \) always holds for some constants $a$ and $b$ that remain unchanged as the tensor size increases. Consequently, we have 
$\min(\hat{\mathcal{P}}) \geq a / (b \prod_{j=1}^{d} I_j)$ and
$\max(\hat{\mathcal{P}}) \leq b / (a \prod_{j=1}^{d} I_j)$.

Here we present the following theorem using the above properties.
\begin{theorem}
\label{the:lowerbound} 
$F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \max{\{l, -l\}}$, where
\[
l = \frac{c\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d I_h \left( 1 + I_h \right)}{2^d\left(\prod_{k=1}^{d} I_k \right)^{s_{\min}}}.
\]
\end{theorem}


The proof of Theorem~\ref{the:lowerbound} can be found in Appendix~\ref{appendix:proof}. In the following, we use \( \underline{F}(\cdot) \) and \( \overline{F}(\cdot) \) to denote the lower and upper bounds of the function \( F \), respectively. Consequently, this theorem establishes the lower bound of \( F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \), denoted as \( \underline{F}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \), which enables us to determine the range of \( c \) that satisfies  
$
F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \underline{F}(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \overline{F}(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0}) \geq F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0}).
$
Moreover, the range of \( c \) satisfying
$F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \leq F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0})$
is a subset of the complement of the range of $c$ that satisfies  
$\underline{F}(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \overline{F}(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0}).$
Based on the above analysis, the following corollary holds. 
\begin{corollary}
To satisfy the condition $D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \leq D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0})$, for every $j = 1, \ldots, d$, $c$ should at least satisfy $- l \leq c \leq l$, where
\label{col:c_bounds}
\begin{align}
% \begin{aligned}
% - l &\leq c \leq l, \\
l &= \frac{2^d  s_{\max}  \log\left( \prod_{k=1}^{d} I_k \right)}{\left( I_j-m_j \right) \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right) \left(  \prod_{k=1}^{d} I_k \right)^{s_{\max}-s_{\min}}} \nonumber \\
  &<\frac{2^d    \log\left( \prod_{k=1}^{d} I_k \right) b^2}{\left( I_j-m_j \right) \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right) a^2}.
% \end{aligned}
\end{align}
\end{corollary}







%From equation ~\ref{eq:c_bounds}, it is easy to observe that as the tensor %size increases, $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c_0}$ will converge to 0.\\


The proof of Corollary~\ref{col:c_bounds} can be found in Appendix~\ref{appendix:proof}. This corollary shows that $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c_0}$ eventually converges to $\mathcal{P}^{\mathcal{B}_{I_j,m_j},0}$ as the tensor size increases.
We set $m_j = \left\lfloor I_{j} / \alpha \right\rfloor \geq 1$, where \(\alpha\) is a constant that remains unchanged as $I_j$ increases, and \(\left\lfloor \cdot \right\rfloor\) denotes the floor function.
For an index set $\mathcal{B}$, $\lvert \mathcal{B} \rvert$ denotes the number of elements in $\mathcal{B}$.

It is evident that  
\begin{align}\label{eq:f_condition}
F(\hat{\mathcal{P}} ;\mathcal{P}^{\mathcal{B}_{h},c} ) \geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c})
\end{align}
for \( h = 1, \ldots, d-1 \) as the tensor size increases. This holds because the KL divergence is primarily determined by the number of parameters that can be optimized. This implies that, if  $
\lvert \mathcal{B}_{I_j, m_j} \rvert \gg \lvert \mathcal{B}_h \rvert,$
then  
$F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{h},c}) \geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}).$
Accordingly, if we define the function $g_h$ as
\[
g_h(I_1,\ldots,I_d) = \frac{\lvert \mathcal{B}_h \rvert}{\lvert \mathcal{B}_{I_j,m_j} \rvert},
\]
it is apparent that if one or more elements in the set \(\{I_1, \ldots, I_d\}\) increase, \( g_h \) will monotonically decrease and converge to zero. This can also be interpreted as  
\[
\lvert \mathcal{B}_{I_j, m_j} \rvert \gg \lvert \mathcal{B}_h \rvert
\]  
when the tensor size is large enough.
Therefore, Equation~\eqref{eq:f_condition} follows.

Moreover,  $F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_1,0}) \geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,0})$ always holds because $\mathcal{B}_h \subseteq \mathcal{B}_{h+1}$. Therefore, once we determine the range of \( c \) for which  
$
F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_1,0})
$
holds, we can subsequently establish that  
\begin{align*}
F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{h},c}) &\geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c})\\
&\geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_1,0}) \geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{h},0}).
\end{align*}
\citet{NEURIPS2021_040ca38c} shows the closed formula of $\mathcal{P}^{\mathcal{B}_1,0}$, which is given as
\begin{multline*}
\mathcal{P}^{\mathcal{B}_1,0}_{ \, i_1, \ldots, i_d} = \prod_{k=1}^d \Bigg(\sum_{i_1^{\prime}=1}^{I_1} \dots 
\sum_{i_{k-1}^{\prime}=1}^{I_{k-1}} \sum_{i_{k+1}^{\prime}=1}^{I_{k+1}} \\
\cdots \sum_{i_d^{\prime}=1}^{I_d} 
\hat{\mathcal{P}}_{i_1^{\prime}, \dots, i_{k-1}^{\prime}, i_k, i_{k+1}^{\prime}, \ldots, i_d^{\prime}}\Bigg).
\end{multline*}
Thus, \( F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{1},0}) \) can be computed directly, allowing us to estimate its upper bound, \( \overline{F}(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_1,0}) \), as given in Equation~\eqref{upbofb1} in the appendix.
Consequently, the range of \( c \) that satisfies the inequality 
$F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \underline{F}(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \overline{F}(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_1,0}) \geq F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_1,0})$ can be easily determined. As we discussed earlier, this range of \( c \) also ensures  $F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{h},c}) \geq F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_h,0})$.
Therefore, the range of \( c \) that satisfies $
F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_{h},c}) \leq F(\hat{\mathcal{P}}; \mathcal{P}^{\mathcal{B}_h,0})$ is simply a subset of the complement of this range. This leads to the following theorem.



\begin{theorem}\label{the:finalresult} 
  For many-body approximation of $\hat{\mathcal{P}}$, to satisfy the condition $D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,c}) \leq D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,0})$ for every $h=1,\dots,d$, $c$ should at least satisfy: 
\begin{equation}
-\min_{j=1,2,\ldots,d} \, l_j \leq c \leq \min_{j=1,2,\ldots,d} \, l_j,
\end{equation}
where 
\begin{align}
l_j &= \frac{2^{d}\left(\left(s_{\min}-1\right)d+1\right) I_j \log \left(\tau \right)\left(\tau \right)^{s_{\min}}}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right) {\left(\tau\right)^{s_{\max}}}} \nonumber \\
  &< \frac{2^{d}\left(d+1\right) I_j \log \left(\tau \right)b^2}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right) \left(a^2\right)},  \nonumber \\
j &= 1, \ldots, d, \quad \tau=\prod_{k=1}^{d} I_k.
\label{eq:c_bounds}
\end{align}
\end{theorem}


The proof of Theorem~\ref{the:finalresult} can be found in Appendix~\ref{appendix:proof}.
From the above theorem, it is easy to observe that $\mathcal{P}^{\mathcal{B}_h,c_{0}}$ converges to $\mathcal{P}^{\mathcal{B}_h,0}$ as the tensor size increases. Please note that this theorem can also be applied to the high-order Boltzmann machine, where each \( I_j = 2 \) for \( j = 1, \ldots, d \).



Moreover, it is worth mentioning that $\mathcal{P}^{\mathcal{B}_h,0}$ has the maximum entropy among $\mathcal{P}^{\mathcal{B}_h,c}$ for all $c \in \mathbb{R}$. This suggests that as the tensor size increases, the 
$m$-projection selects the point with the maximum entropy to minimize the KL divergence, as summarized below.
\begin{theorem}[Maximum Entropy Principle]
Consider the set
\[
\widetilde{\mathcal{P}}^{\mathcal{B}}=\bigcup_{c \in \mathbb{R}} \left\{ \mathcal{P}^{\mathcal{B},c} \right\}, \quad \mathcal{P}^{\mathcal{B},c} = \underset{\mathcal{R} \in \mathcal{S}_\mathcal{B}^c}{\operatorname{argmin}} \, D_{\mathrm{KL}}(\hat{\mathcal{P}}, \mathcal{R}).
\]
Then we have \( \mathcal{P}^{\mathcal{B},0} \in \widetilde{\mathcal{P}}^{\mathcal{B}} \), and \( \mathcal{P}^{\mathcal{B},0} \) maximizes the entropy in the set \(\widetilde{\mathcal{P}}^{\mathcal{B}} \).
\label{maxentropy}
\end{theorem}
% Here $\mathcal{S}^{c}_{\mathcal{B}}=\left\{Q \in \mathcal{S} \mid \theta_v=c \text { for all } v \in \Omega_d^{+} \backslash \mathcal{B}\right\}.$
Please note that, in this theorem, $\mathcal{B}$ can be any index set that satisfies $\mathcal{B} \subseteq \Omega_d^{+}$, and is not restricted to $\mathcal{B}_{I_j,m_j}$
or $\mathcal{B}_{{h}}$. The proof of Theorem~\ref{maxentropy} and the definition of the entropy are given in Appendix~\ref{appendix:proof}. This demonstrates that many-body approximation, as a learning model, gradually evolves into a maximum entropy model as the tensor size increases. Moreover, it can be connected to other maximum entropy learning models widely utilized in various machine learning domains~\cite{mezard2009information,wainwright2008graphical}.

\section{Searching algorithm}
To verify our theory, we propose an optimization algorithm to search for the  $c_0$ value, which has been mentioned in the previous section. Please note that our method differs from the Legendre decomposition introduced in the preliminaries, particularly in terms of the constraints and optimality conditions.

\subsection{Optimization problem}
Our optimization problem  can be  formulated as
\begin{equation}
\begin{aligned}
  \mathcal{P}^{\mathcal{B},c_{0}} &= \underset{\mathcal{P} \in \mathcal{S}_{\mathcal{B}}^\mathcal{H}}{\operatorname{argmin}} \, D_{\mathrm{KL}}(\hat{\mathcal{P}}, \mathcal{P}), \\
  \mathcal{S}_{\mathcal{B}}^\mathcal{H} &= \left\{ \mathcal{R} \in \mathcal{S} \mid \theta_\alpha = \theta_\beta \text{ for all } \alpha, \beta \in \Omega_d^{+} \backslash \mathcal{B} \right\}.
\end{aligned}
\label{eq:optimization_problem}
\end{equation}
This optimization problem \eqref{eq:optimization_problem} can also be recognized as an $m$-projection onto the $e$-flat submanifold $\mathcal{S}_{\mathcal{B}}^\mathcal{H}$.
The resulting distribution of the projection, denoted as $\mathcal{P}^{\mathcal{B},c_{0}}$, satisfies $\theta_\alpha = c_{0}$ for all $\alpha \in \Omega_d^{+} \backslash \mathcal{B}$.
According to the principles of information geometry~\cite{amari2016information}, the result of the $m$-projection to the $e$-flat submanifold is guaranteed to exist and to be unique. Obviously, we have the relationship $\mathcal{S}^{c}_\mathcal{B}  \subseteq  \mathcal{S}_{\mathcal{B}}^\mathcal{H}$ for all $c \in \mathbb{R}$. Please note that in our optimization method, $\mathcal{B}$ can be any index set that satisfies $\mathcal{B} \subseteq \Omega_d^{+}$, and is not restricted to $\mathcal{B}_{I_j,m_j}$
or $\mathcal{B}_{{h}}$.

\begin{figure}[t]
  \centering
  \includegraphics[width=.8\linewidth]{uai2025/implegwor_cropped.png}
  \caption{Optimization procedure in the submanifold $\mathcal{S}_{\mathcal{B}}^\mathcal{H}$. $\hat{\mathcal{P}}$ is an input positive tensor, and $\mathcal{P}^{\mathcal{B},c_0}$ is the result of the optimization problem \eqref{eq:optimization_problem}. $\mathcal{P}_{L}$ is the tensor at the $L$th step of gradient descent. $\mathcal{O}$ is the initial point of optimization, which is usually a uniform distribution.}
  \label{fig:SH}
\end{figure}

\subsection{Optimization method}
First we reformulate the optimization problem as follows:
\begin{equation}
\begin{aligned}
    &\mathop{\mathrm{argmin}}_{\mathcal{P} \in \mathcal{S}_{\mathcal{B}}^\mathcal{H}} F(\mathcal{\hat{P}}, \mathcal{P}) \\
    =\ &-\sum_{i_1=1}^{I_1} \cdots \sum_{i_d=1}^{I_d} 
    \mathcal{\hat{P}}_{i_1, \ldots, i_d} \log \mathcal{P}_{i_1, \ldots, i_d} \\
    =\ &-\sum_{\mathclap{\left(i_1, \ldots, i_d \right) \in \Omega_d^{}}} 
    \mathcal{\hat{P}}_{i_1, \ldots, i_d} 
    \left(
    \sum_{i_1^{\prime}=1}^{i_1} \cdots \sum_{i_d^{\prime}=1}^{i_d} \theta_{i_1^{\prime}, \ldots, i_d^{\prime}}
     \right)
    % \text{s.t.} \quad & \theta_{\alpha} = \theta_{\beta}, \quad 
    % \forall \alpha, \beta \in \Omega_d^{+} \backslash \mathcal{B}.
\end{aligned}
\label{eq:optimization}
\end{equation}
subject to $\theta_{\alpha} = \theta_{\beta}$ for all $\alpha, \beta \in \Omega_d^{+} \backslash \mathcal{B}$,
where \({\theta}_{\alpha}\) represents the \(\theta\) coordinates of the tensor \(\mathcal{P} \in \mathcal{S}_{\mathcal{B}}^\mathcal{H}. \)  Since the KL divergence function is convex, and the constraint $\theta_{\alpha} = \theta_{\beta}$ is linear, this forms a convex optimization problem. As a result, the optimal solution not only exists but is also unique, aligning with the principles of information geometry that we have previously discussed.

We use the generalized reduced gradient method \cite{sun2006optimization} to solve this constrained optimization problem. First, we select an index \( \gamma \) from \( \Omega_d^{+} \backslash \mathcal{B} \)
and define $\mathcal{B}_{\gamma}=\mathcal{B} \cup   \{{\gamma}\}  $,  $\theta_{\mathcal{B}} = \left\{\theta_{\alpha} \mid \alpha \in \mathcal{B}\right\}$, then we have $\theta_{\Omega_d^{+}} = \theta_{\Omega_d^{+} \backslash \mathcal{B}_{\gamma}} \cup \theta_{\mathcal{B}_{\gamma}} $.
Moreover, from the constraint condition
\[
\mathcal{S}_{\mathcal{B}}^\mathcal{H}=\left\{Q \in \mathcal{S} \mid \theta_\alpha=\theta_\beta \text { for all } \alpha,\beta \in \Omega_d^{+} \backslash \mathcal{B}\right\},
\]
we obtain \( \theta_{\alpha} = \theta_{\gamma} \) for all \( \theta_{\alpha} \in \theta_{\Omega_d^{+} \backslash \mathcal{B}_{\gamma}} \). Therefore, we can rewrite $F$ as: 
$F(\theta_{_{\Omega_d^{+}}})=F(\theta_{\Omega_d^{+} \backslash \mathcal{B}_{\gamma}},\theta_{\mathcal{B}_{\gamma}})= \widetilde{F}(\theta_{\mathcal{B}_{\gamma}})$. The number of parameters we need to optimize is $|\mathcal{B}_{\gamma}|=|\mathcal{B}|+1$. 
The gradient of $\theta_{w}$ for each $w \in \mathcal{B}$ is calculated as:
\begin{equation}
\frac{\partial}{\partial \theta_{w}} \widetilde{F}=\eta_{w}-\hat{\eta}_{w},
\label{gradient1}
\end{equation}
which is the same as that of Legendre decomposition in Equation~\eqref{legd}.
The gradient of $\theta_{\gamma}$ is calculated as
\begin{equation}
\frac{\partial}{\partial \theta_{\gamma}} \widetilde{F}=\sum_{s \in \Omega_d^{+} \backslash \mathcal{B}_{\gamma}}\frac{\partial F}{\partial \theta_{s}} \cdot  \frac{\mathrm{d}\theta_{s}}{\mathrm{d}\theta_{\gamma}}+\frac{\partial F}{\partial \theta_{\gamma}} =\sum_{s \in \Omega_d^{+} \backslash \mathcal{B}}(\eta_s-\hat{\eta}_s).
\label{gradient2}
\end{equation} 
Equations \eqref{gradient1} and \eqref{gradient2} also show that the function $\widetilde{F}$ is minimized if and only if 
$\eta_{w}=\hat{\eta}_{w}$ for all $w \in \mathcal{B}$ and $\sum_{s \in \Omega_d^{+} \backslash \mathcal{B}}(\eta_s-\hat{\eta}_s)=0$. However, in Legendre decomposition, the optimality condition is only given by \( \eta_{w}=\hat{\eta}_{w} \) for all \( w \in \mathcal{B} \), which implies that additionally enforcing \( \sum_{s \in \Omega_d^{+} \backslash \mathcal{B}}(\eta_s-\hat{\eta}_s)=0 \) can further reduce the KL error.




We show the pseudo-code of the above gradient method in Algorithm~\ref{alg:gradient_descent}.
The time complexity of each iteration is $O(|\Omega_d||\mathcal{B}_{\gamma}|)$, as that of computing $\mathcal{P}$ from $\left(\theta_v\right)_{v \in \mathcal{B}_{\gamma}}$ (line 5 in Algorithm~\ref{alg:gradient_descent}) is $O(|\Omega_d||\mathcal{B}_{\gamma}|)$ and computing $\left(\eta_v\right)_{v \in \Omega_d}$ from $\mathcal{P}$ (line 6 in Algorithm~\ref{alg:gradient_descent}) is $O(|\Omega_d|)$. Thus the total complexity is $O\left(h|\Omega_d||\mathcal{B}_{\gamma}|^2\right)$ with the number of iterations $h$ until convergence.


Although gradient descent is an efficient approach, we can also use the Newton method (natural gradient descent)~\cite{amari1998natural}, a second-order optimization method shown in Algorithm~\ref{alg:natural_gradient}, to reduce the number of iterations to gain efficiency. Each element of the Hessian matrix \(\widetilde{\mathbf{G}} \in \mathbb{R}^{|\mathcal{B}_{\gamma}| \times |\mathcal{B}_{\gamma}|}\) of $\widetilde{F}(\theta_{\mathcal{B}_{\gamma}})$ is calculated as:
\begin{equation}
\begin{aligned}
   &\widetilde{\mathbf{G}}_{u,v}=\frac{\partial ^{2} }{\partial \theta_{u}\partial \theta_{v}}\widetilde{F}=\mathbf{G}_{u,v}, \quad u,v \in \mathcal{B},  
 \\&\widetilde{\mathbf{G}}_{\gamma, v}=\frac{\partial}{\partial \theta_{v}}(\sum_{s \in \Omega_d^{+} \backslash \mathcal{B}}(\eta_s-\hat{\eta}_s))=
 \sum_{\mathclap{s \in \Omega_d^{+} \backslash \mathcal{B}}} \mathbf{G}_{s,v} , \quad v \in \mathcal{B},
 \\&\widetilde{\mathbf{G}}_{v, \gamma }=\sum_{s \in \Omega_d^{+} \backslash \mathcal{B}} \mathbf{G}_{v,s} , \quad v \in \mathcal{B}, \text{ and}
 \\&\widetilde{\mathbf{G}}_{\gamma, \gamma }=\frac{\partial}{\partial \theta_{\gamma}}(\sum_{s \in \Omega_d^{+} \backslash \mathcal{B}}(\eta_s-\hat{\eta}_s))=\sum_{s,t \in \Omega_d^{+} \backslash \mathcal{B}} \mathbf{G}_{s,t},
\end{aligned}    
\end{equation}
where $\mathbf{G}=(\mathbf{G}_{u,v}) \in \mathbb{R}^{\lvert  \Omega_d^{+}\rvert \times \lvert  \Omega_d^{+}\rvert}$ is the Hessian matrix of $F(\theta_{_{\Omega_d^{+}}})$ calculated as
% \begin{equation}
% \begin{aligned}
\begin{align}
\mathbf{G}_{u,v}(\theta) &= \frac{\partial \eta_u}{\partial \theta_v} 
=\frac{\partial^2 F }{\partial \theta_{u}\partial \theta_{v}} \nonumber \\
&= \sum_{w \in \Omega_d} \zeta(u, w) \zeta(v, w) \mathcal{P}_w - \eta_u \eta_v,
\label{fishermatrix}
\end{align}
% \end{aligned}
% \end{equation}
where
% $
% \zeta(u, v)= \begin{cases}1 & \text { if } u \leq v, \\ 0 & \text { otherwise. }\end{cases}
% $
$\zeta(u, v)= 1$ if $u \leq v$ and $\zeta(u, v)= 0$ otherwise.

The time complexity of each iteration is $O(|\Omega_d||\mathcal{B}_{\gamma}|+|\mathcal{B}_{\gamma}
|^3)$, where $O(|\Omega_d||\mathcal{B}_{\gamma}|)$ is needed to compute $\mathcal{P}$ from $\theta$ and $O\left(|\mathcal{B}_{\gamma}
|^3\right)$ to compute the inverse of $\widetilde{\mathbf{G}}$, resulting in the total complexity $O\left(h^{\prime}|\Omega_d||\mathcal{B}_{\gamma}|+h^{\prime}|\mathcal{B}_{\gamma}|^3\right)$ with the number of iterations $h^{\prime}$ until convergence. We illustrate the optimization procedure in Figure~\ref{fig:SH}. 

\begin{algorithm}[t]
\caption{Gradient Descent Algorithm}
\label{alg:gradient_descent}
\begin{algorithmic}[1]
\Procedure{GradientDescent}{$\mathcal{\hat{P}}, \mathcal{B}_{\gamma}$}
    \State Initialize $(\theta_{k})_{k \in \Omega_d^{+}}$ \Comment{e.g., $\theta_k = 0$ for all $k$}
    \Repeat
        \For{each $t \in \mathcal{B}_{\gamma} = \left\{v \mid v \in \mathcal{B} \right\} \cup \{\gamma\}$} 
            \State Compute $\mathcal{P}$ using current $(\theta_t)_{t \in \mathcal{B}_{\gamma}}$
            \State Update $\eta_k$ for each $k \in \Omega_d^{+}$ from $\mathcal{P}$ 
            \State $\theta_v \gets \theta_v - \epsilon (\eta_v - \hat{\eta}_v), \quad  v \in \mathcal{B}$ 
            \State $\theta_{\gamma} \gets \theta_{\gamma} - \epsilon 
                \left( \sum_{s \in \Omega_d^{+} \setminus \mathcal{B}} (\eta_s - \hat{\eta}_s) \right)$ 
        \EndFor
    \Until{convergence of $(\theta_t)_{t \in \mathcal{B}_{\gamma}}$} 
\EndProcedure
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
\caption{Natural Gradient Algorithm}
\label{alg:natural_gradient}
\begin{algorithmic}[1]
\Procedure{NaturalGradient}{$\mathcal{\hat{P}}, \mathcal{B}_{\gamma}$}
    \State Initialize $(\theta_k)_{k \in \Omega_d^{+}}$ \Comment{e.g., $\theta_k = 0$ for all $k$}
    \Repeat
        \State Compute $\mathcal{P}$ using current $(\theta_t)_{t \in \mathcal{B}_{\gamma}}$
        \State Update $\eta_k$ for each $k \in \Omega_d^{+}$ from $\mathcal{P}$
        \State Compute matrix $\mathbf{G}$ and $\widetilde{\mathbf{G}}$ using $\eta_k, k \in \Omega_d^{+}$
        \State Compute
\Statex \hspace{3.5em} $\Delta \boldsymbol{\eta} \gets 
\begin{pmatrix}
\eta_v - \hat{\eta}_v \\
\sum_{s \in \Omega_d^{+} \setminus \mathcal{B}} (\eta_s - \hat{\eta}_s)
\end{pmatrix},  \quad v \in \mathcal{B}$



        \State Invert matrix $\widetilde{\mathbf{G}}$ to get $\widetilde{\mathbf{G}}^{-1}$
        \State $\theta \gets \theta - \epsilon \widetilde{\mathbf{G}}^{-1} \Delta \boldsymbol{\eta}$
    \Until{convergence of $(\theta_t)_{t \in \mathcal{B}_{\gamma}}$}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\section{Numerical Experiments}
We numerically examine our theoretical results using synthetic and real-world datasets. Experiments were conducted on Ubuntu 22.04.4 LTS with 88 CPU threads of 2.20\,GHz Intel Xeon E7-8880 v4 and 3\,TB of memory.
\subsection{Experimental setup}


\begin{figure*}[t]
    \centering

    % Top Row
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[width=0.48\textwidth]{uniform_3_nolog.eps}
        \includegraphics[width=0.48\textwidth]{uniform_3_withlog.eps}
        \caption*{(a) Uniform distribution with increasing dimensionality.}
    \end{minipage}%
    \hfill
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[width=0.48\textwidth]{uniform_s_wolog.eps}
        \includegraphics[width=0.48\textwidth]{uniform_s_wlog.eps}
        \caption*{(b) Uniform distribution with increasing $s$ of $(s,s,s,s,s)$.}
    \end{minipage}

    \medskip

    % Bottom Row
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[width=0.48\textwidth]{butterfly_3_wolog.eps}
        \includegraphics[width=0.48\textwidth]{butterfly_3_withlog.eps}
        \caption*{(c) Butterfly figure with increasing dimensionality.}
    \end{minipage}%
    \hfill
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[width=0.48\textwidth]{butterfly_s_wolog.eps}
        \includegraphics[width=0.48\textwidth]{butterfly_s_wlog.eps}
        \caption*{(d) Butterfly figure with increasing $s$ of $(s,s,s,s,s)$.}
    \end{minipage}

    \caption{Experimental results for uniform distribution and butterfly figure. (a, c) The horizontal axis is the total dimension for the input tensor, with each mode having 3 elements. (b, d) The horizontal axis is the value of $s$, and the total size of the input tensor is $(s,s,s,s,s)$. The vertical axis shows the value of $c_0$ or the log value of $c_0$ in $\mathcal{P}^{\mathcal{B}_h,c_{0}}$.}
    \label{fig:one_row_four_groups}
\end{figure*}



\textbf{Synthetic datasets.}\ \ 
We generate tensors from the uniform continuous distribution in $[5,8]$. In experiment \( (\mathbf{a}) \), we progressively increase the tensor size from \( (3,3,3) \) to \( (3,3,3,3) \), adding one dimension at a time until reaching \( (3,3,3,3,3,3,3,3,3,3) \).  In experiment \( (\mathbf{b}) \), we expand the tensor size from \( (2,2,2,2,2) \) to \( (3,3,3,3,3) \), continuing this process until it reaches \( (10,10,10,10,10) \).

\textbf{Real-world datasets.}\ \ 
In the first real data experiment, we utilize the TokyoTech hyperspectral image dataset~\cite{monno2015practical,monno2017adaptive}. Each image is a $(500, 500, 31)$ tensor, where the modes represent the width, the height, and the 31 spectral bands from 420 to 720\,nm at 10\,nm intervals, respectively. We choose the first image in the dataset, which is a butterfly image; each pixel value lies within the range \([0.00265, 1]\). In experiment \( (\mathbf{c}) \), a sub-tensor corresponding to the segment \([249{:}330, 249{:}330, 1{:}9]\) was extracted from the original tensor and reshaped into a \( (3, 3, 3) \) tensor. This sub-tensor was subsequently expanded and reshaped into a \( (3, 3, 3, 3) \) tensor, with the process continuing until it reached the final shape of \( (3, 3, 3, 3, 3, 3, 3, 3, 3, 3) \).

Furthermore, in experiment $(\mathbf{d})$, from the original tensor, we extracted a sub-tensor defined by the segment \([250{:}350, 250{:}350, 1{:}10]\), which was subsequently reshaped into a \( (10, 10, 10, 10, 10) \) tensor. To progressively enlarge the tensor, we first extracted a smaller sub-tensor of size \( (2, 2, 2, 2, 2) \) and then expanded it to \( (3, 3, 3, 3, 3) \). We continued this process incrementally until the tensor reached its final size of \( (10, 10, 10, 10, 10) \).
In the second real data experiment, we used the Columbia Object Image Library (COIL-100) dataset~\cite{nene1996coil100}; each image can be regarded as a tensor of size $(128,128,3)$. We randomly picked two images and combined them as a $(128,128,3,2)$ tensor, where each mode represents the width, height, color, and image index, respectively. Each pixel value falls within the range $[1, 255]$. We increase the tensor from $(4,4,3,2)$ to $(128,128,3,2)$ in increments of 24 at each step for width and height channels. 


\subsection{Experimental results}
We show the experimental results for the uniform distribution and butterfly figure in Figure~\ref{fig:one_row_four_groups} and those for the COIL-100 data in Figure~\ref{fig:one_row_two_groups}. We plot both $c_0$ and $\log\left|c_0\right|$ to clearly show the trends. These results show that as the tensor size increases, \( \left|c_0\right| \) of \( \mathcal{P}^{\mathcal{B}_h,c_{0}} \) gradually decreases for any many-body structure, and the results remain consistent with our theoretical bounds. We used $m_j = \lfloor I_{j} / 2 \rfloor$ and the actual values of $s_{\min}$ and $s_{\max}$ of each sub-tensor to compute the theoretical bound. Furthermore, the convergence rate to zero varies depending on the many-body structure. Specifically, lower-body approximations (one or two body) tend to converge faster than higher-body approximations (three, four, or five body). In addition, for these five different experiments, the \( \left| c_0 \right| \) values fall within the range of \([1 \times 10^{-8}, 1 \times 10^{-5}]\) when the dimension or \( s \) is increased to 10, indicating that they are close to zero.

\begin{figure}[t]
    \centering
    \begin{minipage}[b]{0.45\linewidth}
        \centering
        \includegraphics[width=\textwidth]{coil_wolog.eps}
        \caption*{(a) Without log transformation}
    \end{minipage}%
    \hfill
    \begin{minipage}[b]{0.45\linewidth}
        \centering
        \includegraphics[width=\textwidth]{coil_wlog.eps}
        \caption*{(b) With log transformation}
    \end{minipage}
    
    \caption{Experimental results for COIL-100 dataset. The horizontal axis is the value of $s$, and the total size of the input tensor is $(s,s,3,2)$.}
    \label{fig:one_row_two_groups}
\end{figure}





\section{Conclusion}
In this paper, we have discussed the problem of selecting the hyperparameter $c_0$ (or the submanifold $\mathcal{S}^{c_0}_{\mathcal{B}_h}$) in many-body approximation,
which arises in the optimization problem of minimizing the KL divergence between the original distribution and the statistical model. Our theoretical result shows the asymptotic behavior of
the hyperparameter $c_0$: as the tensor size increases, the value of $c_0$ converges to 0. The experimental results on the synthetic and real-world datasets validate our theoretical analysis. This paper not only provides a theoretical foundation for the widely used many-body approximation under large-scale parameters but also proposes an optimal many-body model selection algorithm for small-scale non-negative tensors or empirical distributions.

\begin{acknowledgements}
   This work was supported by JST, CREST Grant Number JPMJCR22D3, Japan.
\end{acknowledgements}




%\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
%    Briefly acknowledge people and organizations here.

%    \emph{All} acknowledgements go in this section.
%\end{acknowledgements}

% References
\bibliography{uai2025-template}

\newpage

\onecolumn

\title{Optimal Submanifold Structure in Log-linear Models\\(Supplementary Material)}
\maketitle



% This Supplementary Material should be submitted together with the main paper.

\appendix
\section{ Projection theory in information geometry}
\label{ptig}
We explain concepts of information geometry used in this study, including natural parameters, expectation parameters, and model flatness. In the following discussion, we consider only discrete probability distributions.

\subsection{\texorpdfstring{$(\theta, \eta)$}{(theta, eta)}-coordinate and geodesics}
\label{candg}
In this study, we regard a normalized $d$-order non-negative tensor $\mathcal{P} \in \mathbb{R}_{> 0}^{I_1 \times \cdots \times I_d}$ as a discrete probability distribution with $d$ random variables and the \( i \)th random variable can take values in \( \{1, \dots, I_i\} \).
 Let $\mathcal{S}$ be the set of discrete probability distributions with $d$ random variables. The entire space \(\mathcal{S}\) is a non-Euclidean space, where the Fisher information matrix \(\mathbf{G}\) serves as the Riemann metric. This metric arises from the second-order differentiation of the KL divergence, as shown in Equation~\eqref{fishermatrix}. In Euclidean space, a straight line is the shortest path between two points. In a non-Euclidean space, such a shortest path is called a geodesic. In the space $\mathcal{S}$, two kinds of geodesics can be introduced: $e$-geodesics and $m$-geodesics. For two points $\mathcal{P}_1, \mathcal{P}_2 \in \mathcal{S}$, $e$- and $m$-geodesics are defined as
% 
\[
\left\{\mathcal{R}_t \mid \log \mathcal{R}_t=(1-t) \log \mathcal{P}_1+t \log \mathcal{P}_2-\phi(t)\right\}, \quad\left\{\mathcal{R}_t \mid \mathcal{R}_t=(1-t) \mathcal{P}_1+t \mathcal{P}_2\right\},
\]
% 
respectively, where $0 \leq t \leq 1$ and $\phi(t)$ is a normalization factor that ensures that $\mathcal{R}_t$ is a distribution.

We can parameterize the distributions $\mathcal{P} \in \mathcal{S}$ using parameters known as natural parameters. In Equation~\eqref{ept}, we have described the relationship between a distribution $\mathcal{P}$ and a natural parameter vector $\boldsymbol{\theta}=\left(\theta_{2, \ldots, 1}, \ldots, \theta_{I_1, \ldots, I_d}\right)$. The natural parameter $\theta$ serves as a coordinate system of $\mathcal{S}$, hence any distribution in $\mathcal{S}$ is specified by determining $\boldsymbol{\theta}$. Furthermore, we can also specify a distribution $\mathcal{P}$ by its expectation parameter vector $\boldsymbol{\eta}=\left(\eta_{2, \ldots, 1}, \ldots, \eta_{I_1, \ldots, I_d}\right)$, which corresponds to expected values of the distribution and an alternative coordinate system of $\mathcal{S}$. The definition of the expectation parameter $\boldsymbol{\eta}$ is described in Equations~\eqref{etatop} and~\eqref{ptoeta}. The pair of coordinates, $\theta$-coordinates and $\eta$-coordinates, are orthogonal with each other, which means that the Fisher information matrix $\mathbf{G}$ has the following property, $\mathbf{G}_{u, v}=\partial \eta_u / \partial \theta_v$ and $\left(\mathbf{G}^{-1}\right)_{u, v}=\partial \theta_u / \partial \eta_v$.  We can describe $e$- and $m$-geodesics using these parameters as follows.
% 
\[
\left\{\boldsymbol{\theta}^t \mid \boldsymbol{\theta}^t=(1-t) \boldsymbol{\theta}^{\mathcal{P}_1}+t \boldsymbol{\theta}^{\mathcal{P}_2}\right\}, \quad\left\{\boldsymbol{\eta}^t \mid \boldsymbol{\eta}^t=(1-t) \boldsymbol{\eta}^{\mathcal{P}_1}+t \boldsymbol{\eta}^{\mathcal{P}_2}\right\},
\]
% 
where $\boldsymbol{\theta}^{\mathcal{P}}$ and $\boldsymbol{\eta}^{\mathcal{P}}$ are the $\theta$- and $\eta$-coordinates of a distribution $\mathcal{P} \in \mathcal{S}$, respectively.

\subsection{Flatness and projections}
\label{flatandproject}
A submanifold is called $e$-flat if any $e$-geodesic connecting two points in it remains within the submanifold. The vertical descent of an $m$-geodesic from a point $\mathcal{P} \in \mathcal{S}$ onto an $e$-flat submanifold $\mathcal{S}_{\mathcal{B}_{e\text{-flat}}}$ is called the $m$-projection. Similarly, the $e$-projection is obtained by interchanging $e$ and $m$. The flatness of subspaces guarantees the uniqueness of the projection destination, denoted as $\mathcal{P}_{e\text{-flat}}$ or $\mathcal{P}_{m\text{-flat}}$, which minimizes the following KL divergence:
\begin{align*}
    \mathcal{P}_{e\text{-flat}} &= \mathop{\mathrm{argmin}}_{\mathcal{Q} \in \mathcal{S}_{\mathcal{B}_{e\text{-flat}}}} D_{KL}(\mathcal{P}, \mathcal{Q}), \\
    \mathcal{P}_{m\text{-flat}} &= \mathop{\mathrm{argmin}}_{\mathcal{Q} \in \mathcal{S}_{\mathcal{B}_{m\text{-flat}}}} D_{KL}(\mathcal{Q}, \mathcal{P}).
\end{align*}

\subsection{Theoretical remarks}
\label{Theoreticalremarks}
A submanifold with some natural parameters fixed at some constant value $c$ is $e$-flat, which follows directly from the definition of $e$-flatness. Here, our discussion focuses on $m$-projection onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}} = \left\{\mathcal{Q} \in \mathcal{S} \mid \theta_v=c \text { for all } v \in \Omega_d^{+} \setminus \mathcal{B}\right\}$, where $\mathcal{B}$ can be any index set satisfying $\mathcal{B} \subseteq \Omega_d^{+}$. Since the constraint $\theta_v=c$ is linear and the KL divergence function is convex, the optimal solution $\mathcal{P}^{\mathcal{B},c}= \mathop{\mathrm{argmin}}_{\mathcal{R} \in \mathcal{S}^{c}_{\mathcal{B}}} D_{K L}(\hat{\mathcal{P}}, \mathcal{R})$ always uniquely exists. From another perspective, the $e$-flat \( \mathcal{S}^{c}_{\mathcal{B}} \) forms a convex set. Consequently, this optimization problem involves minimizing a convex function over a convex set and is therefore a convex optimization problem. If a space is both $e$-flat and $m$-flat, it is called dually-flat. The space $\mathcal{S}$ of discrete probability distributions is dually-flat.

\subsection{Examples for Möbius function}
\label{efmf}
In the proposed method, we transform the distribution $\mathcal{P} \in \mathbb{R}_{> 0}^{I_1 \times \dots \times I_d}$ using the Möbius function, defined in Section~\ref{prelimiaries}. By Equation~\eqref{ptoeta}, we can express $\mathcal{P}$ in terms of the expectation parameter $\boldsymbol{\eta}$. For example, for $d=2,3$:
\begin{align*}
    \mathcal{P}_{i_1, i_2} &= \eta_{i_1, i_2}-\eta_{i_1+1, i_2}-\eta_{i_1, i_2+1}+\eta_{i_1+1, i_2+1}, \\
    \mathcal{P}_{i_1, i_2, i_3} &= \eta_{i_1, i_2, i_3}-\eta_{i_1+1, i_2, i_3}-\eta_{i_1, i_2+1, i_3}-\eta_{i_1, i_2, i_3+1} \\
    &\quad +\eta_{i_1+1, i_2+1, i_3}+\eta_{i_1+1, i_2, i_3+1}+\eta_{i_1, i_2+1, i_3+1}-\eta_{i_1+1, i_2+1, i_3+1},
\end{align*}
where we assume $\eta_{I_1+1, i_2}=\eta_{i_1, I_2+1}=0$ and $\eta_{I_1+1, i_2, i_3}=\eta_{i_1, I_2+1, i_3}=\eta_{i_1, i_2, I_3+1}=0$.

\subsection{Examples for application in tensor decomposition}
\label{td}
An application where the effectiveness of the choice of $c$ can be more easily observed is in the compression of multi-dimensional data (e.g., images).
To illustrate this point, we revisit the example introduced earlier. The input tensor is:
$$\left[
\begin{array}{ccccccc}833 & 1 & 2 & 4 & 7 & 4 & 8 \\ 430 & 33 & 5 & 1 & 711 & 112 & 4 \\ 39 & 6 & 29 & 2 & 9 & 3 & 121 \\ 2 & 2 & 8 & 6 & 311 & 10 & 122\end{array}\right].$$
Let us use the model submanifold $\mathcal{S}^{c}_{\mathcal{B}_1}$ with $\mathcal{B}_1$, which denotes the index set of one-body natural parameters (the first row and column of the $\theta$ value in the matrix).
$\theta$-parameters of each tensor in the submanifold are in the form of
\[
\left[
\begin{array}{ccccccc}
\theta_{11} & \theta_{12} & \theta_{13} & \theta_{14} & \theta_{15} & \theta_{16} & \theta_{17} \\
\theta_{21} & c & c & c & c & c & c \\
\theta_{31} & c & c & c & c & c & c \\
\theta_{41} & c & c & c & c & c & c \\
\end{array}
\right].
\]
In the view of multi-dimensional data compression, the original matrix $\mathcal{X}$ requires $4 \times 7 = 28$ values to represent it. 
However, by approximating it by projecting it onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}_1}$, we only need to store 11 parameters, the optimized values of the one-body $\theta$-parameters and the constant $c$ (the traditional method just sets $c=0$, but the number of parameters that need to be stored is still 11). In other words, in our tensor decomposition task, sparsity does not appear in the original matrix space; instead, it manifests in the \( \theta \)-coordinate space.
As shown in the introduction, selecting the submanifold $\mathcal{S}^{0}_{\mathcal{B}_1}$
results in a KL error of 0.46 and an RMSE of 0.56. 
In contrast, choosing $\mathcal{S}^{0.54}_{\mathcal{B}_1}$ as the model submanifold reduces the KL error to 0.19 and the RMSE to 0.24---nearly half of the previous values. 
This demonstrates that varying the value of $c$ can significantly affect the reconstruction quality of the tensor.
\subsection{Examples for log-linear model}
\label{elog}
As an example of the \emph{log-linear model}~\cite{agresti2013categorical}, consider the distribution of an $n$-dimensional binary vector $\mathbf{x} = (x^1, \ldots, x^n) \in \{0,1\}^n$, where the log-probability is expressed as:

\[
\log p(\mathbf{x}) = \sum_i \theta^i x^i + \sum_{i<j} \theta^{ij} x^i x^j + \sum_{i<j<k} \theta^{ijk} x^i x^j x^k + \cdots + \theta^{1\ldots n} x^1 x^2 \cdots x^n - \psi,
\]

where $\boldsymbol{\theta} = (\theta^1, \ldots, \theta^{1\ldots n})$ is the natural parameter vector, and $\psi$ is the log-partition function (normalizer). The corresponding expectation parameters $\boldsymbol{\eta} = (\eta^1, \ldots, \eta^{1\ldots n})$ represent the expected values of variable combinations:

\[
\eta^i = \mathbb{E}[x^i] = \Pr(x^i = 1), \quad
\eta^{ij} = \mathbb{E}[x^i x^j] = \Pr(x^i = x^j = 1), \quad
\eta^{1\ldots n} = \mathbb{E}[x^1 \cdots x^n] = \Pr(x^1 = \cdots = x^n = 1).
\]
\section{Proofs}
\label{appendix:proof}
\begin{replemma}[\ref{lem:closeform}]
For any input tensor ${\hat{\mathcal{P}} \in \mathbb{R}_{> 0}^{I_1 \times \cdots \times I_d}}$ and its $m$-projection $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}$ onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}_{I_j,m_j}}$, we have
\begin{equation}
     \sum_{i_j=m_j}^{I_{j}} \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}= \sum_{i_j=m_j}^{I_{j}} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}.
\end{equation}
\end{replemma}
\begin{proof}
Because 
$\eta^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d} = \hat{\eta}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}, \, i_j = 1,\ldots,m_j,$ 
and 
$\mu_{i_1 \ldots i_d}^{i_1^{\prime}, \ldots, i_d^{\prime}} = \prod_{k=1}^d \mu_{i_k}^{i_k^{\prime}}$, it follows that
\begin{flalign*}
&\sum_{i_j=m_j}^{I_j}\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}\\ 
=\ &\sum_{i_j=m_j}^{I_j} \sum_{\left(i_1^{\prime}, \ldots, i_d^{\prime}\right) \in \Omega_d} \mu_{i_1 \ldots i_d}^{i_1^{\prime}, \ldots, i_d^{\prime}} \eta^{\mathcal{B}_{I_j,m_j},c}_{i_1^{\prime}, \ldots, i_d^{\prime}} \\
=\ &\sum_{i_j=m_j}^{I_j} \sum_{\left(i_1^{\prime}, \ldots, i_d^{\prime}\right) \in \Omega_d} \left( \prod_{k=1}^d \mu_{i_k}^{i_k^{\prime}} \right) \eta^{\mathcal{B}_{I_j,m_j},c}_{i_1^{\prime}, \ldots, i_d^{\prime}} \\
=\ &\sum_{i_j=m_j}^{I_j} \sum_{i_{1}^{\prime}=i_1}^{i_{1}+1} \cdots \sum_{i_{j-1}^{\prime}=i_{j-1}}^{i_{j-1}+1} \sum_{i_{j+1}^{\prime}=i_{j+1}}^{i_{j+1}+1} \cdots \sum_{i_{d}^{\prime}=i_{d}}^{i_{d}+1}\left( \prod_{\substack{k = 1 \\ k \neq j}}^d \mu_{i_k}^{i_k^{\prime}} \right) \left( \sum_{i_{j}^{\prime}=i_j}^{i_{j}+1} \mu_{i_j}^{i_j^{\prime}} \eta^{\mathcal{B}_{I_j,m_j},c}_{i_1^{\prime}, \ldots, i_d^{\prime}} \right) \\
=\ &\sum_{i_{1}^{\prime}=i_1}^{i_{1}+1} \cdots \sum_{i_{j-1}^{\prime}=i_{j-1}}^{i_{j-1}+1} \sum_{i_{j+1}^{\prime}=i_{j+1}}^{i_{j+1}+1} \cdots \sum_{i_{d}^{\prime}=i_{d}}^{i_{d}+1} \left( \prod_{\substack{k = 1 \\ k \neq j}}^d \mu_{i_k}^{i_k^{\prime}}  \right) \left( \sum_{i_j=m_j}^{I_j} \left( \eta^{\mathcal{B}_{I_j,m_j},c}_{i_{1}^{\prime},\ldots, i_{j-1}^{\prime},i_{j},i_{j+1}^{\prime},\ldots,i_d^{\prime}} - \eta^{\mathcal{B}_{I_j,m_j},c}_{i_{1}^{\prime},\ldots, i_{j-1}^{\prime},i_{j}+1,i_{j+1}^{\prime},\ldots,i_d^{\prime}} \right) \right)
\end{flalign*}
\begin{flalign*}
\hspace*{25pt}=\ &\sum_{i_{1}^{\prime}=i_1}^{i_{1}+1} \cdots \sum_{i_{j-1}^{\prime}=i_{j-1}}^{i_{j-1}+1} \sum_{i_{j+1}^{\prime}=i_{j+1}}^{i_{j+1}+1} \cdots \sum_{i_{d}^{\prime}=i_{d}}^{i_{d}+1} \left( \prod_{\substack{k = 1 \\ k \neq j}}^d \mu_{i_k}^{i_k^{\prime}}  \right)\left( \eta^{\mathcal{B}_{I_j,m_j},c}_{i_{1}^{\prime},\ldots, i_{j-1}^{\prime},m_j,i_{j+1}^{\prime},\ldots,i_d^{\prime}} \right) &\\
=\ &\sum_{i_{1}^{\prime}=i_1}^{i_{1}+1} \cdots \sum_{i_{j-1}^{\prime}=i_{j-1}}^{i_{j-1}+1} \sum_{i_{j+1}^{\prime}=i_{j+1}}^{i_{j+1}+1} \cdots \sum_{i_{d}^{\prime}=i_{d}}^{i_{d}+1} \left( \prod_{\substack{k = 1 \\ k \neq j}}^d \mu_{i_k}^{i_k^{\prime}}  \right)\left( \hat{\eta}_{i_{1}^{\prime},\ldots, i_{j-1}^{\prime},m_j,i_{j+1}^{\prime},\ldots,i_d^{\prime}} \right) &\\
=\ &\sum_{i_j=m_j}^{I_{j}} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}. &
\end{flalign*}

\end{proof}

\begin{reptheorem}[\ref{the:closeform}]
For any input tensor ${\hat{\mathcal{P}} \in \mathbb{R}_{> 0}^{I_1 \times \cdots \times I_d}}$, its $m$-projection $\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}$ onto the submanifold $\mathcal{S}^{c}_{\mathcal{B}_{I_j,m_j}}$ is given as
\begin{align*}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d} 
&= \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}
\end{align*}
for $i_j = 1, 2, \dots, m_j - 1$ and
\begin{align}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}
= \left(\sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right) 
\exp\left(c k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s \right)\left( \sum_{k=0}^{I_{j}- m_j} \exp\left(ck \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)\right)^{-1} \nonumber
\end{align}
for $k=0, \dots, I_j-m_j$.
% \begin{equation}
% \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} 
% = \left(\sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right) 
% \frac{e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}}{\sum_{k=0}^{I_{j}- m_j} e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}}, \quad 
% k = 0, \ldots, I_j-m_j
% \end{equation}
\end{reptheorem}

\begin{proof}

Recall that 
$\eta^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}=\hat{\eta}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}$ for $i_j=1,\ldots,m_j$.
We have
\begin{equation}
\begin{split}
 \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}
&= \sum_{i_{1}^{\prime}=i_1}^{i_{1}+1} \cdots \sum_{i_{j}^{\prime}=i_{j}}^{i_{j}+1} \cdots \sum_{i_{d}^{\prime}=i_d}^{i_d+1}  \left( \prod_{k=1}^d \mu_{i_k}^{i_k^{\prime}} \right) \eta^{\mathcal{B}_{I_j,m_j},c}_{i_1^{\prime}, \ldots, i_d^{\prime}} \\
&=\sum_{i_{1}^{\prime}=i_1}^{i_{1}+1} \cdots \sum_{i_{j}^{\prime}=i_{j}}^{i_{j}+1} \cdots \sum_{i_{d}^{\prime}=i_d}^{i_d+1}  \left( \prod_{k=1}^d \mu_{i_k}^{i_k^{\prime}} \right) \hat{\eta}_{i_1^{\prime}, \ldots, i_d^{\prime}} \\
&=\hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},i_{j},i_{j+1},\ldots,i_d}, \quad i_j = 1, 2, \dots, m_j-1.
\end{split}
\end{equation}  
Moreover,
\begin{equation}
   \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}=\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j,i_{j+1},\ldots,i_d} e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d  i_s\right)}.
\end{equation}
Therefore,
\begin{align*}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j,i_{j+1},\ldots,i_d}\left(\sum_{k=0}^{I_j-m_j}  e^{c\left(k\prod_{\substack{s = 1 \\ s \neq j}}^d  i_s\right)}\right)
&=\left(\sum_{k=0}^{I_{j}- m_j} \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right)\\
&=\left(\sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right),
\end{align*}
\begin{align*}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j,i_{j+1},\ldots,i_d}=\left(\sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right)\frac{1}{\left( \sum_{k=0}^{I_{j}- m_j} e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}\right)}.
\end{align*}

\end{proof}



\begin{reptheorem}[\ref{the:lowerbound}]
\( F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \max{\{l, -l\}} \), where
\begin{align*}
l = \frac{c(I_j\!-\!m_j)(I_j\!-\!m_j\!+\!1)\prod_{\substack{h=1 \\ h\neq j}}^d I_h(1\!+\!I_h)}{2^d\left(\prod_{j=1}^d I_j\right)^{s_{min}}}.
\end{align*}
\end{reptheorem}
\begin{proof}
Let us consider
\begin{equation}
F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c})=-\sum_{i_1=1}^{I_1} \ldots \sum_{i_d=1}^{I_d}\left\{\hat{\mathcal{P}}_{i_1 \ldots i_d} \log {\mathcal{P}_{i_1, \ldots, i_d}^{\mathcal{B}_{I_j,m_j},c}}\right\}.
\end{equation}
Because $ \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j},\dots,i_d}=\hat{\mathcal{P}}_{i_{1},\ldots, i_{j},\ldots,i_d}$ for every $i_j = 1, 2, \dots, m_j-1$ and for any $c$, we only need to consider
\begin{equation}
     f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) =
    -\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}}\sum_{k=0}^{I_j-m_j} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} 
    \left\{ 
    \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} 
    \log \mathcal{P}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}^{\mathcal{B}_{I_j,m_j},c}
    \right\}.
\end{equation}
Moreover, $F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c})\geq f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c})$ always holds. 
On the one hand,
\begin{equation}
\begin{aligned}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} &= 
\left(\sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right) 
\frac{e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d  i_s\right)}}{\left( \sum_{k=0}^{I_{j}- m_j} e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}\right)} \\
&\leq \frac{e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d  i_s\right)}}{\left( \sum_{k=0}^{I_{j}- m_j} e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}\right)}=\frac{1}{\left( \sum_{h=-k}^{I_{j}- m_j-k} e^{c\left(h \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}\right)}\\
&\leq e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}.
\end{aligned}
\end{equation}
Therefore, by applying $\min(\hat{\mathcal{P}})= \frac{1}{\left(\prod_{j=1}^{d} I_j\right)^{s_{min}}}$, 
\begin{equation}
\begin{split}
f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c})
&\geq -\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}}\sum_{k=0}^{I_j-m_j} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d}\left\{\frac{1}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{min}}}\log\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right\}\\
&\geq -\frac{1}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{min}}}\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}}\sum_{k=0}^{I_j-m_j} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} \left\{ c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right) \right\}\\
&=  -\frac{c}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{min}}}\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} \left( \prod_{\substack{s = 1 \\ s \neq j}}^d i_s \right)\left(\sum_{k=0}^{I_j-m_j} k  \right)\\
&=-\frac{c\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d I_h \left( 1 + I_h \right) }{2^d\left(\prod_{j=1}^{d} I_j \right)^{s_{min}}}.
\end{split}
\end{equation}
On the other hand, 
\begin{equation}
\begin{aligned}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} 
&\leq \frac{e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d  i_s\right)}}{\left( \sum_{k=0}^{I_{j}- m_j} e^{c\left(k \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}\right)}=\frac{1}{\left( \sum_{h=-k}^{I_{j}- m_j-k} e^{c\left(h \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}\right)}\\
&\leq e^{c\left(-(I_j-m_j-k) \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right)}.
\end{aligned}
\end{equation}
Therefore we have
\begin{equation}
\begin{split}
f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c})
&\geq -\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}}\sum_{k=0}^{I_j-m_j} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d}\left\{ \frac{1}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{min}}}\log \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right\}\\
&\geq -\frac{1}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{min}}}\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}}\sum_{k=0}^{I_j-m_j} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} \left\{ c\left(-\left(I_j-m_j-k\right) \prod_{\substack{s = 1 \\ s \neq j}}^d i_s\right) \right\}\\
&=  \frac{c}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{min}}}\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} \left( \prod_{\substack{s = 1 \\ s \neq j}}^d i_s \right)\left(\sum_{k=0}^{I_j-m_j} \left(I_j-m_j-k\right)  \right)\\
&=\frac{c\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d I_h\left( 1 + I_h \right) }{2^d \left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{min}}}.
\end{split}
\end{equation} 
\end{proof}


\begin{repcorollary}[\ref{col:c_bounds}]
To satisfy the condition \( D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \leq D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0}) \), for every $j = 1, \dots, d$, \( c \) should at least satisfy $- l \leq c \leq l$, where
\begin{equation}
\begin{aligned}
l &= \frac{2^d  s_{max}  \log\left( \prod_{j=1}^{d} I_j \right)}{\left( I_j-m_j \right) \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right) \left(  \prod_{j=1}^{d} I_j \right)^{s_{max}-s_{min}}} < \frac{2^d \log\left( \prod_{j=1}^{d} I_j \right) b^2}{\left( I_j-m_j \right) \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right) a^2}.
\end{aligned}
\end{equation}
\end{repcorollary}
\begin{proof}
For $c=0$, let us define
\begin{align*}
    h_{i_{1},\ldots, i_{j-1},i_{j+1},\ldots,i_d} &:= \sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}
    \leq \frac{I_j-m_j+1}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{max}}}.
\end{align*}
% $ \sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} :=h_{i_{1},\ldots, i_{j-1},i_{j+1},\ldots,i_d}\leq  
% \frac{I_j-m_j+1}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{max}}} $, then
Then it follows that
\begin{align*}
\mathcal{P}^{\mathcal{B}_{I_j,m_j},0}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} &= 
\left(\sum_{k=0}^{I_{j}- m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}\right) 
\frac{1}{I_j-m_j+1}\\
&=\frac{h_{i_{1},\ldots, i_{j-1},i_{j+1},\ldots,i_d}}{I_j-m_j+1},
\end{align*}
\begin{equation}
\begin{split}
 f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0}) &=
    -\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}}\sum_{k=0}^{I_j-m_j} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} 
    \left\{ 
    \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} 
    \log \mathcal{P}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d}^{\mathcal{B}_{I_j,m_j},0}
    \right\}\\
&=  -\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}}\sum_{k=0}^{I_j-m_j} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} 
    \left\{ 
    \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} 
    \log \frac{h_{i_{1},\ldots, i_{j-1},i_{j+1},\ldots,i_d}}{I_j-m_j+1}
    \right\}\\   
&= 
    -\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d}  \left(\log \frac{h_{i_{1},\ldots, i_{j-1},i_{j+1},\ldots,i_d}}{I_j-m_j+1}\right) \left( \sum_{k=0}^{I_j-m_j} \hat{\mathcal{P}}_{i_{1},\ldots, i_{j-1},m_j+k,i_{j+1},\ldots,i_d} \right)\\
&= -\sum_{i_1=1}^{I_1} \cdots \sum_{i_{j-1}=1}^{I_{j-1}} \sum_{i_{j+1}=1}^{I_{j+1}}\cdots \sum_{i_d=1}^{I_d} \left( \log h_{i_{1},\ldots, i_{j-1},i_{j+1},\ldots,i_d} -\log (I_j-m_j+1) \right)h_{i_{1},\ldots, i_{j-1},i_{j+1},\ldots,i_d}\\
&\leq  \frac{s_{max} \log\left( \prod_{j=1}^{d} I_j  \right)  \left(I_j-m_j+1\right)}{\left(\prod_{j=1}^{d} \left(I_j \right)\right)^{s_{max}}}   \prod_{\substack{h = 1 \\ h \neq j}}^d I_h.
\end{split}
\label{upij0}
\end{equation}
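Let $\underline{f}$ denote the lower bound of $f$ obtained in the proof of Theorem~\ref{the:lowerbound} and $\overline{f}$ the upper bound in Equation~\eqref{upij0}. If $ \underline{f}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq \overline{f}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0}) $, that is, if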
\begin{align*}
c \leq -\frac{2^d  s_{max}  \log\left( \prod_{j=1}^{d} I_j \right)}{\left( I_j-m_j \right)\left(  \prod_{j=1}^{d} I_j \right)^{s_{max}-s_{min}} \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right)}
\intertext{or}
c\geq \frac{2^d  s_{max}  \log\left( \prod_{j=1}^{d} I_j \right)}{\left( I_j-m_j \right)\left(  \prod_{j=1}^{d} I_j \right)^{s_{max}-s_{min}} \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right)},
\end{align*}
then $D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},0})$.
Therefore, $c$ should at least satisfy:
\begin{equation}
  -\frac{2^d  s_{max}  \log\left( \prod_{j=1}^{d} I_j \right)}{\left( I_j-m_j \right) \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right) \left(  \prod_{j=1}^{d} I_j \right)^{s_{max}-s_{min}}}\leq c \leq \frac{2^d  s_{max}  \log\left( \prod_{j=1}^{d} I_j \right)}{\left( I_j-m_j \right) \prod_{\substack{h = 1 \\ h \neq j}}^d \left(1+I_h\right) \left(  \prod_{j=1}^{d} I_j \right)^{s_{max}-s_{min}}}.
\end{equation}

\end{proof}

\begin{reptheorem}[\ref{the:finalresult}]
  For many-body approximation of $\hat{\mathcal{P}}$, to satisfy the condition $D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,c}) \leq D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,0})$ for every $h=1,\dots,d$, $c$ should at least satisfy: 
\begin{equation}
-\min_{j=1,2,\ldots,d} \, l_j \leq c \leq \min_{j=1,2,\ldots,d} \, l_j,
\end{equation}
where 
\begin{align*}
l_j &= \frac{2^{d}\left(\left(s_{min}-1\right)d+1\right) I_j \log \left(\tau \right)\left(\tau \right)^{s_{min}}}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right) {\left(\tau\right)^{s_{max}}}}
  < \frac{2^{d}\left(d+1\right) I_j \log \left(\tau \right)b^2}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right) \left(a^2\right)},  \nonumber \\
j &= 1, \ldots, d, \quad \tau=\prod_{j=1}^{d} I_j.
\end{align*}
%   Consider the many-body approximation, if we want $D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,c}) \leq D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,0})$, $h=1,\ldots,d$ ,then $c$ should at least satisfies:
 
% \begin{equation}
% -\min_{j=1,2,\ldots,d} \, l_j \leq c \leq \min_{j=1,2,\ldots,d} \, l_j
% \end{equation}
% where 
% \begin{equation}
% l_j = \frac{2^{d}\left(\left(s_{min}-1\right)d+1\right) I_j \log\left(\prod_{j=1}^{d} I_j \right)\left(\prod_{j=1}^{d} I_j\right)^{s_{min}}}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right)\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right)\left(\prod_{j=1}^{d} I_j\right)^{s_{max}}} < \frac{2^{d}\left(d+1\right) I_j \log\left(\prod_{j=1}^{d} I_j \right)b^2}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right)\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right)a^2}, \quad j = 1, \ldots, d.
% \label{eq:c_bounds}
% \end{equation}
\end{reptheorem}

\begin{proof}
According to the closed formula of $\mathcal{P}^{\mathcal{B}_1,0}$,
\begin{equation}
\begin{split}
\mathcal{P}^{\mathcal{B}_1,0}_{ \, i_1, \ldots, i_d} &= \prod_{k=1}^d \left(\sum_{i_1^{\prime}=1}^{I_1} \cdots \sum_{i_{k-1}^{\prime}=1}^{I_{k-1}} \sum_{i_{k+1}^{\prime}=1}^{I_{k+1}} \cdots \sum_{i_d^{\prime}=1}^{I_d} \hat{\mathcal{P}}_{i_1^{\prime}, \ldots, i_{k-1}^{\prime}, i_k, i_{k+1}^{\prime}, \ldots, i_d^{\prime}}\right) \\
&\geq \prod_{k=1}^d\left( \frac{1}{\left(\prod_{j=1}^{d} I_j\right)^{s_{min}}} \prod_{\substack{j = 1 \\ j \neq k}}^d I_j\right)\\
&=\left(\prod_{j=1}^{d} I_j \right)^{\left(1-s_{min}\right)d-1 }.
\end{split}
\end{equation}
  Moreover, $D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq D_{\mathrm{KL}}(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_1,0})$ if and only if $F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_1,0})$.
 Therefore, 
\begin{equation}
\begin{split}
F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_1,0}) &= -\sum_{i_1=1}^{I_1} \ldots \sum_{i_d=1}^{I_d} \hat{\mathcal{P}}_{i_1 \ldots i_d} \log \mathcal{P}^{\mathcal{B}_1,0}_{ \, i_1, \ldots, i_d} \\
&\leq -\sum_{i_1=1}^{I_1} \ldots \sum_{i_d=1}^{I_d} 
\frac{1}{\left(\prod_{j=1}^{d} I_j\right)^{s_{max}}}     \log\mathcal{P}^{\mathcal{B}_1,0}_{ \, i_1, \ldots, i_d}\\
&\leq \frac{\left(\left(s_{min}-1\right)d+1\right)\left(\prod_{j=1}^{d} I_j \right) \log \prod_{j=1}^{d} I_j}{\left(\prod_{j=1}^{d} I_j\right)^{s_{max}}}. 
\end{split}
\label{upbofb1}
\end{equation}
On the one hand, 
\begin{equation}
\begin{split}
 F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) &\geq f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \\
 &\geq \frac{c\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d I_h\left( 1 + I_h \right) }{2^d \left(\prod_{j=1}^{d} I_j\right)^{s_{min}}}.
\end{split}
\end{equation}
On the other hand, 
\begin{equation}
\begin{split}
 F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) &\geq f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \\
 &\geq -\frac{c\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d I_h\left( 1 + I_h \right) }{2^d \left(\prod_{j=1}^{d} I_j\right)^{s_{min}}}.
\end{split}
\end{equation}
If
\begin{equation}
\begin{aligned}
&-\frac{c\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d I_h\left( 1 + I_h \right) }{2^d \left(\prod_{j=1}^{d} I_j\right)^{s_{min}}} \geq \frac{\left(\left(s_{min}-1\right)d+1\right)\left(\prod_{j=1}^{d} I_j \right) \log \prod_{j=1}^{d} I_j}{{\left(\prod_{j=1}^{d} I_j\right)^{s_{max}}}} \\
&\Leftrightarrow 
c \leq -\frac{2^{d}\left(\left(s_{min}-1\right)d+1\right) I_j \log \prod_{j=1}^{d} I_j}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right) {\left(\prod_{j=1}^{d} I_j\right)^{s_{max}-s_{min}}} }
\end{aligned}
\end{equation}
or
\begin{equation}
\begin{aligned}
&\frac{c\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d I_h\left( 1 + I_h \right) }{2^d \left(\prod_{j=1}^{d} I_j\right)^{s_{min}}} \geq \frac{\left(\left(s_{min}-1\right)d+1\right)\left(\prod_{j=1}^{d} I_j \right) \log \prod_{j=1}^{d} I_j}{{\left(\prod_{j=1}^{d} I_j\right)^{s_{max}}}} \\
&\Leftrightarrow 
c \geq \frac{2^{d}\left(\left(s_{min}-1\right)d+1\right) I_j \log \prod_{j=1}^{d} I_j}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right) 
\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right) {\left(\prod_{j=1}^{d} I_j\right)^{s_{max}-s_{min}}}},
\end{aligned}
\end{equation}
then $F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,c}) \geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq f(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_{I_j,m_j},c}) \geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_1,0})\geq F(\hat{\mathcal{P}} ; \mathcal{P}^{\mathcal{B}_h,0})$.
Therefore, $c$ should at least satisfy
\begin{equation}
-\min_{j=1,2,\ldots,d} \, l_j \leq c \leq \min_{j=1,2,\ldots,d} \, l_j,
\end{equation}
where
\begin{align*}
l_j &= \frac{2^{d}\left(\left(s_{min}-1\right)d+1\right) I_j \log\left(\prod_{j=1}^{d} I_j \right)\left(\prod_{j=1}^{d} I_j\right)^{s_{min}}}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right)\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right)\left(\prod_{j=1}^{d} I_j\right)^{s_{max}}}\\
&< \frac{2^{d}\left(d+1\right) I_j \log\left(\prod_{j=1}^{d} I_j \right)b^2}{\left( I_j - m_j\right)\left( I_j - m_j+ 1 \right)\prod_{\substack{h = 1 \\ h \neq j}}^d \left( 1 + I_h \right)a^2}
\label{eq:c_bounds}
\end{align*}
for each $j = 1, \dots, d$.
\end{proof}








\begin{reptheorem}[\ref{maxentropy}]
Consider the set:
\[
\widetilde{\mathcal{P}}^{\mathcal{B}}=\bigcup_{c \in \mathbb{R}} \mathcal{P}^{\mathcal{B},c}, \quad \mathcal{P}^{\mathcal{B},c} = \underset{\mathcal{R} \in \mathcal{S}_\mathcal{B}^c}{\operatorname{argmin}} \, D_{\mathrm{KL}}(\hat{\mathcal{P}}, \mathcal{R}), 
\]
we have \( \mathcal{P}^{\mathcal{B},0} \in \bigcup_{c \in \mathbb{R}} \mathcal{P}^{\mathcal{B},c} \) and \( \mathcal{P}^{\mathcal{B},0} \) maximizes the entropy in the set \(\widetilde{\mathcal{P}}^{\mathcal{B}} \).
\end{reptheorem}

% Proof of the theorem
\begin{proof}
 The Legendre transformation~\cite{amari2000methods} of $\psi(\theta)=-\theta_{1,\dots,1}$ is given as
\[
\varphi(\eta) = \max_{\theta^{\prime}} \left(\theta^{\prime} \eta - \psi(\theta^{\prime})\right), \quad \theta^{\prime} \eta = \sum_{x \in \Omega_d^{+}} \theta_x^{\prime} \eta_x.
\]
Then $\varphi(\eta)$ coincides with the negative entropy, which is defined as
\[
\varphi(\eta) = \sum_{\left(i_1, \ldots, i_d\right) \in \Omega_d} \mathcal{P}_{i_1, \ldots, i_d} \log \mathcal{P}_{i_1, \ldots, i_d}.
\]
Thus, it is clear that \(\varphi(\eta)\) is a convex function that attains the minimum value. Moreover, we also have
\[
\frac{\partial \varphi(\eta)}{\partial \eta_{x}} = \frac{\partial}{\partial \eta_{x}}(\theta \eta - \psi(\theta)) = \theta_{x}.
\]
This holds for all $\mathcal{P}^{\mathcal{B},c}$, where $\mathcal{P}^{\mathcal{B},c} = \mathcal{S}^c_{\mathcal{B}} \cap \mathcal{S}^{\mathcal{B}}_{\hat{\mathcal{P}}}$.
Therefore, $\mathcal{P}^{\mathcal{B},0}$ satisfies the following.
\[
\frac{\partial \varphi(\eta)}{\partial \eta_{v}} = \frac{\partial}{\partial \eta_{v}}(\theta \eta - \psi(\theta)) = \theta_{v} = 0, \quad \forall v \in \Omega_d^{+} \backslash \mathcal{B},
\]
\[
\eta_s = \hat{\eta}_s \quad \forall s \in  \mathcal{B}.
\]
This shows that $\varphi(\eta(\mathcal{P}^{\mathcal{B},0}))$ attains the minimum value in the set $\bigcup_{c \in \mathbb{R}} \mathcal{P}^{\mathcal{B},c}$.
\end{proof}







\end{document}
