\documentclass[accepted]{uai2024} 
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
\usepackage{soul, color}
\usepackage{colortbl}
\usepackage{yhmath}
\usepackage{bm}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{tikz}
\usetikzlibrary{positioning}
\usetikzlibrary{fit}
\usetikzlibrary{trees}
\usetikzlibrary{shadings}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{bbold}
\usepackage{arydshln}
\usepackage{listings}
\usepackage{tabularx}
\usepackage{url}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage{enumitem}

\allowdisplaybreaks[1]
\usepackage[inkscapearea=page]{svg}

%% Some suggested packages, as needed:
\usepackage{natbib} 
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}

\graphicspath{ {./pics/} }

\newcommand{\mutil}{\widetilde{\mu}}
\newcommand{\sigtil}{\widetilde{\sigma}}
\newcommand{\nutil}{\widetilde{\nu}}
\newcommand{\pitil}{\widetilde{\pi}}
\newcommand{\ptil}{\widetilde{p}}
\newcommand{\qtil}{\widetilde{q}}
\newcommand{\util}{\widetilde{u}}
\newcommand{\btil}{\widetilde{b}}
\newcommand{\Btil}{\widetilde{B}}
\newcommand{\Util}{\widetilde{U}}
\newcommand{\pihat}{\widehat{\pi}}
\usetikzlibrary{positioning,decorations.pathreplacing,shapes}
\newcommand{\bbf}{\bm{b}}
\newcommand{\ubf}{\bm{u}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\var}{\operatorname{Var}}
\newcommand{\yT}{\widetilde{\bm{y}}}
\newcommand{\aT}{\widetilde{\bm{a}}}
\newcommand{\bT}{\widetilde{\bm{b}}}
\newcommand{\yH}{\widehat{\bm{y}}}
\newcommand{\bH}{\widehat{\bm{b}}}
\newcommand{\aH}{\widehat{\bm{a}}}
\newcommand{\BH}{\widehat{\bm{B}}}
\newcommand{\UH}{\widehat{\bm{U}}}
\newcommand{\YH}{\widehat{\bm{Y}}}
\newcommand{\Bbf}{\bm{B}}
\newcommand{\Abf}{\bm{A}}
\newcommand{\Ybf}{\bm{Y}}
\newcommand{\Ubf}{\bm{U}}
\newcommand{\eT}{\widetilde{\bm{e}}}
\newcommand{\BT}{\widetilde{\bm{B}}}
\newcommand{\UT}{\widetilde{\bm{U}}}
\newcommand{\eH}{\widehat{\bm{e}}}
\newcommand{\mIcal}{\mid \mathcal{I}}
\newcommand{\Smb}{\bm{A}}%S_{\setminus B}}
\newcommand{\Sb}{\bm{S}_{B}}
\newcommand{\yh}{\hat{y}}
\newcommand\norm[1][\cdot]{\left\lVert#1\right\rVert}
\newcommand\abs[1][\cdot]{\left\lvert#1\right\rvert}
\newcommand{\hlcyan}[1]{{\sethlcolor{cyan}\hl{#1}}}
\DeclareRobustCommand{\hlcyan}[1]{{\sethlcolor{cyan}\hl{#1}}}
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codeblue}{rgb}{0.,0.5,0.99}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
\definecolor{Gray}{gray}{0.8}
\definecolor{lightyellow}{RGB}{245,238,197}
\newcolumntype{a}{>{\columncolor{lightyellow}}c}
\newcolumntype{b}{>{\columncolor{backcolour}}c}
\renewcommand\arraystretch{1.2}
\newcommand{\figwidth}{0.45\textwidth}

\DeclareFontFamily{U}{mathx}{}
\DeclareFontShape{U}{mathx}{m}{n}{<-> mathx10}{}
\DeclareSymbolFont{mathx}{U}{mathx}{m}{n}
\DeclareMathAccent{\widehat}{0}{mathx}{"70}
\DeclareMathAccent{\widecheck}{0}{mathx}{"71}

\newcommand{\Sbf}{\mathbf{S}}
\newcommand{\Gbf}{\mathbf{G}}
\newcommand{\Hbf}{\mathbf{H}}
\newcommand{\Ibf}{\mathbf{I}}
\newcommand{\Zbf}{\mathbf{Z}}
\newcommand{\Qbf}{\mathbf{Q}}
\newcommand{\Tbf}{\mathbf{T}}
\newcommand{\Fbf}{\mathbf{F}}
\newcommand{\Lbf}{\mathbf{L}}
\newcommand{\Dbf}{\mathbf{D}}
\newcommand{\Mbf}{\mathbf{M}}
\newcommand{\Cbf}{\mathbf{C}}
\newcommand{\Ebf}{\mathbf{E}}
\newcommand{\ybf}{\mathbf{y}}
\newcommand{\xbf}{\mathbf{x}}
\newcommand{\zbf}{\mathbf{z}}
\newcommand{\wbf}{\mathbf{w}}
\newcommand{\sbf}{\mathbf{s}}
\newcommand{\fbf}{\mathbf{f}}
\newcommand{\ebf}{\mathbf{e}}
\newcommand{\gammabf}{\bm{\gamma}}
\newcommand{\alphabf}{\bm{\alpha}}
\newcommand{\deltabf}{\bm{\delta}}
\newcommand{\mubf}{\bm{\mu}}
\newcommand{\Sigmabf}{\mathbf{\Sigma}}

\newcommand{\prob}{Prob \,}
\newcommand{\leb}{\mathcal{L}}
\newcommand{\counting}{\mathcal{C}}
\newcommand{\rr}{\mathbb{R}}
\newcommand{\nn}{\mathbb{N}}

\newcommand{\Bhat}{\widehat{B}}
\newcommand{\Uhat}{\widehat{U}}
\newcommand{\pitd}{\widetilde{\pi}_\mathrm{TD}}
\newcommand{\pibu}{\pihat_{bu}}
\newcommand{\piout}{\pi^\mathrm{out}}
\newcommand{\pibum}{\pihat_{bu(1:m)}}
\newcommand{\pibuM}{\pihat_{bu(1:M)}}
\newcommand{\pibar}{\Bar{\pi}}
\newcommand{\bibim}{b_1, \dots,  b_m}
\newcommand{\biplusm}{b_1 + \dots + b_m}
\newcommand{\bibar}{\Bar{b}}
\newcommand{\bicheck}{\Check{b}}
\newcommand{\bB}{\mathbf{\Bar{b}}}
\newcommand{\bC}{\mathbf{\Check{b}}}
\newcommand{\simIID}{\,\overset{\text{IID}}{\sim}\,}
\newcommand{\ulow}{\ubf^{\scriptscriptstyle low}}
\newcommand{\uupp}{\ubf^{\scriptscriptstyle upp}}
\newcommand{\Au}{\mathbf{A^{\hspace{-0.7mm}u}}}
\newcommand{\Su}{\mathbf{S^{\hspace{-0.2mm}u}}}
\newcommand{\mbar}{\Bar{m}}
\newcommand{\klow}{k_{low}}

\def\hrett{0.65cm}
\def\wrett{3.6cm}
\def\distrett{0.55cm}
\def\propmid{0.4}

\newcommand{\Lor}[1]{\textcolor{codegreen}{#1}}
\newcommand{\Gio}[1]{\textcolor{codepurple}{G: #1}}
\newcommand{\Dar}[1]{\textcolor{codeblue}{D: #1}}
\newcommand{\Nic}[1]{\textcolor{brown}{N: #1}}

\newtheorem{proposition}{Proposition}
\newtheorem{remark}{Remark}

\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}

\title{Probabilistic reconciliation of mixed-type hierarchical time series}
% Add authors
\author[1]{\href{mailto:<lorenzo.zambon@idsia.ch>?Subject=Your UAI 2024 paper}{Lorenzo~Zambon}{}}
\author[1]{Dario~Azzimonti}
\author[1]{Nicolò~Rubattu}
\author[1]{Giorgio~Corani}
% Add affiliations after the authors
\affil[1]{%
    Dalle Molle Institute for Artificial Intelligence (IDSIA), USI-SUPSI, Lugano, Switzerland
}

\begin{document}

\newcommand{\Syph}{\textit{Syph}}
\newcommand{\SyphSmall}{\textit{Syph-small}}
\newcommand{\Mfive}{\textit{M5}}

\maketitle


\begin{abstract}
Hierarchical time series are collections of time series that are formed via aggregation,
and thus adhere to some linear constraints.
The forecasts for
hierarchical time series should be \textit{coherent}, i.e., they should satisfy the
same constraints.
In a probabilistic setting, forecasts are in the form of predictive distributions. 
Probabilistic reconciliation adjusts the predictive distributions, yielding a joint reconciled distribution that assigns positive probability only to coherent forecasts.
There are methods for the reconciliation of hierarchies containing only Gaussian or only discrete predictive distributions; instead, the reconciliation of mixed hierarchies, i.e., mixtures of discrete and continuous time series, is still an open problem. 
We propose two different approaches to address this problem: \textit{mixed conditioning} and \textit{top-down conditioning}.
We discuss their properties and we present experiments with datasets containing up to thousands of time series.
\end{abstract}

\section{Introduction}

Hierarchical time series are collections of time series formed via aggregation. 
For example, the aggregation of the 
regional levels of tourism yields the national level of tourism; the aggregation of the  sales of individual items  yields the sales of a group  of items, and so on.
Forecasts for  hierarchical time series should 
be \textit{coherent}; for instance, 
the sum of the  forecasts of the regional tourism levels should match the forecast for the
national tourism level. 


%
Hierarchical forecasts are usually generated in two steps.
First,   incoherent  forecasts are independently generated for each time series (\textit{base forecasts}). 
Then,  they are adjusted   to 
become coherent (\textit{reconciliation}).
Reconciled forecasts, besides being coherent,  are  generally  more accurate than  the base forecasts: indeed, forecast reconciliation is a special case of forecast combination \citep{hollyman2021understanding}.
\cite{athanasopoulos2023-review} provide a review of 
methodologies and applications of forecast reconciliation.

Most  methods \citep{hyndman2011optimal, han21a-aistats, di2022forecast} only reconcile the point forecasts.
The state-of-the-art method is minT \citep{wickramasuriya2019optimal},
whose coherent forecasts  minimize the expected  mean squared error.
However, \textit{reconciled predictive distributions} are needed   
\citep{kolassa2022_do_we_want}
to support decision making.
A principled  definition of  probabilistic reconciliation was given by \cite{panagiotelis2022}.
The probabilistic reconciliation of Gaussian base forecasts and its relation with minT have been studied by
\cite{corani_reconc_ecml, wickramasuriya2023probabilistic}, while 
the probabilistic reconciliation of forecasts for count time series has been studied by
\cite{corani_probabilistic_2023, zambon2024efficient,zambon2024properties}.
An alternative research line is constituted by end-to-end models \citep{rangapuram2021end, rangapuram2023coherent,olivares_probabilistic_2023,Dirichlet_Proportions_Model2023}, 
which produce coherent forecasts for the entire hierarchy.

An open problem is, however, the reconciliation of \textit{mixed hierarchies}, 
whose disaggregated time series have low-count values,
while the aggregated ones are smooth and thus modelled as continuous.
This situation is  for instance common in retail \citep[Chap. 6.8]{boylan_intermittent_2021}  
 and there are  currently no suitable methods for this case:
``\textit{the development of algorithms to handle
mixtures of discrete and continuous data [...]
represents a bold research agenda}'' \citep{athanasopoulos2023-review}.

We propose two approaches for the probabilistic reconciliation of  mixed hierarchies. 
The first (\textit{mixed conditioning}) adopts reconciliation via conditioning \citep{corani_reconc_ecml, zambon2024efficient}.
It creates a mixed joint distribution of all the base forecasts, where the bottom are defined over counts and the upper over real numbers.
The joint predictive distribution is then conditioned on the hierarchy constraints,
yielding a coherent reconciled distribution that only includes coherent forecasts.
This approach is theoretically well-grounded but, for reasons discussed later, 
is not suitable for large hierarchies.

We thus propose also a second approach, which we call \textit{top-down conditioning}.
It works in two steps: first, the upper base forecasts are reconciled via conditioning, using only the hierarchical constraints between the upper;
then, the bottom distributions are updated in a probabilistic top-down fashion.
We show that this approach  successfully reconciles 
hierarchies containing thousands of intermittent bottom time series, taken
from the  M5 competition \citep{MAKRIDAKIS20221325}.


The paper is organized as follows.
In Sec.~\ref{sec: prob fore rec},  we discuss hierarchical forecasting and probabilistic reconciliation.
In Sec.~\ref{sec:mixed},  we show how to  reconcile mixed hierarchies via conditioning, while 
in Sec.~\ref{sec:topDown} we present the top-down conditioning approach.
We present experiments on real datasets in Sec.~\ref{sec:experiments}.
The conclusions are in Sec.~\ref{sec:conclusion}. 
We provide the proofs in the appendix.


\section{Hierarchical forecasting}
\label{sec: prob fore rec}   

Hierarchical time series are collections of time series that are formed via aggregation and therefore satisfy some summing constraints.
For instance, in Fig.~\ref{fig: simple tree}, the time series $u_1$ is equal to the sum of the time series $u_2$ and $u_3$, and so on.
The lowest level of the hierarchy contains the \textit{bottom time series}, which are denoted by $\bbf = [b_1,\dots,b_m]^T$.
All the remaining time series are referred to as aggregated or \textit{upper time series}, and are denoted by $\ubf = [u_1,\dots,u_k]^T$.
Finally, we denote by 
$\ybf = \left[\ubf^T, \bbf^T\right]^T \in \rr^n$
the vector of all the time series.
For simplicity, we do not show the time index.
The hierarchy constraints are expressed as:
\begin{equation}\label{eq: def S}
\ybf = \Sbf \bbf, \quad \text{with} \;\; \Sbf= \begin{bmatrix}
          \Abf \\ \hdashline[2pt/2pt]
          \Ibf
         \end{bmatrix},
\end{equation}
where $\Ibf \in \rr^{m \times m}$ is the identity matrix. 
$\Sbf \in \rr^{n \times m}$ is called \textit{summing matrix} and $\Abf \in \rr^{k \times m}$ \textit{aggregating matrix}. 
For the hierarchy of Fig.~\ref{fig: simple tree}, we have:
\begin{equation*}
\Abf \;\; = \begin{bmatrix}
        1 & 1 & 1 & 1 \\
        1 & 1 & 0 & 0 \\
        0 & 0 & 1 & 1\\
         \end{bmatrix}.
\end{equation*}


The \textit{base forecasts} are the univariate forecasts produced independently for each time series.
In this work, we assume the base forecasts to be in the form of predictive distributions.
We denote by  $\pihat_B$ and
 by $\pihat_U$ the  base forecast distributions for the  bottom and upper time series and by
$\pihat$ the base forecast distribution for the entire hierarchy.
Depending on the context, $\pi$ denotes either a probability mass function or a  density.

\begin{figure}[!ht]
    \centering
\begin{tikzpicture}[level/.style={sibling distance=40mm/#1}, level distance=0.8cm]
\node [circle,draw] {$u_1$}
  child {node [circle,draw]  {$u_2$}
    child {node [circle,draw] {$b_1$}
    }
    child {node [circle,draw] {$b_2$}
    }
  }
  child {node [circle,draw]  {$u_3$}
    child {node [circle,draw]{$b_3$}
    }
  child {node [circle,draw] {$b_4$}
  }
};
\end{tikzpicture}
    \vspace*{2 mm}
    \caption{A hierarchy with $4$ bottom and $3$ upper time series.}
    \label{fig: simple tree}
\end{figure}

\paragraph{Probabilistic reconciliation.}
Let us introduce the \textit{coherent} subspace $\mathcal{S} := \{ \ybf \in \rr^n : \; \ybf = \Sbf \bbf \}$, which is the set of points that satisfy the hierarchical constraints.
The base forecast distribution is \textit{incoherent}, since its support is not contained in $\mathcal{S}$.
The aim of probabilistic reconciliation is to find a joint \textit{reconciled distribution} $\pitil$ that gives positive probability only to the points of $\mathcal{S}$.
Note that it is sufficient to compute the reconciled distribution $\pitil_B$ for the bottom time series. 
Indeed, the reconciled distribution $\pitil$ on the entire hierarchy is then obtained by extending  $\pitil_B$ in a coherent way:
\begin{equation}\label{eq: reconciled distribution full}
\pitil(\ubf, \bbf) = \pitil_B(\bbf) \; \mathbb{1}_{\ubf = \Abf \bbf},   
\end{equation}
where the indicator $\mathbb{1}_{\ubf = \Abf \bbf}$ is $1$ if $\ubf = \Abf \bbf$ and $0$ otherwise.
In the following, we will thus only show how to compute  $\pitil_B(\bbf)$.

\paragraph{Probabilistic bottom-up.}
The simplest reconciliation approach is  the \textit{probabilistic bottom-up}, obtained by setting:
\[\pitil_B = \pihat_B.\]
%
The probabilistic bottom-up simply ignores the base forecasts of the upper time series, and has therefore limited accuracy.
The marginal  distribution of the upper time series
reconciled via bottom-up is given by:
 \begin{equation} \label{eq: pi BU k upper}
    \pibu(\ubf) := \sum_{\substack{\bbf:\; \Abf \bbf = \ubf}} \pihat_B(\bbf).
\end{equation}

\paragraph{Reconciliation via conditioning.}
Reconciliation via conditioning 
conditions the incoherent distribution $\pihat$  on the hierarchy constraints.
If $\pihat$ is discrete,
the reconciled distribution is given by \citep{zambon2024efficient}:
\begin{align}
\pitil_B(\bbf) &:=
\prob\left(\BH = \bbf \mid \UH - \Abf \BH = 0 \right) \nonumber\\
&\propto \pihat(\Abf \bbf,\bbf). \label{eq: reconciled distribution}
\end{align}
Also in the continuous case \citep{zambon2024efficient}, it can be shown that
$\pitil_B(\bbf) \propto \pihat(\Abf \bbf,\bbf)$. 


Reconciliation via conditioning can be interpreted in a Bayesian way \citep{corani_probabilistic_2023}.
The distribution of the bottom-up reconciliation constitutes the prior.
It is then updated to incorporate the information contained in $\pihat_U$.
It treats the forecast $\pihat_U$ as virtual evidence  \citep[Ch.~3.6]{darwiche2009modeling}, which increases our belief in certain values of the upper time series.
The outcome of the updating is $\pitil_U(\ubf)$, which is
a compromise \citep{corani_probabilistic_2023,zambon2024properties} between $\pibu(\ubf)$ and $\pihat_U(\ubf)$.

\section{Mixed reconciliation via conditioning}
\label{sec:mixed}

Let us now consider a \textit{mixed hierarchy},
where the bottom time series have low-count values,
while the upper ones are smooth and thus treated as continuous.
We thus assume that the
predictive distributions of the 
bottom time series are discrete and  the
predictive distributions of the 
upper time series are continuous. 
Also in the mixed case, reconciliation via conditioning is given by Eq.~\eqref{eq: reconciled distribution}.
%
\begin{proposition}\label{prop:mixedCond}
    For a hierarchy with discrete  bottom forecast distributions and continuous upper forecast distributions, the  distribution of the bottom time series reconciled  via conditioning is: 
    \begin{equation}
        \pitil_B(\bbf) 
        % := \prob\left(\BH = \bbf \mid \UH - \Abf \BH = 0 \right).
        \propto \pihat(\Abf \bbf,\bbf).
        \label{eq:mixedCond}
    \end{equation}
\end{proposition}
We prove Prop.~\ref{prop:mixedCond} in App.~\ref{sec: app mixed cond}, extending the results of \cite{zambon2024efficient} to the case of mixed-type variables.
Here, we assume for simplicity that all the bottom distributions are discrete and all the upper ones are continuous.
The treatment of hierarchies with both continuous and discrete time series on the same level is beyond the scope of this paper.

\paragraph{Sampling from the reconciled distribution.}
\label{sec: sampling from the rec distr}

In the case of mixed hierarchies, the reconciled distribution in Eq.~\eqref{eq:mixedCond} is only available through samples.
Our approach is based on importance sampling: we
first sample from the bottom base forecast distribution $\pihat_B$, and then compute the weights using the upper base forecast distribution $\pihat_U$, which is a multivariate Gaussian in our experiments. Here we cannot use the BUIS algorithm \citep{zambon2024efficient} because we have a joint base distribution on the upper time series.

\paragraph{Minimal example.}
We now reconcile a  mixed hierarchy  with one upper variable ($U$) and two bottom
variables ($B_1$ and $B_2$) with base forecasts:
\begin{gather*}
    \pihat_{B_1} = \text{Poisson}(15), \qquad
    \pihat_{B_2} = \text{Poisson}(15), \\
    \pihat_{U} = \mathcal{N}(40,5^2).
\end{gather*}

The base, bottom-up, and reconciled distributions for $U$ are shown in Fig.~\ref{fig:minimal_U}.
Since the base forecasts have  a positive incoherence ($40-15-15 > 0$), reconciliation increases the means of the bottom distributions  (from 15 to 17.8) and decreases the  mean of $U$  (from 40 to  35.6).
Reconciliation also reduces the  variance of the predictive distributions:
the variances of $\pibu$, $\pihat_U$, and $\pitil_U$ are respectively 30.1, 25 and 14.9.
This intuitive behavior  is consistent with the theoretical properties of the  Gaussian  reconciliation  \citep{zambon2024properties}. 
Notice that $\pitil_U$ is discrete, as it is 
% indeed, reconciliation does not change the support of the bottom time series, and  $\pitil_U$ is 
obtained by summing  the discrete samples of the reconciled bottom distributions.

\begin{figure}[t]
\centering
\includegraphics[width=0.48\textwidth]{mixed_U.pdf}
\caption{Predictive distribution of $U$: 
bottom-up (discrete), reconciled (mix-cond, discrete) and base (Gaussian).} 
\label{fig:minimal_U}
\end{figure}

\paragraph{Shortcomings  in high dimensions.}
We  assume the independence of the \textit{discrete} predictive distributions when creating the joint distribution $\pihat$.
% from the marginal base forecast.
This approach is commonly used given the lack of standard methods, in the \textit{discrete} case, to  obtain a multivariate  predictive distribution from the marginals.
This assumption is viable with a moderate number of bottom time series, but  in high dimension,
$\pibu$  is both  too peaked and biased.
It is too peaked because of the  independence assumption, which leads to an overconfident joint distribution.  
It is biased because it is obtained by summing many base forecasts: even for the best algorithms, the base forecasts for intermittent time series are  biased \citep{svetunkov2023iets}.
We noticed in particular that they tend to be overestimated.
%
If the bottom-up distribution is unreliable,
the distribution
reconciled via conditioning can be worse than the base forecast.

\begin{figure}[b]
\centering
\includegraphics[width=0.48\textwidth]{M5_WI_1_h_1_j_1.pdf}
\caption{ Predictive distribution for the top time series (\Mfive\ dataset, store \textit{WI\_1}): 
base ($\pihat_U$, Gaussian), reconciled (TD-cond and Mix-cond), and bottom-up ($\pibu$, discrete).
The black triangle is the actual value.
}
\label{fig:M5example}
\end{figure}

In Fig.~\ref{fig:M5example}, we show the  
distribution of the top level  of a hierarchy
with 3049 discrete bottom time series, taken
from the  \textit{M5} competition \citep{MAKRIDAKIS20221325}.
Because of the overestimation bias, the bottom-up distribution ($\pibu$, purple) has a mean much larger than the actual value (black triangle) and a long right tail. It is also more peaked
than  the base upper forecast ($\pihat_U$, yellow). 
Notice also the large incoherence between $\pibu$ and  $\pihat_U$.
Hence, while the \textit{marginal}  distributions of the bottom time series are good, 
the \textit{joint} distribution is unreliable, and therefore also the bottom-up distribution $\pibu$. 
Reconciliation via conditioning (Mix-cond), blue in Fig.~\ref{fig:M5example}, 
thus worsens the base forecasts.
\textit{Top-down conditioning} (TD-cond), which we discuss in the next section, addresses this problem.  
It is shown in light blue in Fig.~\ref{fig:M5example}, where it
provides a sensible reconciled distribution. 



\section{Reconciliation via top-down conditioning}
\label{sec:topDown}

\textit{Top-down conditioning} is a probabilistic reconciliation approach that works in two steps: first, the upper base forecasts are reconciled via conditioning, using only the hierarchical constraints between the upper variables;
then, the bottom distributions are updated via a probabilistic 
top-down procedure. 
\cite{elgavish2023hierarchical} presents a similar idea for the Gaussian case, where it admits an analytical solution.
We extend this idea to count time series.
First, we formally define the reconciled distribution via top-down conditioning and we study its properties. 
We show that, in the case of hierarchies with only one upper, the reconciled upper distribution is exactly the base distribution; if there is more than one upper, 
% the reconciled upper distribution
it is given by the upper distribution partially reconciled via conditioning.
We then introduce an algorithm to efficiently sample from the reconciled distribution.


We only consider strictly hierarchical structures, i.e., hierarchies represented by a tree.
Moreover, we assume that the hierarchy is \textit{balanced} \citep{di2022forecast}; a precise definition and its implications are given in App.~\ref{sec: balanced hier}.
% Note that this is not a strong requirement, since any hierarchy can be made balanced by duplicating some bottom time series.
% All the hierarchies that we use in our experiments are already balanced.


Throughout the section we assume that the support of the upper forecast distribution is included in the support of the bottom-up distribution. 
If all bottom forecast distributions are discrete, then the bottom-up forecast has support on the natural numbers; we thus assume  $\pihat_U$ to be discrete.
In practice, when we run the experiments, the samples from the upper distribution are truncated and rounded before applying the algorithm. 
We report all the proofs in App.~\ref{sec: proof top down}.


\subsection{Hierarchy with a single upper}
\label{sec:topDownSingle}

Let us first consider a hierarchy with $m$ bottom and one upper, where $\pihat_1, \dots, \pihat_m, \pihat_U$ are the base distributions. 
\begin{definition}
\label{def: pitd 1 upper m bottom}
Assume that the base forecasts for all the bottom and the upper time series are conditionally independent.
% Consider that all bottom variables and the upper are conditionally independent. 
We define the reconciled distribution via top-down conditioning as 
\begin{equation} \label{eq: pitd 1 upper m bottom}
    % \pitd(\bbf) := \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(\biplusm)} \, \pihat_U(\biplusm).
        \pitd(\bbf) := \pihat_1(b_1) \dots \pihat_m(b_m) \, \frac{\pihat_U(\biplusm)}{\pibu(\biplusm)}.
\end{equation}
\end{definition}
In App.~\ref{sec: proof pitd prob distr} we show that $\pitd$ is a probability distribution.
Note that the bottom-up distribution $\pibu$, defined in Eq.~\eqref{eq: pi BU k upper}, can be written in this case as
\begin{equation} \label{eq: pi BU}
    \pibu(u) := \sum_{\substack{\bibim: \\ \biplusm = u}} \pihat_1(b_1) \dots \pihat_m(b_m).
\end{equation}
%
 Definition~\ref{def: pitd 1 upper m bottom} could be easily generalized to include the case in which we have a joint distribution $\pihat_B$ on the bottom time series; we leave the study of the correlations between discrete bottom forecasts for future work.    
 The reconciled distribution via top-down conditioning has two desirable properties.  
%
\begin{proposition}\label{prop: properies top down}
$\pitd$ satisfies the following properties:
\begin{enumerate}[label={(\roman*)}]
    \item If $(\Btil_1, \dots, \Btil_m) \sim \pitd$, then $\Btil_1 + \dots + \Btil_m \sim \pihat_U$.

    \item Given $(\bibar_1, \dots, \bibar_m)$ and $(\bicheck_1, \dots, \bicheck_m)$ such that 
    $\bibar_1 + \dots + \bibar_m = \bicheck_1 + \dots + \bicheck_m$, then
    \[
    \frac{\pitd(\bibar_1, \dots, \bibar_m)}{\pitd(\bicheck_1, \dots, \bicheck_m)} = 
    \frac{\pihat_1(\bibar_1)\dots\pihat_m(\bibar_m)}{\pihat_1(\bicheck_1)\dots\pihat_m(\bicheck_m)}
    \]
\end{enumerate}
\end{proposition}
The first property explains the name \textit{top-down}:
indeed, the reconciled upper distribution is exactly the base upper distribution.
The second property
specifies how the distribution of the upper is split 
between the bottom,
i.e., proportionally to the base distribution of the bottom. 

\paragraph{Sampling from $\pitd$.}

Let us first consider the case $m=2$.  
We can rewrite Eq.~\eqref{eq: pitd 1 upper m bottom} as
\begin{align}
\pitd(b_1,b_2)
&= \pihat_1(b_1) \, \pihat_2(b_2) \, \frac{\pihat_U(b_1+b_2)}{\pibu(b_1+b_2)} \nonumber \\
&= \sum_u \pihat_U(u) \cdot \frac{\pihat_1(b_1) \, \pihat_2(b_2)}{\pibu(u)} \, 
\mathbb{1}_{b_1+b_2=u} \nonumber \\
&= \sum_u \pihat_U(u) \cdot \pihat(b_1,b_2 \,|\, b_1+b_2=u). \label{eq: sampling from pitd 2 bottom} 
\end{align}
Eq.~\eqref{eq: sampling from pitd 2 bottom}  
shows that we can sample from $\pitd$ in two steps.
First, we sample $u$ from $\pihat_U$; 
then, we sample $(b_1,b_2)$ from
the base bottom distribution, conditioned on the constraint $b_1+b_2=u$.
% $\pihat(b_1,b_2|b_1+b_2=u)$. 
For the latter step, we introduce Alg.~\ref{alg: cond top down 1 upper 2 bottom}.
It samples $b_1$ from the marginal distribution of $\pihat(b_1,b_2 \,|\, b_1+b_2=u)$; $b_2$ is then computed as $u-b_1$.
% It is at the core of ...

\begin{lemma} \label{lemma: cond top down}
 The output $(b_1, b_2)$ of Alg.~\ref{alg: cond top down 1 upper 2 bottom} is distributed as \[\pihat(b_1,b_2 \,|\, b_1+b_2=u).\]
\end{lemma}

\begin{algorithm}[t]
    \caption{Top-down sampling (2 bottom)}
    \label{alg: cond top down 1 upper 2 bottom}
\begin{algorithmic}[1]
\State \textbf{Input}: $\pihat_1, \pihat_2; \, u$
\State \textbf{Output}: sample $(b_1, b_2)$ 
\Statex
\State Define $q_1(b_1) \propto \pihat_1(b_1) \, \pihat_2(u - b_1)$
\State $b_1 \gets$ sample from $q_1$
\State $b_2 \gets u - b_1$
\State \Return $(b_1, b_2)$
\end{algorithmic}
\end{algorithm}

\begin{figure}[t]
  \centering
  \begin{minipage}{\columnwidth}
    \begin{subfigure}{0.4\columnwidth}
      \centering
      \begin{tikzpicture}[level distance=1.2cm,
                        level 1/.style={sibling distance=2.2cm},
                        level 2/.style={sibling distance=1.1cm},
                        font=\small]
        \node {$B^{(3)}_1$}
          child {node {$B^{(2)}_1$}
            child {node {$B^{(1)}_1$}}
            child {node {$B^{(1)}_2$}}
          }
          child {node {$B^{(2)}_2$}
            child {node {$B^{(1)}_3$}}
            child {node {$B^{(1)}_4$}}
          };
      \end{tikzpicture}
    \end{subfigure}
    \hspace{0.5cm}
    \begin{subfigure}{0.6\columnwidth}
      \centering
      \begin{tikzpicture}[font=\footnotesize]
    % Top
    \node[draw, minimum width=\wrett, minimum height=\hrett] at (0,0) {$u$};   
    % Middle
    \node[draw, minimum width=0.4*\wrett, minimum height=\hrett] at (-0.6*\wrett/2,-\hrett-\distrett) {$b^{(2)}_1$};
    \node[draw, minimum width=0.6*\wrett, minimum height=\hrett] at (0.4*\wrett/2,-\hrett-\distrett) {$b^{(2)}_2$};
    % Down
    \node[draw, minimum width=0.2*\wrett, minimum height=\hrett] at (-1.44,-\hrett-\distrett-\hrett-\distrett) {$b^{(1)}_1$};
    \node[draw, minimum width=0.2*\wrett, minimum height=\hrett] at (-0.72,-\hrett-\distrett-\hrett-\distrett) {$b^{(1)}_2$};
    \node[draw, minimum width=0.35*\wrett, minimum height=\hrett] at (0.27,-\hrett-\distrett-\hrett-\distrett) {$b^{(1)}_3$};
    \node[draw, minimum width=0.25*\wrett, minimum height=\hrett] at (1.35,-\hrett-\distrett-\hrett-\distrett) {$b^{(1)}_4$};
\end{tikzpicture}
    \end{subfigure}
  \end{minipage}
  \caption{Auxiliary binary tree ($m=4$, $L=3$)}
  \label{fig: fictitious hierarchy}
\end{figure}


Analogously, in the general case of $m > 2$, we need to sample from $\pihat(b_1,\dots,b_m \,|\, b_1+\dots+b_m=u)$.
Since there are $m$ variables and $1$ constraint, the direct generalization of Alg.~\ref{alg: cond top down 1 upper 2 bottom} would require sampling from an $(m-1)$-dimensional joint distribution,
% $\pihat_1(b_1)\dots\pihat_{m-1}(b_{m-1})\pihat_m(u-b_1-\dots-b_{m-1})$, 
which is not feasible when $m$ is large.
% This is not feasible when $m$ is large, since the complexity is exponential.
%
We thus introduce Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}: the key idea is to iteratively split the variables in two groups, applying each time Alg.~\ref{alg: cond top down 1 upper 2 bottom}.
For the sake of clarity, we present the algorithm in the case that $m$ is a power of $2$; 
however, we show in App.~\ref{sec: proof extension to any m} that the algorithm can be easily adapted to any $m$.
Let us define $\Bhat_1 \sim \pihat_1, \dots, \Bhat_m \sim \pihat_m$, and assume that all the $\Bhat_j$'s are independent.
We then set $L:=\log_2(m)+1$, and we build an auxiliary binary tree in the following way: 
\begin{alignat}{3}
&B^{(1)}_j = \Bhat_j, \quad &&\text{for }\; j = 1,\dots,m, \nonumber \\
&B^{(l+1)}_j = B^{(l)}_{2j-1} + B^{(l)}_{2j}, \quad &&\text{for }\; l=1,\dots,L-1, \nonumber \\
& &&\quad\; j = 1,\dots,2^{L-l-1}.  \nonumber
\end{alignat}
An example for $m=4$ is shown in Fig.~\ref{fig: fictitious hierarchy}.
For each $l$ and $j$, we denote by $\pi_j^{(l)}$ the distribution of $B^{(l)}_j$.
In the first part of Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}, 
we compute $\pi_j^{(l)}$ for each $j$ and $l$; thanks to the independence assumption, 
the distribution of the sum is given by the convolution, denoted by $*$:
\begin{equation}
\pi^{(l+1)}_j = \pi^{(l)}_{2j-1} * \pi^{(l)}_{2j}.    
\end{equation}
In practice, convolutions can be computed efficiently using the Fast Fourier Transform \citep{cooley1965algorithm}.
In the second part of Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}, 
we start from $u$ at the top node of the auxiliary binary tree, and proceed downward by iteratively doing conditional sampling using Alg.~\ref{alg: cond top down 1 upper 2 bottom}.
%
For example, if $m=4$, we first draw $\big(b^{(2)}_1,b^{(2)}_2\big)$ conditioned on $b^{(2)}_1 + b^{(2)}_2 = u$, then we draw $\big(b^{(1)}_1,b^{(1)}_2\big)$ conditioned on $b^{(1)}_1 + b^{(1)}_2 = b^{(2)}_1$ and $\big(b^{(1)}_3,b^{(1)}_4\big)$ conditioned on $b^{(1)}_3 + b^{(1)}_4 = b^{(2)}_2$ (Fig.~\ref{fig: fictitious hierarchy}).

\begin{algorithm}[t]
    \caption{Top-down sampling ($2^{L-1}$ bottom)}
    \label{alg: condit top down 1 upper m = 2^k bottom}
\begin{algorithmic}[1]
\State \textbf{Input:} $u; \, \pihat_1, \dots, \pihat_m$
\State \textbf{Output}: sample $(b_1, \dots, b_m)$
\Statex
\State $L \gets \log_2(m) + 1$
\State \#\#\# Compute the $\pi^{(l)}_j$'s      %\verb|###| qual è meglio?
\State $\pi^{(1)}_j \gets \pihat_j$ \quad for each $j = 1, \dots, m$
\For {$l = 1,\dots,L-1$}
  \For{$j = 1, \dots, 2^{L-l-1}$}
    \State $\pi^{(l+1)}_j \gets \pi^{(l)}_{2j-1} * \pi^{(l)}_{2j}$
  \EndFor
\EndFor
\State \#\#\# Top-down sampling
\State $b^{(L)}_1 \gets u$
\For{$l = L-1, \dots, 1$}
  \For{$j = 1, \dots, 2^{L-l-1}$}
  \State $\big(b^{(l)}_{2j-1}, b^{(l)}_{2j}\big) \gets$ 
  Alg.~\ref{alg: cond top down 1 upper 2 bottom}$\big(\pi^{(l)}_{2j-1},  \pi^{(l)}_{2j}; \, b^{(l+1)}_j\big)$
  \EndFor
\EndFor
\State \Return $\big(b^{(1)}_1, \dots, b^{(1)}_m\big)$
\end{algorithmic}
\end{algorithm}

\begin{lemma}\label{lemma: top down 1 upper m = 2^k bottom}
The output $(b_1, \dots, b_m)$ of Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom} is distributed as
\[\pihat(b_1,\dots,b_m \,|\, b_1+\dots+b_m=u).\]
% \begin{equation}\label{eq: output alg cond td m=2^k bottom}
% \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}
% {\pibu(u)} 
% \, \mathbb{1}_{u = b_1+\dots+b_m}.
% \end{equation}
\end{lemma}
%
Finally, we introduce Alg.~\ref{alg: top down 1 upper m = 2^k bottom} for sampling from the reconciled distribution via top-down conditioning in case of $1$ upper and $m$ bottom time series.
%
\begin{proposition} \label{prop: top down 1 upper m = 2^k bottom}
The output of Alg.~\ref{alg: top down 1 upper m = 2^k bottom} is distributed as $\pitd$, defined in Eq.~\eqref{eq: pitd 1 upper m bottom}.
\end{proposition}

\begin{algorithm}[t]
    \caption{Top-down conditioning (1 upper)}
    \label{alg: top down 1 upper m = 2^k bottom}
\begin{algorithmic}[1]
\State \textbf{Input:} $\pihat_U, \pihat_1, \dots, \pihat_m; \, N$
\State \textbf{Output}: sample $\big(b^i_1, \dots, b^i_m\big)_{i=1,\dots,N}$
\Statex
\State \textbf{Sample} $\big(u^i\big)_{i=1,\dots,N} \simIID \pihat_U$
\For {$i = 1, \dots, N$}
  \State $\bbf^i \gets$ Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}$\big(u^i;\,\pihat_1, \dots, \pihat_m\big)$
\EndFor
\State \Return $\left(\bbf^i\right)_{i=1,\dots,N}$
\end{algorithmic}
\end{algorithm}

\subsection{Hierarchy with $k$ upper}
\label{sec:topDownMultiple}

Let us now consider the general case of a hierarchy with $m$ bottom and $k$ upper.
We generalise Definition~\ref{def: pitd 1 upper m bottom} as follows.
%
\begin{definition}
\label{def: pitd k upper m bottom}
    Let $\pihat_1, \dots, \pihat_m$ be the conditionally independent base distributions of the bottom, and $\pihat_U$ the multivariate distribution of the upper. We further assume conditional independence between upper and bottom. The reconciled distribution via top-down conditioning is given by
\begin{equation} \label{eq: pitd k upper m bottom}
    \pitd(\bbf) := \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(\Abf \bbf)} \, \pihat_U(\Abf \bbf).
\end{equation}
\end{definition}
%
We recall that the bottom-up distribution is defined as ${\pibu(\ubf) := \sum_{\substack{\bbf:\; \Abf \bbf = \ubf}} \pihat_B(\bbf)}$.
This sum is non-empty only if $\ubf$ satisfies the hierarchy constraints.
For example, in the case of the hierarchy of Fig.~\ref{fig: simple tree}, 
if $u_1 \neq u_2+u_3$, then
$\{\bbf:\; \Abf \bbf = \ubf\} = \varnothing$,
and therefore $\pibu(\ubf) = 0$.
% there exists no $\bbf$ such that $\ubf = \Abf \bbf$ 
% if $u_1 \neq u_2+u_3$.
% Since we deal with proper hierarchies, the subset of the upper time series also has to satisfy some hierarchical constraints.

Since the hierarchy is balanced, we can consider the sub-hierarchy given by only the upper time series (App.~\ref{sec: balanced hier}).
Hence, following the notation of Sec.~\ref{sec: prob fore rec}, we can write
\begin{equation} \label{eq: subhier}
\ubf = \Su \ulow,    
\end{equation}
where $\ulow$ is the set of upper time series on the lowest level of the hierarchy.
We then denote by $\uupp$ the set of all the other upper time series, so that $\uupp = \Au \ulow$. 
In the example of Fig.~\ref{fig: simple tree}, we have $\ulow = [u_2, u_3]^T$, $\uupp = [u_1]$, and $\Au = [1\;1]$.
% The hierarchical constraints between the upper time series can thus be written as $\mathbb{1}_{\uupp = \Au \ulow}$.
Prop.~\ref{prop: properies top down} can be generalized as follows.
%
\begin{proposition}\label{prop: properies top down k upper}
The distribution $\pitd$ in Def.~\ref{def: pitd k upper m bottom} satisfies the following properties: 
\begin{enumerate}[label={(\roman*)}]
    \item If $\,\BT \sim \pitd$, then $\Abf \BT \sim \pihat_U(\ubf) \; \mathbb{1}_{\uupp = \Au \ulow}$.

    \item Given $\bB$ and $\bC$ such that 
    $\Abf \bB = \Abf \bC$, then
    \[
    \frac{\pitd\big(\bB\big)}{\pitd\big(\bC\big)} = 
    \frac{\pihat_1(\bibar_1)\dots\pihat_m(\bibar_m)}{\pihat_1(\bicheck_1)\dots\pihat_m(\bicheck_m)}
    \]
\end{enumerate}
\end{proposition}

\begin{algorithm}[b]
    \caption{Top-down conditioning ($k$ upper)}
    \label{alg: top down k upper m bottom}
\begin{algorithmic}[1]
\State \textbf{Input:} $\Abf;\, \pihat_U, \pihat_1, \dots, \pihat_m; \, N$
\State \textbf{Output}: sample $\big(b^i_1, \dots, b^i_m\big)_{i=1,\dots,N}$
\Statex
\State $\Su \gets \text{sub-hier}(\Abf)$
\State $\pitil_{U^{low}} \gets \text{cond-reconc}(\Su,\, \pihat_U)$
\State \textbf{Sample} $\big(u^i_1, \dots, u^i_{\klow}\big)_{i=1,\dots,N} \simIID \pitil_{U^{low}}$
% \State \textbf{Sample} $\big(u^i_1, \dots, u^i_{n_{u^{low}}}\big)_{i=1,\dots,N} \simIID \pitil_{U^{low}}$
\For {$i = 1, \dots, N$}
  \For{$j = 1, \dots, \klow$}
    \State $\bB^{(j)} \gets$ Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}$\big(u^i_j;\, \pihat_1, \dots, \pihat_m\big)$
  \EndFor
  \State $\bbf^i \gets \big(\bB^{(1)}, \dots, \bB^{(\klow)}\big)$
\EndFor
\State \Return $\left(\bbf^i\right)_{i=1,\dots,N}$
\end{algorithmic}
\end{algorithm}

% The proof is reported in App.~\ref{sec: proof cond top down}.
Note that the distribution of $\Abf \BT$ in (i) can be written as
\begin{equation}\label{eq: conditioning upper}
 \pihat_U(\Au \ulow,\, \ulow) \; \mathbb{1}_{\uupp = \Au \ulow},   
\end{equation}
%
which corresponds to the formula of reconciliation via conditioning \eqref{eq: reconciled distribution full}, 
with $\Abf, \ubf, \bbf$ replaced by $\Au, \uupp, \ulow$.
%
This provides the intuition for the algorithm to sample from $\pitd$ in case of hierarchies with more than one upper (Alg.~\ref{alg: top down k upper m bottom}).
First, we reconcile only the upper forecasts, by conditioning on the constraints between the upper.
% \begin{remark}
Note that reconciliation via conditioning is not an arbitrary choice, but is implied by the properties of $\pitd$, as discussed above.
% \end{remark}
%
If the base forecast for the upper time series is a multivariate Gaussian, reconciliation via conditioning can be done analytically \citep{corani_reconc_ecml, zambon2024properties}.
We can then sample from the partially reconciled distribution on the lowest level of the upper, and apply Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom} 
% independently
for each of the lowest upper.
%
\begin{proposition} \label{prop: top down k upper m bottom}
The output of Alg.~\ref{alg: top down k upper m bottom} is distributed as $\pitd$, defined in Eq.~\eqref{eq: pitd k upper m bottom}.
\end{proposition}



\section{Experiments}
\label{sec:experiments}

\begin{figure*}[t]
    \centering
\begin{tikzpicture}[every node/.style={rectangle,draw,align=center,rounded corners=4,font=\small},level distance=1.cm,
  level 1/.style={sibling distance=5.1cm, edge from parent fork down},
  level 2/.style={sibling distance=2.15cm, edge from parent fork down, fill=red},
  level 3/.style={sibling distance=1.2cm, edge from parent fork down}
]
  \node[fill=red!30] {\textit{Store}}
    child {node[fill=orange!30] {Hobbies}
      child {node[fill=yellow!30] {Hobbies 1}
        child {node {Item 1}}
        child[edge from parent/.style={draw=none}] {node[draw=none] {...}}
        child {node {Item 416}}
      }
      child {node[fill=yellow!30] {Hobbies 2}}
    }
    child {node[fill=orange!30] {Foods}
    child {node[fill=yellow!30] {Foods 1}}
      child {node[fill=yellow!30] {Foods 2} child[edge from parent/.style={draw=none}] {node[draw=none] {...}}}
      child {node[fill=yellow!30] {Foods 3}}
    }
    child {node[fill=orange!30] {Household}
      child {node[fill=yellow!30] {Household 1}}
      child {node[fill=yellow!30] {Household 2}
        child {node {Item 2534}}
        child[edge from parent/.style={draw=none}] {node[draw=none] {...}}
        child {node {Item 3049}}
      }
    };
\end{tikzpicture}
    \caption{\Mfive\ \textit{Store}-hierarchy. The store is in red,
    % , ($\{\text{CA}_1, \text{CA}_2, \text{CA}_3, \text{CA}_4, \text{TX}_1, \text{TX}_2, \text{TX}_3, \text{WI}_1, \text{WI}_2, \text{WI}_3\}$); 
    categories are in orange, departments are in yellow.}
    \label{fig:m5hier}
\end{figure*}

We consider three different hierarchical datasets.
We start with the \Syph\ dataset, available from the R package ZIM \citep{zim_manual}.
It  provides the \textit{weekly} number of syphilis cases in the US from 2007 to 2010.
The hierarchy has  53 bottom time series (one for each state) and one 
upper time series, the total number of cases in the US.
We then consider a reduced version of this dataset (\SyphSmall), which contains only the nine states of the South Atlantic region and their total.


We also consider 
the  high-dimensional dataset of the \Mfive\ competition 
\citep{MAKRIDAKIS20221325}.
It  contains \textit{daily} sales data referring to
10 different stores. The hierarchy of each store
has the same structure:  3049 bottom time series
(single items) and 11 upper time series, obtained by aggregating the items by department, product category, and store (Fig.~\ref{fig:m5hier}).
We independently reconcile each store  and we eventually
report the average results.

Tab.~\ref{tab:dset} reports the main characteristics of the datasets. In all datasets, the mean values of the time series justify modelling the bottom time series as counts and the upper time series as continuous.


We always consider the reconciliation of one-step-ahead  forecasts. We perform 52 reconciliations on
\Syph\ and
\SyphSmall\ and 14 reconciliations on \Mfive\ adopting a rolling-origin approach, 
i.e., at each iteration we extend the time series by one time step, re-compute the base forecasts and reconcile them.
The number of reconciled bottom time series is hence 9 $\times$ 52 = 468 on \SyphSmall, 53 $\times$ 52 = 2756 on \Syph\ and
10 (stores) $\times$ 3049 (bottom) $\times$ 14 = 426'860 on \Mfive.

\begin{table}[t]
    \small
    \centering
    \begin{tabular}{rrrrrr}
        \toprule
        & $n$ & $m$ & $T$ & $\bar{y}_u$ & $\bar{y}_b$\\
        \midrule
        \SyphSmall & 10 & 9 & 209 & 26 & 3\\
        \rowcolor{backcolour}
        \cellcolor{white}\Syph & 54 & 53 & 209 & 97 & 2\\
        \Mfive & 3060 & 3049 & 1941 & \{3448, 718, 387\} & 1\\
        \bottomrule
    \end{tabular}
    \caption{Datasets characteristics: $n$ is the total number of time series, $m$ the number of bottom time series, $T$ the length of the time series, and $\bar{y}_b$ and $\bar{y}_u$ the means of the bottom and upper time series.
    For \Mfive, the three values of $\bar{y}_u$ correspond to  the three upper levels of the hierarchy (store, category, department).
    }
    \label{tab:dset}
\end{table}

\paragraph{Methods.} 
We compute the base forecasts using
ADAM  \citep{svetunkov2023iets}, 
available from the R package 
 \textit{smooth} \citep{smooth_pkg}.
It is  a 
state-space model for probabilistic forecasting of both intermittent and smooth time series. On intermittent time series, it returns 
the predictive distributions in the form of positive samples.
The samples are continuous; we round them to predict  count time series,  as done by \cite{svetunkov2023iets}. 
For smooth time series, we use ADAM with a Gaussian predictive distribution.
 

We compare  different reconciliation approaches against  the base forecast, which is our  \textit{baseline}.
All methods are implemented in the R package bayesRecon\footnote{The vignette ``Reconciliation of M5 hierarchy with mixed-type forecasts'' in the package partially reproduces the results.}  \citep{bayesRecon}.

The first is the Gaussian reconciliation
(\textit{\textbf{Gauss}}), which we implement as follows.
We approximate each bottom base forecast with a Gaussian distribution. 
We then obtain the joint predictive density for the bottom time series  assuming them to be
independent. 
% There is currently no well-established method to obtain a 
% multivariate predictive distribution over discrete variables starting from marginal
% predictive distributions.
We adopt a multivariate normal as  joint  density for the upper time series.
As in \citet{corani_reconc_ecml}, its mean is the mean of 
the base forecasts of the upper time series and its  covariance matrix is  equal to the covariance of the 1-step-ahead residuals, estimated via shrinkage \citep{wickramasuriya2019optimal}. We also assume independence between the forecast of bottom and upper time series. After these approximations, we have a joint Gaussian distribution over bottom and upper time series, which we reconcile analytically following \cite{corani_reconc_ecml}.
    
We also test a variant of \textit{Gauss}, designed to yield only positive values.
We set to zero the reconciled bottom samples that are negative. We then sum up such truncated bottom samples to obtain the reconciled distribution for the entire hierarchy.
This approach yields samples that are  positive and coherent, but biased. 
We refer to this as  \textit{\textbf{Gauss-T}}, where 
the T stands for \textit{truncated}.

The third method is mixed conditioning (\textit{\textbf{Mix-cond}}),
which we implement as follows.
As for \textit{Gauss}, we model the predictive density of the  upper time series as a  joint multivariate normal distribution. 
We then model the bottom joint distribution over counts as the product of discrete distributions and we perform reconciliation as discussed in Sec.~\ref{sec:mixed}.

Finally, we consider the top-down conditioning method 
(\textbf{TD-cond}).
We adopt the same joint base distribution of  \textit{Mix-cond}, but we reconcile it using the methods of Sec.~\ref{sec:topDown}. 
In particular we use Alg.~\ref{alg: top down 1 upper m = 2^k bottom} on 
 \SyphSmall\ and \Syph, which  contain a single upper variable, and Alg.~\ref{alg: top down k upper m bottom} on \Mfive, which  contains multiple upper variables.

The implementation of \textit{Mix-cond} and \textit{TD-cond} reconciles a store of the \Mfive\ hierarchy (3060 time series) in a median time of 11.2 (Mix-cond) and 10.9 (TD-cond) seconds on an M1 Mac laptop.

\paragraph{Indicators.} 
We assess the
point forecasts using the
mean scaled absolute error (MASE) \citep{hyndman2006another}.
Following \citet{kolassa2016evaluating}, we use
the median as  point forecast when computing the MASE.
We score  the 90\% prediction intervals using the mean interval score (MIS) \citep{gneiting2011quantiles}. 
We assess the marginal predictive distributions using the ranked probability score (RPS) \citep{kolassa2016evaluating}
and the joint predictive distribution
using the energy score (ES) \citep{panagiotelis2022}.
We compute RPS and ES using the \textit{scoringRules} R package \citep{pkg:scoringRules}.
We do not compute the ES for the \Mfive\ dataset, since the energy score has computational  and  sampling issues in high dimensions \citep{pinson2013discrimination}.

We report the improvement over the base forecasts using the skill score values and averaging them across experiments.
%
For instance, the skill score of \textit{Gauss} on ES is:
\begin{align*}
\text{Skill}_{\%}\,\text{(ES, \textit{Gauss})} = 100 \cdot
\frac{\text{ES}(\text{\textit{base}}) - \text{ES}(\text{\textit{Gauss}})}
{\bigl(\text{ES}(\text{\textit{base}}) + \text{ES}(\text{\textit{Gauss}})\bigr)/2}\,.
\end{align*}
A positive skill score implies an improvement with respect to the base forecasts.

\paragraph{Results.}

\begin{table}[!ht]
\begin{center}

\begin{tabular}{rrrrrr}
\toprule
\multicolumn{2}{l}{} & {\small Gauss} & {\small Gauss-T} & {\small Mix-cond} & {\small TD-cond} \\
\midrule
\multicolumn{2}{l}{\cellcolor{white}\textbf{\textit{Syph-small}}}  \\
% \midrule

MASE    & \textit{\small Bottom}    & -61.7    & -61.7    & \textbf{-2.5}      & -4.3     \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & 13.3    & -2.6   & \textbf{23.8}    & -0.8    \\
MIS     & \textit{\small Bottom}    & -45.4    & -2.8    & \textbf{4.4}      & -6.3    \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & 41.7    & 34.0   & \textbf{42.3}    & 12.8     \\
RPS     & \textit{\small Bottom}    & -56.7    & -52.5    & \textbf{4.4}      & -3.3    \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & 26.3    & 21.3   & \textbf{29.0}    & 4.9     \\
ES      & 
% \textit{B $\cup$ U}    
& 9.5    & 6.2   & \textbf{12.0}    & 2.3     \\
\midrule
\multicolumn{2}{l}{\cellcolor{white}\textbf{\textit{Syph}}}  \\
% \midrule

MASE    & \textit{\small Bottom}    & -87.4    & -80.4    & -0.2      & \textbf{1.3}     \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & -8.7    & -62.5   & -5.6    & \textbf{0.0}    \\
MIS     & \textit{\small Bottom}    & -79.9    & -37.0    & 2.9   & \textbf{4.4}    \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & 19.9    & -43.0   & \textbf{20.2}    & -1.0     \\
RPS     & \textit{\small Bottom}    & -91.8    & -88.1    & \textbf{17.6}      & -4.6    \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & 4.4    & -53.4  & \textbf{7.0}    & 0.0     \\
ES      & 
% \textit{B $\cup$ U}
& -1.6    & -36.1   & 1.0    & \textbf{3.0}     \\
\midrule
\multicolumn{2}{l}{\textbf{\textit{M5}}}  \\
% \midrule

MASE    & \textit{\small Bottom}    & -69.1    & -66.7    & 0.5      & \textbf{2.8}     \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & -34.9    & -118.6   & -29.9    & \textbf{-0.5}    \\
MIS     & \textit{\small Bottom}    & -47.8    & -20.3    & 4.9      & \textbf{10.6}    \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & -34.3    & -150.9   & -37.6    & \textbf{3.6}     \\
RPS     & \textit{\small Bottom}    & -55.8    & -50.5    & 8.3      & \textbf{14.7}    \\
\rowcolor{backcolour}
\cellcolor{white} & \textit{\small Upper}     & -33.2    & -127.8   & -30.0    & \textbf{1.3}     \\
% ES      & 
% % \textit{B $\cup$ U}    
% & -45.0    & -123.3   & -39.5    & \textbf{0.5}     \\
\bottomrule
\end{tabular}
\caption{Mean skill scores on  \textit{Syph-small}, \textit{Syph} and \Mfive\ datasets.}
\label{tab:results-syph}
\end{center}
\end{table}

On \SyphSmall, \textit{Mix-cond} outperforms the other approaches (Tab.~\ref{tab:results-syph}), as expected
in low dimensions.
\textit{TD-cond} is not very suitable for this dataset, 
as it does not exploit the information of the joint bottom-up distribution (which is tenable in this case) to 
revise the upper base forecast.
The Gaussian approaches provide a  
poor approximation for  count distributions 
and thus they have low performance, especially on the bottom time series. 
Their positive skill scores on the upper time series are due
to a reduction of variance compared to the base forecasts.
The \textit{Gauss-T} improves  the prediction intervals  (scored by MIS) compared
to \textit{Gauss}. 
However, the overall performance of both  \textit{Gauss} and \textit{Gauss-T} is generally poor and we no longer comment on them.

On \Syph, the best-performing approach is either
\textit{Mix-cond} or \textit{TD-cond}, depending on the indicator.
Besides the average value of the indicators, it is worth looking at their variability. In Fig.~\ref{fig:ESbox}, we show the boxplots of the skill scores
of the ES on the 52 reconciliations of 
\Syph\ and \SyphSmall.

\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.48\textwidth]{ES_Syph.pdf}
    \caption{Boxplots of the skill scores on the energy score (ES) on \SyphSmall\ and \Syph\ datasets. The means are shown as dashed lines and the medians as solid lines.
    Means and medians are different due to the presence of outliers.
    The Gauss-T method is not shown for reasons of space.}
    \label{fig:ESbox}
\end{figure}

\begin{figure}[!ht]
\centering
\includegraphics[width=0.48\textwidth]{syph_both.pdf}
\caption{Base forecasts, reconciled distributions (\textit{Mix-cond}, \textit{TD-cond}), and actual values (black triangles) for the \SyphSmall\ and \Syph\ datasets.}
\label{fig:syphexample}
\end{figure}

On \SyphSmall, \textit{Mix-cond} yields the highest
distribution of the skill scores.
The performance of \textit{Gauss} is better on this dataset than on the others  since there are few bottom time series, and the Gaussian reconciliation works well on the upper time series. 
On \Syph, however, 
\textit{TD-cond} has both
the highest median and the lowest variance of energy score; 
it is arguably the preferable approach.


In Fig.~\ref{fig:syphexample} we illustrate
the difference between 
\textit{Mix-cond} and \textit{TD-cond} in two examples 
of reconciliation.
As already pointed out, 
both approaches return a positive, discrete reconciled distribution  for the upper time series. 
This happens even if the upper base forecasts 
have a tail of negative values, as in Fig.~\ref{fig:syphexample}.
The upper reconciled distribution of
\textit{Mix-cond} has lower variance than the base forecasts.
This is beneficial in the first example, in which both the bottom-up and the base forecasts provide valuable information.
In the second example, referring to \Syph, this yields instead a distribution that is peaked around a wrong value. 


On  \Mfive,   
\textit{Mix-cond}
performs poorly 
(Tab.~\ref{tab:results-syph})
for the reasons discussed in Sec.~\ref{sec:mixed}.
% By conditioning, we update a joint prior ($\pibu$) which is both biased (being the sum of many biased predictive distributions) and which has too narrow a support due to the assumption of conditional independence among all base forecasts.
\textit{TD-cond}, instead, shows a convincing performance from different viewpoints.
On average, it provides a solid improvement on the predictive distribution of the bottom time series and a moderate improvement on the upper time series; the latter is due to the Gaussian reconciliation applied on the 11 upper time series.
Moreover, it is more reliable than the competitors,
having a much lower variability of the skill scores on both bottom and upper time series (Fig.~\ref{fig:RPSm5box}).
Note the different scales of the axes in Fig.~\ref{fig:ESbox} and Fig.~\ref{fig:RPSm5box}.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.48\textwidth]{RPS_M5.pdf}
    \caption{Boxplots of the skill scores on the RPS on \Mfive\ datasets. The means are shown as dashed lines and the medians as solid lines. Means and medians are different due to the presence of outliers. For each method, the left boxplot refers to the bottom, the right one to the upper.}
    \label{fig:RPSm5box}
\end{figure}


%%%%%%%%%%%%
% Mean/Sd for the distributions in syph 
% Distribution  Mean    SD
% Base          96.5    27.5
% Bottom-up     104.8   42.0
% Mix-cond      97.2    16.8
% TD-cond       96.2    27.6

%%%%%%%%%%%%
% Mean/Sd for the distributions in syph-small
% Distribution  Mean    SD
% Base          29.1    15.3
% Bottom-up     25.6    15.0
% Mix-cond      25.5    9.1
% TD-cond       29.9    14.4



\section{Conclusion} \label{sec:conclusion}
We presented two principled methods
for the probabilistic reconciliation of mixed-type hierarchical forecasts.
\textit{Mixed conditioning} extends previous work on reconciliation via conditioning,
but is only effective in moderately-sized hierarchies because of the shortcomings of the bottom-up distribution in high dimensions. 
Weakening the assumption of conditional independence between the bottom predictive distributions is a promising direction to overcome the problem; we leave this study for future work.
The second method is \textit{top-down conditioning}, which can be sensibly used to reconcile large mixed hierarchies.
First, the upper forecasts are reconciled via conditioning; in this work, we used Gaussian reconciliation because we assumed the upper forecasts to be jointly Gaussian, but this is not an intrinsic limitation of our method.
Then, the bottom forecasts are reconciled via a probabilistic top-down procedure. 
We introduced top-down conditioning under the assumptions that all the bottom forecasts are discrete and all the upper are Gaussian; moreover, we assumed that  the time series are organized into a balanced hierarchy.
We leave for future work the extension to more general linearly constrained multiple time series, or to cases in which different types of forecasts are on the same level of the hierarchy.


%\section{Author contributions}
\begin{contributions} % will be removed in pdf for initial submission 
% (without ‘accepted’ option in \documentclass)
% so you can already fill it to test with the
% ‘accepted’ class option
LZ  formalized the  reconciliation for mixed variables,  designed and implemented the conditioning and the top-down algorithms.
DA and NR contributed to the formalization of the algorithm and partially developed the code for the experiments and visualization. 
GC proposed the research topic.
All  authors substantially contributed  to the  design of the experiments, the discussion of the results, and the drafting of the manuscript.
\end{contributions}



%\section{Funding}
\begin{acknowledgements} % will be removed in pdf for initial submission,
% (without ‘accepted’ option in \documentclass)
% so you can already fill it to test with the
% ‘accepted’ class option
%    Briefly acknowledge people and organizations here.
%    \emph{All} acknowledgements go in this section.
Research funded by the Swiss National Science Foundation (grant 200021\_212164) and by the Hasler foundation (project: \textit{hierarchical forecasting with mixed hierarchies}).
\end{acknowledgements}

\bibliography{biblio}

\newpage

\onecolumn

%\title{Title in Title Case\\(Supplementary Material)}
\title{Appendix}
\maketitle



%This Supplementary Material should be submitted together with the main paper.

\appendix

% \section{Proofs}
% \label{sec:proofs}

\section{Mixed conditioning}
\label{sec: app mixed cond}

\subsection{Mixed type distributions and densities}
\label{sec: mixed distr and densities}

We recall here some basic notions about distributions and densities, as we need them in Sec.~\ref{sec: proof mixed cond} to derive the formula of reconciliation via conditioning in the mixed case.

A \textit{count} variable $X$
can only assume non-negative integer values; thus it
has range  $\nn = \{0,1,2,\dots\}$.
The distribution of $X$ is represented by the probability mass function (pmf) $\pi_X$, which assigns a  probability to
each point of the range.
Hence
\begin{equation}\label{eq: discrete distr}
\prob(X \in G) = \sum_{x \in G} \pi_X(x),
\end{equation}
for any $G \subset \nn$.
The pmf is the \textit{density} of $X$ with respect to the counting measure $\counting$ over $\nn$, defined as 
$\counting(G) = \sum_{j \in \nn} \bm{1}_{\{j \in G\}}$.
Indeed, the sum in Eq.~\eqref{eq: discrete distr} is the integral of the pmf with respect to the measure $\counting$ \citep{billingsley2017probability}.

A real-valued random variable $Y$ is \textit{absolutely continuous}
(in the following, just \textit{continuous})
if its distribution is absolutely continuous with respect to the Lebesgue measure $\leb$. 
The distribution of $Y$ is then represented by its density $\pi_Y$ with respect to $\leb$, such that:
\begin{equation}\label{eq: continuous distr}
\prob(Y \in F) = \int_F \pi_Y(y) \; dy,
\end{equation}
for any measurable $F \subset \rr$.

We now introduce the mixed case.
Let \[\Zbf = \left(X_1,\dots,X_m,Y_1,\dots,Y_k\right)\] be a random vector, where the $X_i$'s are discrete and the $Y_j$'s are continuous.
We denote by $\pi_{\Zbf}$ the density of $\Zbf$  with respect to the product measure $\counting^m \otimes \leb^k$.
Hence, for any measurable $G \subset \rr^m$ and $F \subset \rr^k$, we have
\begin{equation}\label{eq: mixed distr}
\prob(\Zbf \in G \times F) = \sum_{\xbf \in G} \int_{F} d\ybf \; \pi_{\Zbf}(\xbf,\, \ybf).    
\end{equation}
% Note that, for all $\ybf \in \rr^k$, the sum over $F$ is well-posed since $\pi_{\Zbf}(\xbf,\, \ybf) > 0$ for at most countably many $\xbf$'s.
Note that the sum over $G$ in Eq.~\eqref{eq: mixed distr} is well-posed as ${\pi_{\Zbf}(\xbf,\, \ybf) \neq 0}$ only for countably many $\xbf$'s.
See \cite{billingsley2017probability} for a detailed discussion of measures and densities.

\subsection{Proof of Proposition~\ref{prop:mixedCond}}
\label{sec: proof mixed cond}

%\begin{proof}
Let us assume that the forecast distribution for the bottom time series is discrete, while for the upper is continuous.
We denote by $\pihat$ the density of 
$\YH = \left(\UH, \BH\right)$ 
with respect to $\leb^{k} \otimes \counting^{m}$, so that,
for any measurable $F \subset \rr^{k}$ and $G \subset \rr^{m}$:
\begin{equation*}
    \prob\left( \UH \in F,\, \BH \in G \right) = \sum_{\bbf \in G} \int_{F} d\ubf \; \pihat(\ubf,\, \bbf).
\end{equation*}

% To derive \eqref{eq: reconciled distribution}, we proceed as follows.
We now define $\Zbf := \UH - \Abf \BH$; since $\UH$ is continuous, $\Zbf$ is continuous too\footnote{
Let $F \subset \rr^{k}$ be a measurable set such that $\leb(F) = 0$. Then
\begin{align*}
 \prob(\Zbf \in F) 
 &= \prob\left(\UH - \Abf \BH \in F\right) \\
 &= \sum_{\bbf \in \nn^m} \prob\left(\BH = \bbf,\, \UH \in F_{\Abf \bbf}\right) \\
 &\le \sum_{\bbf \in \nn^m} \prob\left(\UH \in F_{\Abf \bbf}\right) = 0,
\end{align*}
as $\leb(F_{\Abf \bbf}) = 0$ for any $\bbf$ (the Lebesgue measure is invariant under translations) and $\UH$ is continuous.
}. 
For any set $H \subset \rr^k$ and $\wbf \in \rr^k$, we denote by 
$
    H_{\wbf} := \{\xbf:\; \xbf - \wbf \in H\}.
$
We have that
\begin{align*}
\prob\left( \Zbf \in F,\; \BH \in G \right) 
&= \prob\left( \UH - \Abf \BH \in F,\; \BH \in G \right) \\ 
&= \sum_{\bbf \in G} \prob\left( \UH - \Abf \BH \in F,\; \BH = \bbf \right) \\
&= \sum_{\bbf\in G} \prob\left( \UH \in F_{\Abf \bbf},\; \BH = \bbf \right) \\
&= \sum_{\bbf\in G} \int_{F_{\Abf \bbf}} d\ubf \; \pihat(\ubf,\, \bbf) \\
&= \sum_{\bbf\in G} \int_F d\zbf \; \pihat(\zbf + \Abf \bbf,\, \bbf), 
\end{align*}
where we used the change of variables $\zbf = \ubf - \Abf \bbf$ in the integral.
Hence, the density
% the joint distribution
of $\left(\Zbf,\, \BH\right)$ 
with respect to $\leb^{k} \otimes \counting^{m}$ is:
% is absolutely continuous with respect to  $\counting^{m} \otimes \leb^{k}$, with density given by 
\begin{equation}
\pi_{\left(\Zbf,\, \BH\right)}(\zbf, \bbf) := \pihat(\zbf + \Abf \bbf, \bbf).
\end{equation}
%
Note that the event $\left\{\YH \in \mathcal{S}\right\}$ coincides with $\left\{\Zbf = \textbf{0}\right\}$.
As in \citet{zambon2024efficient}, we derive the expression of the reconciled distribution as the conditional density of $\BH$ given $\Zbf = \bm{0}$ \citep[Chapter~4]{cinlar2011probability}:
\begin{align}
\pitil(\bbf) 
&= \frac{\pi_{\left(\Zbf,\, \BH\right)}(\textbf{0},\bbf)}
{\sum_{\xbf} \pi_{\left(\Zbf,\, \BH\right)}(\textbf{0},\xbf)} \nonumber\\
&= \frac{\hat{\pi}(\Abf \bbf, \bbf)}{\sum_{\xbf} \hat{\pi}(\Abf \xbf, \xbf)} \nonumber \\
&\propto \hat{\pi}(\Abf \bbf, \bbf), \nonumber
\end{align}
provided that $\sum_{\xbf} \hat{\pi}(\Abf \xbf, \xbf) > 0$.
%\end{proof}

\newpage
\section{Top-down}
\label{sec: proof top down}

In this section, for better readability, we sometimes use the integral notation even if the distributions are not continuous but discrete:
% write $\sum_{u} f(u) \pi(u)$ as $\int f(u) \pi(u) du$ even if $\pi(u)$ is a pmf.  
e.g., we write $\int \pi(x,y) \, dy$ instead of $\sum_y \pi(x,y)$.

\subsection{Balanced hierarchies}
\label{sec: balanced hier}

Following \cite{di2022forecast}, we say that a hierarchy is \textit{balanced} if each level of the hierarchy is complete. 
For example, the hierarchy in Fig.~\ref{fig: unbalanced} is not balanced: the time series $u_1$ is equal to the sum of 
$u_2$, $u_3$, and $b_5$, and therefore the intermediate level is not complete.
Note that any unbalanced hierarchy can be made balanced by duplicating some nodes.
In this example, we can obtain a balanced hierarchy by adding the node $u_4$, which is just a copy of $b_5$ (Fig.~\ref{fig: balanced}).


\begin{figure*}[!ht]
  \centering
  \begin{minipage}{\columnwidth}
    \begin{subfigure}{0.5\columnwidth}
      \centering
\begin{tikzpicture}
    % Bottom level nodes
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b1) at (0,0) {$b_1$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b2) at (1.5,0) {$b_2$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b3) at (3,0) {$b_3$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b4) at (4.5,0) {$b_4$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b5) at (6,0) {$b_5$};
    
    % Intermediate level nodes
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (u2) at (0.75,1.5) {$u_2$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (u3) at (3.75,1.5) {$u_3$};
    
    % Top level node
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (u1) at (3,3) {$u_1$};
    
    % Edges from bottom to intermediate level
    \draw (b1) -- (u2);
    \draw (b2) -- (u2);
    \draw (b3) -- (u3);
    \draw (b4) -- (u3);
    
    % Edges from intermediate to top level
    \draw (u2) -- (u1);
    \draw (u3) -- (u1);
    
    % Edge from bottom to top level
    \draw (b5) -- (u1);
\end{tikzpicture}
\caption{Unbalanced hierarchy}
  \label{fig: unbalanced}
    \end{subfigure}
    \hspace{0.5cm}
    \begin{subfigure}{0.5\columnwidth}
      \centering
\begin{tikzpicture}
    % Bottom level nodes
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b1) at (0,0) {$b_1$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b2) at (1.5,0) {$b_2$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b3) at (3,0) {$b_3$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b4) at (4.5,0) {$b_4$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (b5) at (6,0) {$b_5$};
    
    % Intermediate level nodes
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (u2) at (0.75,1.5) {$u_2$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (u3) at (3.75,1.5) {$u_3$};
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (u4) at (6,1.5) {$u_4$};
    
    % Top level node
    \node[draw, circle, minimum size=0.8cm, inner sep=0pt] (u1) at (3,3) {$u_1$};
    
    % Edges from bottom to intermediate level
    \draw (b1) -- (u2);
    \draw (b2) -- (u2);
    \draw (b3) -- (u3);
    \draw (b4) -- (u3);
    \draw (b5) -- (u4);
    
    % Edges from intermediate to top level
    \draw (u2) -- (u1);
    \draw (u3) -- (u1);
    \draw (u4) -- (u1);
\end{tikzpicture}
\caption{Balanced hierarchy}
  \label{fig: balanced}
    \end{subfigure}
  \end{minipage}
  \caption{The unbalanced hierarchy (left) is made balanced by duplicating the node $b_5$ (right)}
  \label{fig: balanced vs unbalanced}
\end{figure*}

If a hierarchy is balanced, there exists a set of ``lowest upper time series'' $\ulow$, such that any other upper time series is the sum of some of the lowest upper time series, and each bottom time series is a child of exactly one of the lowest upper time series. 
For example, 
it is easy to see that such a set does not exist for the hierarchy of Fig.~\ref{fig: unbalanced}, 
while $\ulow = [u_2, u_3, u_4]$ for the hierarchy of Fig.~\ref{fig: balanced}.
% in the hierarchy of Fig.~\ref{fig: balanced}, $\ulow = [u_2, u_3, u_4]$.
For any balanced hierarchy, we can consider the sub-hierarchy given by only the upper time series: the set of bottom time series of this sub-hierarchy is given by $\ulow$, so that
there exists a matrix $\Su$ such that
$
\ubf = \Su \ulow.
$

The assumption that the hierarchy is balanced is required by the \textit{top-down conditioning} reconciliation approach.
Indeed, it is needed for the first step of the algorithm, where the upper forecasts are reconciled via conditioning (lines 3--4 of Alg.~\ref{alg: top down k upper m bottom}); note that any hierarchy with only one upper time series is trivially balanced.

All the hierarchies used in the experiments in Sect.~\ref{sec:experiments} are balanced.
However, this is not a strong requirement, since any hierarchy can be made balanced by duplicating some bottom time series (Fig.~\ref{fig: balanced vs unbalanced}).

\subsection{Proof that $\pitd$ is a probability distribution}
\label{sec: proof pitd prob distr}

First, it is trivial from Eq.~\eqref{eq: pitd 1 upper m bottom} that $\pitd(\bbf) \ge 0$ for any $\bbf$.
Moreover, from Eq.~\eqref{eq: pi BU} it follows that, for any $\bbf$ such that $\pibu(b_1+\dots+b_m) = 0$, we have $\pihat_1(b_1) \dots \pihat_m(b_m) = 0$;
hence, for any $\bbf$ such that the denominator is equal to $0$, the numerator is also $0$. We implicitly define $\pitd$ as $0$ on such $\bbf$.
% Moreover, from Eq.\eqref{eq: pi BU} follows that, for any $\bbf$ such that $\pibu(b_1+\dots+b_m) = 0$, we have $\pihat_1(b_1) \dots \pihat_m(b_m) = 0$; we then set $\pitd$ as zero on such points.
%
Finally, $\pitd$ is normalized:
\begin{align*} 
\sum_{\substack{\bibim}} \pitd(\bibim) 
&= \sum_{\substack{\bibim}}  \pihat_1(b_1) \dots \pihat_m(b_m) \, \frac{\pihat_U(\biplusm)}{\pibu(\biplusm)} \\
&= \sum_u \sum_{\substack{\bibim: \\ \biplusm = u}}  \pihat_1(b_1) \dots \pihat_m(b_m) \, \frac{\pihat_U(\biplusm)}{\pibu(\biplusm)} \\
&= \sum_u \frac{\pihat_U(u)}{\pibu(u)} \sum_{\substack{\bibim: \\ \biplusm = u}}  \pihat_1(b_1) \dots \pihat_m(b_m)  \\
&= \sum_u \frac{\pihat_U(u)}{\pibu(u)} \pibu(u) 
= \sum_u \pihat_U(u) 
= 1.
\end{align*}

\subsection{Proof of Proposition~\ref{prop: properies top down}}
\label{sec: proof properties top down}

\begin{enumerate}[label={(\arabic*)}]
\item
Let $(\Btil_1, \dots, \Btil_m) \sim \pitd$, and $\Util := \Btil_1 + \dots + \Btil_m$.
Then,  the distribution of $\Util$ is given by
    \begin{align*}
        \pi_{\Util}(u) 
        &= \sum_{\substack{\bibim: \\ \biplusm = u}} \pitd(\bibim) \\
        % &= \sum_{\substack{b_1, \ldots, b_{m}: \\ b_1+ \cdots+ b_{m}  = u}} \frac{\pihat_1(b_1) \dots \pihat_m(b_{m})}{\pibu(b_1+ \cdots+ b_{m})} \, \pihat_U(b_1+ \cdots+ b_{m}) \\
        &= \sum_{\substack{\bibim: \\ \biplusm = u}} \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(\biplusm)} \, \pihat_U(\biplusm) \\
        &= \sum_{\substack{\bibim: \\ \biplusm = u}} \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(u)} \, \pihat_U(u) \\
        &= \frac{\pihat_U(u)}{\pibu(u)} \sum_{\substack{\bibim: \\ \biplusm = u}} \pihat_1(b_1) \dots \pihat_m(b_m)\\
        &= \pihat_U(u).
    \end{align*}

Note that this holds only for $u$ belonging to the support of the bottom-up distribution, i.e., $u$ such that $\pibu(u) \neq 0$,
as remarked in Sec.~\ref{sec:topDown}.
% Note that $\pitil_{\Util}$ is defined only for $u$ that belong to the support of $\Btil_1 + \dots + \Btil_m$. \Dar{Rivedere}

\item Let $\bibar_1 + \dots + \bibar_m = u = \bicheck_1 + \dots + \bicheck_m$. Then
\begin{align*}
    \frac{\pitd(\bibar_1, \dots, \bibar_m)}{\pitd(\bicheck_1, \dots, \bicheck_m)} 
    &= \frac{\pihat_1(\bibar_1) \dots \pihat_m(\bibar_m) \, \pihat_U(u)}{\pibu(u)} \cdot 
    \frac{\pibu(u)}{\pihat_1(\bicheck_1) \dots \pihat_m(\bicheck_m) \, \pihat_U(u)} \\
    &= \frac{\pihat_1(\bibar_1) \dots \pihat_m(\bibar_m)}{\pihat_1(\bicheck_1) \dots \pihat_m(\bicheck_m)}.
\end{align*}
\end{enumerate}

\subsection{Proof of Lemma~\ref{lemma: cond top down}}
\label{sec: proof cond top down}

The distribution of the output $(b_1,b_2)$ of Alg.~\ref{alg: cond top down 1 upper 2 bottom} is given by
\begin{align*}
&\pi(b_1 \,|\, u) 
= \frac{\pihat_1(b_1) \, \pihat_2(u-b_1)}{\sum_b \pihat_1(b) \, \pihat_2(u-b)}, \\
%
&\pi(b_2 \,|\, b_1, \,u) 
= \mathbb{1}_{b_2 = u-b_1}.
\end{align*}
Hence
\begin{align*}
\pi(b_1,\,b_2 \,|\, u) 
&= \pi(b_2 \,|\, b_1, \,u) \, \pi(b_1 \,|\, u) \\
&= \frac{\pihat_1(b_1) \, \pihat_2(u-b_1)}{\sum_b \pihat_1(b) \, \pihat_2(u-b)} \; \mathbb{1}_{b_2 = u-b_1}\\
&= \frac{\pihat_1(b_1) \, \pihat_2(b_2)}{\sum_b \pihat_1(b) \, \pihat_2(u-b)} \; \mathbb{1}_{u = b_1+b_2} \\
&= \pihat(b_1,b_2 \,|\, b_1+b_2=u).
\end{align*}

\subsection{Proof of Lemma~\ref{lemma: top down 1 upper m = 2^k bottom}}
\label{sec: proof lemma top down 1 upper m = 2^k bottom}


For each $l=1,\dots,L-1$ and $j=1,\dots,2^{L-l-1}$, consider $b^{(l)}_{2j-1}, b^{(l)}_{2j}$ from line 13 of Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}.
From Lemma~\ref{lemma: cond top down}, we have
\begin{align}
\pi\Big(b^{(l)}_{2j-1}, b^{(l)}_{2j} \, \big| \, b^{(l+1)}_j \Big) 
&= \frac{\pi^{(l)}_{2j-1}\Big(b^{(l)}_{2j-1}\Big) \, \pi^{(l)}_{2j}\Big(b^{(l)}_{2j}\Big)}
{\sum_b \pi^{(l)}_{2j-1}(b) \, \pi^{(l)}_{2j}\big(b^{(l+1)}_j - b\big)} 
\, \mathbb{1}_{b^{(l+1)}_j =\, b^{(l)}_{2j-1} + b^{(l)}_{2j}}    \nonumber \\
&= \frac{\pi^{(l)}_{2j-1}\Big(b^{(l)}_{2j-1}\Big) \, \pi^{(l)}_{2j}\Big(b^{(l)}_{2j}\Big)}
{\pi^{(l+1)}_j\big(b^{(l+1)}_j\big)} 
\, \mathbb{1}_{b^{(l+1)}_j =\, b^{(l)}_{2j-1} + b^{(l)}_{2j}},   \label{eq: proof td 1}
\end{align}
where the denominator in the second equation is the result of the convolution in line 8 of Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}.
%
If we denote by $\bbf^{(l)} = \Big(b^{(l)}_1, \dots, b^{(l)}_{2^{L-l}}\Big)$, for each $l=1,\dots,L$,
we obtain
\begin{align}
\pi\Big(\bbf^{(l)} \, \big| \, \bbf^{(l+1)} \Big) 
&= \prod_{j=1}^{2^{L-l-1}} \pi\Big(b^{(l)}_{2j-1}, b^{(l)}_{2j} \, \big| \, b^{(l+1)}_j \Big) \nonumber \\
&= \frac{\prod_{j=1}^{2^{L-l}} \pi^{(l)}_j\big(b^{(l)}_j\big)}{\prod_{j=1}^{2^{L-l-1}} \pi^{(l+1)}_j\big(b^{(l+1)}_j\big)} 
\; \prod_{j=1}^{2^{L-l-1}} \mathbb{1}_{b^{(l+1)}_j =\, b^{(l)}_{2j-1} + b^{(l)}_{2j}}, \label{eq: proof td 1bis} 
\end{align}
where the second equation is the result of plugging in Eq.~\eqref{eq: proof td 1}.
%
Hence
\begin{align}
\pi\Big(\bbf^{(1)}, \bbf^{(2)}, \dots, \bbf^{(L)} \,\big|\, u \Big) 
&= \prod_{l=1}^{L-1} \pi\Big(\bbf^{(l)} \, \big| \, \bbf^{(l+1)} \Big) \, \pi\Big(\bbf^{(L)}\,\big|\, u\Big) \nonumber \\
&= \prod_{l=1}^{L-1} \frac{\prod_{j=1}^{2^{L-l}} \pi^{(l)}_j\big(b^{(l)}_j\big)}
{\prod_{j=1}^{2^{L-l-1}} \pi^{(l+1)}_j\big(b^{(l+1)}_j\big)} 
 \prod_{j=1}^{2^{L-l-1}} \mathbb{1}_{b^{(l+1)}_j =\, b^{(l)}_{2j-1} + b^{(l)}_{2j}} 
\;\cdot \mathbb{1}_{b^{(L)}_1 =\, u}\nonumber \\
&= \frac{\prod_{j=1}^m \pi^{(1)}_j\big(b^{(1)}_j\big)}{\pi^{(L)}_1\big(b^{(L)}_1\big)}
\, \prod_{l=1}^{L-1} \prod_{j=1}^{2^{L-l-1}} \mathbb{1}_{b^{(l+1)}_j =\, b^{(l)}_{2j-1} + b^{(l)}_{2j}}
\;\cdot \mathbb{1}_{b^{(L)}_1 =\, u}, \label{eq: proof td 2}
\end{align}
where we use Eq.~\eqref{eq: proof td 1bis} and line 10 of Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom}
in the second equation, and the third equation is the result of a telescoping product. 
%
Therefore 
\begin{align}
\pi\Big(b^{(1)}_1, \dots, b^{(1)}_m \,\big|\, u\Big) 
&= \pi\Big(\bbf^{(1)} \,\big|\, u\Big) \nonumber \\
&= \int \pi\Big(\bbf^{(1)}, \bbf^{(2)}, \dots, \bbf^{(L)} \,\big|\, u\Big) \, d\bbf^{(2)} \dots d\bbf^{(L)} \nonumber \\
&= \int \frac{\prod_{j=1}^m \pi^{(1)}_j\big(b^{(1)}_j\big)}{\pi^{(L)}_1\big(b^{(L)}_1\big)}
\, \prod_{l=1}^{L-1} \prod_{j=1}^{2^{L-l-1}} \mathbb{1}_{b^{(l+1)}_j =\, b^{(l)}_{2j-1} + b^{(l)}_{2j}}
\;\cdot \mathbb{1}_{b^{(L)}_1 =\, u}
\; d\bbf^{(2)} \dots d\bbf^{(L)} \nonumber \\
&=\frac{\prod_{j=1}^m \pi^{(1)}_j\big(b^{(1)}_j\big)}{\pi^{(L)}_1(u)} 
\, \mathbb{1}_{b^{(1)}_1+\dots+b^{(1)}_m =\, u}. \label{eq: proof td 2bis}
\end{align}
%

Since $\pi^{(1)}_j = \pihat_j$, for all $j=1,\dots,m$, and $\pi^{(L)}_1 = \pi^{(1)}_1 * \dots * \pi^{(1)}_m = \pibu$,
we conclude from Eq.~\eqref{eq: proof td 2bis} that
\begin{align*}
\pi\Big(b^{(1)}_1, \dots, b^{(1)}_m \,\big|\, u\Big) 
&=\frac{\prod_{j=1}^m \pihat_j\big(b^{(1)}_j\big)}{\pibu(u)} 
\, \mathbb{1}_{b^{(1)}_1+\dots+b^{(1)}_m =\, u} \\
&= \pihat\Big(b^{(1)}_1,\dots,b^{(1)}_m \,|\, b^{(1)}_1+\dots+b^{(1)}_m=u\Big).
\end{align*}

\subsection{Proof of Proposition~\ref{prop: top down 1 upper m = 2^k bottom}}
\label{sec: proof top down 1 upper m = 2^k bottom}

From Lemma~\ref{lemma: top down 1 upper m = 2^k bottom} it follows that, for all $i=1,\dots,N$:
\[
\pi\big(\bbf^i\,|\,u^i\big) 
= \frac{\pihat_1(b^i_1) \dots \pihat_m(b^i_m)}{\pibu(u^i)} 
\; \mathbb{1}_{b^i_1+\dots+b^i_m =\, u^i}. 
\]
Since $u^i \sim \pihat_U$, we have
\begin{align*}
\pi(\bbf^i) 
&= \int \pi(\bbf^i,\,u^i) \, du^i \\ 
&= \int \pi\big(\bbf^i\,|\,u^i\big) \, \pi(u^i) \, du^i \\
&= \int \frac{\pihat_1(b^i_1) \dots \pihat_m(b^i_m)}{\pibu(u^i)} 
\; \mathbb{1}_{b^i_1+\dots+b^i_m =\, u^i} \, \pi(u^i) \, du^i \\
&= \frac{\pihat_1(b^i_1) \dots \pihat_m(b^i_m)}{\pibu(b^i_1+\dots+b^i_m)} \, \pihat_U(b^i_1+\dots+b^i_m) \\
&= \pitd(\bbf^i).
\end{align*}

\subsection{Extension of Algorithm~\ref{alg: condit top down 1 upper m = 2^k bottom} to generic $m$}
\label{sec: proof extension to any m}

We now consider the case of a hierarchy with one upper and $m$ bottom time series, but we drop the assumption that $m$ is a power of $2$.
In this case, we proceed as follows. 
Let $M$ be the smallest power of $2$ greater than or equal to $m$, i.e., $M = 2^{\lceil\log_2(m)\rceil}$, and
let $L := \log_2(M) + 1$.
We build a binary tree with $L$ levels as in Sec.~\ref{sec:topDownSingle}. In this case, however, the ``missing'' bottom nodes have distribution given by a Dirac delta centered at $0$, denoted by $\delta_0$:
\begin{alignat}{3}
&B^{(1)}_j = \Bhat_j, \quad &&\text{for }\; j = 1,\dots,m, \nonumber \\
&B^{(1)}_j = \delta_0, \quad &&\text{for }\; j = m+1,\dots,M, \nonumber \\
&B^{(l+1)}_j = B^{(l)}_{2j-1} + B^{(l)}_{2j}, \quad &&\text{for }\; l=1,\dots,L-1, \;\; j = 1,\dots,2^{L-l-1}.  \nonumber
\end{alignat}
%
An example for $m=3$ is shown in Fig.~\ref{fig: fictitious hierarchy with delta}.
As in Sec.~\ref{sec:topDownSingle}, we also denote by $\pi^{(l)}_j$ the distribution of $B^{(l)}_j$, so that
\begin{alignat}{3}
&\pi^{(1)}_j(b_j) = \pihat_j(b_j), \quad &&\text{for }\; j = 1,\dots,m, \nonumber \\
&\pi^{(1)}_j(b_j) = \mathbb{1}_{b_j = 0}, \quad &&\text{for }\; j = m+1,\dots,M, \nonumber \\
&\pi^{(l+1)}_j = \pi^{(l)}_{2j-1} * \pi^{(l)}_{2j}, \quad &&\text{for }\; l=1,\dots,L-1, \;\; j = 1,\dots,2^{L-l-1}.  \nonumber
\end{alignat}
% We also recall that, for any distribution $\pi$, we have $\pi * \delta_0 = \pi$.

\begin{figure}[!ht]
  \centering
\begin{tikzpicture}[level distance=2cm,
                      level 1/.style={sibling distance=6cm},
                      level 2/.style={sibling distance=3cm}]
      \node[circle,draw,minimum size=4em] (b3) {$B^{(3)}_1$}
        child {node[circle,draw,minimum size=4em] (b2a) {$B^{(2)}_1$}
          child {node[circle,draw,font=\scriptsize,inner sep=1pt] (b1a) {$B^{(1)}_1 = \Bhat_1$}}
          child {node[circle,draw,font=\scriptsize,inner sep=1pt] (b1b) {$B^{(1)}_2 = \Bhat_2$}}
        }
        child {node[circle,draw,minimum size=4em] (b2b) {$B^{(2)}_2$}
          child {node[circle,draw,font=\scriptsize,inner sep=1pt] (b1c) {$B^{(1)}_3 = \Bhat_3$}}
          child {node[circle,draw,fill=blue!20,font=\scriptsize,inner sep=1pt] (b1d) {$B^{(1)}_4 = \delta_0$}}
        };
  \end{tikzpicture}
  \caption{Binary tree ($m=3$, $L=3$)}
  \label{fig: fictitious hierarchy with delta}
\end{figure} 

% We can then run  Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom} with $\pi^{(1)}_1, \dots, \pi^{(1)}_{M}$.
% From Lemma~\ref{lemma: top down 1 upper m = 2^k bottom}, we get a sample $(b_1,\dots,b_m,b_{m+1},\dots,b_{M})$ distributed as

We can then run Alg.~\ref{alg: condit top down 1 upper m = 2^k bottom} with $\pi^{(1)}_1, \dots, \pi^{(1)}_{M}$, and only keep the first $m$ terms of the sample $(b_1,\dots,b_m,b_{m+1},\dots,b_{M})$.
Indeed, from Lemma~\ref{lemma: top down 1 upper m = 2^k bottom}
\begin{align*}
\pi(b_1,\dots,b_{M}\,|\,u) 
&= \frac{\pi^{(1)}_1(b_1) \dots \pi^{(1)}_{M}(b_{M})}{\pibuM(u)} \, \mathbb{1}_{u = b_1+\dots+b_{M}}\\
&= \frac{\pihat_1(b_1) \dots \pihat_m(b_m) \, \mathbb{1}_{b_{m+1} = 0} \dots \mathbb{1}_{b_{M} = 0}}{\pibuM(u)} \, \mathbb{1}_{u = b_1+\dots+b_{M}},
\end{align*}
where
\begin{align*}
\pibuM 
&= \pi^{(1)}_1 * \dots * \pi^{(1)}_{M} \\
&= \pihat_1 * \dots * \pihat_m * \delta_0 * \dots * \delta_0 \\
&= \pihat_1 * \dots * \pihat_m
= \pibum.
\end{align*}
By integrating out $b_{m+1},\dots,b_M$, we obtain
\begin{align*}
\pi(b_1,\dots,b_m\,|\,u) 
&= \int \pi(b_1,\dots,b_{M}\,|\,u) \, db_{m+1} \dots db_M \\
&= \int \frac{\pihat_1(b_1) \dots \pihat_m(b_m) \, \mathbb{1}_{b_{m+1} = 0} \dots \mathbb{1}_{b_{M} = 0}}{\pibum(u)} 
\, \mathbb{1}_{u = b_1+\dots+b_{M}} \, db_{m+1} \dots db_M \\
&= \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibum(u)} \, \mathbb{1}_{u = b_1+\dots+b_m}.
\end{align*}

\begin{remark}
% Since $pi * \delta_0 = \pi$, for any distribution $\pi$, there is no need in practice to run the algorithm for the ``missing'' nodes.
In practice, since $\pi * \delta_0 = \pi$ for any distribution $\pi$, when we run the algorithm there is no need to compute any of the convolutions with the ``missing'' nodes. 
For example, for the binary tree of Fig.~\ref{fig: fictitious hierarchy with delta}, when we reach $B^{(2)}_2$ we can stop as we have $B^{(1)}_3 = B^{(2)}_2$. 
\end{remark}


\subsection{Proof of Proposition~\ref{prop: properies top down k upper}}
\label{sec: proof properies top down k upper}


\begin{enumerate}[label={(\arabic*)}]
\item
Let $\,\BT \sim \pitd$, and $\UT := \Abf \BT$.
Then, the distribution of $\UT$ is given by
    \begin{align*}
        \pi_{\UT}(\ubf) 
        &= \sum_{\substack{\bbf: \; \Abf \bbf = \ubf}} \pitd(\bbf) \\
        &= \sum_{\substack{\bbf: \; \Abf \bbf = \ubf}} \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(\Abf \bbf)} \, \pihat_U(\Abf \bbf) \\
        &= \sum_{\substack{\bbf: \; \Abf \bbf = \ubf}} \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(\ubf)} \, \pihat_U(\ubf) \\
        &= \mathbb{1}_{\uupp = \Au \ulow} \; \frac{\pihat_U(\ubf)}{\pibu(\ubf)} \sum_{\substack{\bbf: \; \Abf \bbf = \ubf}} \pihat_1(b_1) \dots \pihat_m(b_m) \\
        &= \mathbb{1}_{\uupp = \Au \ulow} \; \pihat_U(\ubf).
    \end{align*}
Note that the indicator function is necessary, since ${1 / \pibu(\ubf)}$ can be pulled out of the sum only if $\pibu(\ubf) \neq 0$, which holds if
$\uupp = \Au \ulow$.

\item Let $\Abf \bB = \ubf = \Abf \bC$. Then
\begin{align*}
    \frac{\pitd(\bB)}{\pitd(\bC)} 
    &= \frac{\pihat_1(\bibar_1) \dots \pihat_m(\bibar_m) \, \pihat_U(\ubf)}{\pibu(\ubf)} \cdot 
    \frac{\pibu(\ubf)}{\pihat_1(\bicheck_1) \dots \pihat_m(\bicheck_m) \, \pihat_U(\ubf)} \\
    &= \frac{\pihat_1(\bibar_1) \dots \pihat_m(\bibar_m)}{\pihat_1(\bicheck_1) \dots \pihat_m(\bicheck_m)}.
\end{align*}
\end{enumerate}

\subsection{Proof of Proposition~\ref{prop: top down k upper m bottom}}
\label{sec: proof alg k upper}

We need to prove that
\begin{equation}\label{eq: claim proof prop 5}
\Big(\bB^{(1)},\dots,\bB^{(\klow)}\Big) \sim 
% \pitd(\bbf).
\frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(\Abf \bbf)} \, \pihat_U(\Abf \bbf).
\end{equation}

We denote $\bB^{(j)} = \big(\bB^{(j)}_1,\dots,\bB^{(j)}_{m_j}\big)$, for all $j=1,\dots,\klow$, and we drop the superscript $i$ for better readability.
First, from line 8 of Alg.~\ref{alg: top down k upper m bottom}, and from Lemma~\ref{lemma: top down 1 upper m = 2^k bottom}, we have that
\begin{equation*}
\bB^{(j)} \,|\, u_j \sim \frac{\pihat\big(\bB^{(j)}_1\big) \dots \pihat\big(\bB^{(j)}_{m_j}\big)}
{\pibu(u_j)}
\, \mathbb{1}_{u_j = \bB^{(j)}_1 + \dots + \bB^{(j)}_{m_j}},
\end{equation*}
and therefore
\begin{equation}\label{eq: proof prop 5 pi(b|u)}
\pi\Big(\bB^{(1)}, \dots, \bB^{(\klow)} \,\big|\, u_1, \dots, u_{\klow} \Big)
= \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}
{\pibu(u_1) \dots \pibu(u_{\klow})}
\; \mathbb{1}_{u_1 = \bB^{(1)}_1 + \dots + \bB^{(1)}_{m_1}} \dots \mathbb{1}_{u_{\klow} = \bB^{(\klow)}_1 + \dots + \bB^{(\klow)}_{m_{\klow}}}.
\end{equation}
Moreover, from line 4 of Alg.~\ref{alg: top down k upper m bottom} and Eq.~\eqref{eq: reconciled distribution}:
\begin{equation}\label{eq: proof prop 5 pi(u)}
\pi(u_1,\dots,u_{\klow}) \propto \pihat_U\big(\Au \ubf^{low},\, \ubf^{low}\big),
\end{equation}
where $\ubf^{low} = (u_1,\dots,u_{\klow})$.
Joining Eq.~\eqref{eq: proof prop 5 pi(b|u)} and Eq.~\eqref{eq: proof prop 5 pi(u)}, and integrating out $u_1,\dots,u_{\klow}$, we obtain 
\begin{align*}
\pi\Big(\bB^{(1)},\dots,\bB^{(\klow)}\Big)
&= \int \pi\Big(\bB^{(1)}, \dots, \bB^{(\klow)} \,\big|\, u_1, \dots, u_{\klow} \Big) \, \pi(u_1,\dots,u_{\klow}) \, d\ubf^{low} \\
&\propto \pihat_1(b_1) \dots \pihat_m(b_m) 
\int \frac{\pihat_U\big(\Au \ubf^{low},\, \ubf^{low}\big) 
\, \mathbb{1}_{u_1 = \bB^{(1)}_1 + \dots + \bB^{(1)}_{m_1}} \dots \mathbb{1}_{u_{\klow} = \bB^{(\klow)}_1 + \dots + \bB^{(\klow)}_{m_{\klow}}}}
{\pibu(u_1) \dots \pibu(u_{\klow})}
 \, d\ubf^{low} \\
&= \pihat_1(b_1) \dots \pihat_m(b_m) 
\int \frac{\pihat_U\big(\Au \ubf^{low},\, \ubf^{low}\big) 
\, \mathbb{1}_{u_1 = \bB^{(1)}_1 + \dots + \bB^{(1)}_{m_1}} \dots \mathbb{1}_{u_{\klow} = \bB^{(\klow)}_1 + \dots + \bB^{(\klow)}_{m_{\klow}}}}
{\sum\limits_{\substack{\bbf^{(j)}:\; b^{(j)}_1+\dots+b^{(j)}_{m_j} = u_j \\ j = 1,\dots,\klow}} \pihat(b^{(1)}_1) \dots \pihat(b^{(1)}_{m_1}) \dots \pihat(b^{(\klow)}_1) \dots \pihat(b^{(\klow)}_{m_{\klow}})}
 \, d\ubf^{low} \\
&\propto \frac{\pihat_1(b_1) \dots \pihat_m(b_m)}{\pibu(\Abf \bbf)} \, \pihat_U(\Abf \bbf). 
\end{align*}


\end{document}
