% \documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    % \bibliographystyle{plainnat}
    % \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% \usepackage[hidelinks]{hyperref}
\usepackage{url}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{amsthm}
\usepackage{enumitem}

\usepackage{algorithm}
\usepackage{algpseudocode}

\usepackage{cleveref}
\usepackage{wrapfig}
\usepackage[textwidth=1in, textsize=tiny]{todonotes}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}{Proposition}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{condition}[theorem]{Condition}
\newtheorem{assumption}[theorem]{Assumption}

\theoremstyle{definition}
\newtheorem{remark}{Remark}

\newcommand{\blue}[1]{{\color{blue}#1}}

\newcommand\blfootnote[1]{%
  \begingroup
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \endgroup
}

\usepackage{thmtools}
\usepackage{thm-restate}

\title{Computing Low-Entropy Couplings for Large-Support Distributions}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\vspace{-0.5em}Samuel Sokota}
\author[1]{Dylan Sam}
\author[2]{Christian Schroeder de Witt}
\author[3]{Spencer Compton}
\author[2]{Jakob Foerster}
\author[1,4]{J. Zico Kolter\vspace{-0.5em}}
% Add affiliations after the authors
\affil[1]{Carnegie Mellon University}
\affil[2]{University of Oxford}
\affil[3]{Stanford University}
\affil[4]{Bosch AI}
\affil[ ]{\texttt{ssokota@andrew.cmu.edu}}
  
\begin{document}
\maketitle

\begin{abstract}
Minimum-entropy coupling (MEC)---the process of finding a joint distribution with minimum entropy for given marginals---has applications in areas such as causality and steganography. 
However, existing algorithms are either computationally intractable for large-support distributions or limited to specific distribution types and sensitive to hyperparameter choices. This work addresses these limitations by unifying a prior family of iterative MEC (IMEC) approaches into a generalized partition-based formalism. From this framework, we derive a novel IMEC algorithm called ARIMEC, capable of handling arbitrary discrete distributions, and introduce a method to make IMEC robust to suboptimal hyperparameter settings.
These innovations facilitate the application of IMEC to high-throughput steganography with language models, among other settings.
Our codebase is available at \url{https://github.com/ssokota/mec}.
\end{abstract}

\section{Introduction}

\looseness=-1
Given two marginal distributions, a coupling is a bivariate joint distribution with the given marginals.
In general, there may be many couplings for a particular pair of marginals.
The problem of computing a coupling with the minimum amount of joint entropy among all feasible couplings is called minimum-entropy coupling (MEC) \citep{KOVACEVIC2015369}.
Further detailed in \citet{compton2023minimumentropy}, applications of MEC include causal inference \citep{Kocaoglu_Dimakis_Vishwanath_Hassibi_2017,compton2020entropic,Javidian:21,compton2022entropic}, communication \citep{pmlr-v162-sokota22a}, steganography \citep{witt2023perfectly}, random number generation \citep{li2021}, multimodal learning \citep{liang2023quantifying}, functional representations \citep{Cicalese2019MinimumEntropyCA}, and dimensionality reduction~\citep{Vidyasagar2012,Cicalese2016}.

\looseness=-1
While MEC is NP-hard \citep{KOVACEVIC2015369}, recent works have provided approaches that achieve provable approximations of MECs \citep{Kocaoglu_Dimakis_Vishwanath_Hassibi_2017,Cicalese2019MinimumEntropyCA,rossi2019,li2021,compton2022tighter,compton2023minimumentropy,shkel2023information} in log-linear time (i.e., $O(N \log N)$) in the cardinality of the support of the marginals.
Unfortunately, the supports of many distributions of interest, such as those of generative AI models, are intractably large for these provable approximation algorithms.

\looseness=-1
To handle such cases, \citet{pmlr-v162-sokota22a} introduced a class of heuristic algorithms for producing low-entropy couplings.
These algorithms work by iteratively coupling components of random vectors using provable MEC approximation algorithms in such a way that guarantees the aggregate joint distribution is a coupling.
In practice, both \citet{pmlr-v162-sokota22a} and \citet{witt2023perfectly} find that these iterative minimum-entropy coupling (IMEC) approaches produce low-entropy couplings for distributions with very large supports---binary images and trajectories of Atari games~\citep{ale} in the work of \citet{pmlr-v162-sokota22a} and binary strings and generative models (including GPT-2~\citep{Radford2019LanguageMA}, WaveRNN~\citep{kalchbrenner_efficient_2018}, and Image Transformer~\citep{image_transformer}) in the work of \citet{witt2023perfectly}.
Unfortunately, the applicability of the IMEC algorithms \citet{pmlr-v162-sokota22a} introduced is limited to problems in which one distribution either has small support or is factorable. 
Moreover, these algorithms can be sensitive to hyperparameter choices, requiring careful tuning for optimal performance. 
\emph{As a result, at the time of writing, there exist no techniques for producing low-entropy couplings of general large-support distributions}, let alone any that are also robust to hyperparameter settings.

\looseness=-1
In this work, we make multiple contributions regarding the IMEC line of research.
First, we unify existing IMEC algorithms under a single formalism using \emph{sets of partitions}, where each partition is over the sample space of one of the given marginals.
IMEC couples distributions by iteratively performing (approximate) MECs between a conditional distribution of one marginal and the posterior over the blocks of a partition associated with the other marginal.
In particular, at each iteration, IMEC uses a partition whose associated posterior maximizes entropy.

Leveraging this formalism, we derive the first algorithm for computing low-entropy couplings for arbitrary large-support distributions, which we call autoregressive IMEC (ARIMEC).
ARIMEC uses a set of partitions, which we call the prefix tree partition set, in which each partition corresponds to a node of the prefix tree of one of the sample spaces.
These prefix trees can have large numbers of nodes (and thereby induce large numbers of partitions).
Thus, to facilitate an efficient implementation, we introduce techniques to 1) lazily update the posterior over different blocks and 2) quickly search over partitions using pruning.

Finally, recognizing IMEC's general brittleness to partition set choice, we introduce a technique, called merging, to improve its robustness. At each iteration, this technique merges sample realizations into groups with identical posterior updates. Merging uses these groupings to perform additional MECs when entropy would otherwise be wasted due to suboptimal partition sets or other factors.

We empirically validate the utility of our innovations in two settings: Markov coding games \citep{pmlr-v162-sokota22a} and steganography \citep{cachin_perfect}. In Markov coding games, the objective is to encode messages into the trajectories of a Markov decision process while achieving a high expected return. In steganography, the goal is to embed sensitive information into innocuous content such that an adversary cannot detect the hidden information. Our results show that ARIMEC achieves substantially improved communication rates in both settings, illustrating its ability to use autoregressive prior information about realistic messages. Additionally, we demonstrate that merging significantly enhances IMEC's robustness to suboptimal partition set choices, thereby facilitating easier out-of-the-box application. Overall, our results suggest ARIMEC with merging as a practical approach to applications involving computing low-entropy couplings for large-support distributions, such as high-throughput steganography with language models.

\section{Background and Notation}

For our background, we formally introduce minimum-entropy coupling and discuss existing techniques for computing and (heuristically) approximating minimum-entropy couplings.
Thereafter, we introduce notation for partitions of sets, which we will later use to unify existing methods in one general framework.

\subsection{Minimum-Entropy Coupling}

We begin by formalizing the ideas of couplings and minimum-entropy couplings.

\begin{definition}[Coupling]
Let $\mu \colon \mathbb{X} \to [0, 1]$ be a probability distribution over a finite set $\mathbb{X}$ and let $\nu \colon \mathbb{Y} \to [0, 1]$ be a probability distribution over a finite set $\mathbb{Y}$.
A coupling of $\mu$ and $\nu$ is a bivariate joint probability distribution $\gamma \colon \mathbb{X} \times \mathbb{Y} \to [0, 1]$ that marginalizes to $\mu$ and $\nu$.
In other words, $\gamma$ satisfies
\begin{align} \label{cond:coupling1}
\sum_{x' \in \mathbb{X}} \gamma(x', y) &= \nu(y), \text{ for all } y \in \mathbb{Y},\\ 
\sum_{y' \in \mathbb{Y}} \gamma(x, y') &= \mu(x), \text{ for all } x \in \mathbb{X}.\label{cond:coupling2}
\end{align} 
We use $\Gamma(\mu, \nu) = \{\gamma \mid \gamma \text{ satisfies  (\ref{cond:coupling1}) \& (\ref{cond:coupling2})}\}$ 
to denote the set of all couplings for $\mu$ and $\nu$.
\end{definition}

\begin{definition}[Joint Entropy]
Given a coupling $\gamma$, the joint entropy is defined as \[\mathcal{H}(\gamma) = - \mathbb{E}_{(X, Y) \sim \gamma} \log \gamma(X, Y).\]
\end{definition}

Throughout the paper, we will use capital letters to denote random variables, as is done in the definition above.

\begin{definition}[Minimum-Entropy Coupling]
Given two marginal distributions $\mu, \nu$, a \textbf{minimum-entropy coupling} is a coupling $\gamma \in \Gamma(\mu, \nu)$ such that 
\[\mathcal{H}(\gamma) = \min \{\mathcal{H}(\gamma') \mid \gamma' \in \Gamma(\mu, \nu)\}.\]
\end{definition}


\subsection{Computing and Approximating Minimum-Entropy Couplings} \label{sec:amec}

\looseness=-1
While computing an exact minimum-entropy coupling is NP-hard \citep{KOVACEVIC2015369}, there has been a series of recent works that construct $O(N \log N)$ approximation algorithms, where $N$ is the size of the sample space.
\citet{Cicalese2019MinimumEntropyCA} introduce an approximation algorithm that they show guarantees a coupling within 1 bit of minimum entropy.
\citet{rossi2019} show that \citet{Kocaoglu_Dimakis_Vishwanath_Hassibi_2017}'s greedy approach guarantees a coupling within 1 bit of minimum entropy.
\citet{li2021} introduces a third approach for which he also proves a 1-bit approximation guarantee.
Most recently, \citet{compton2023minimumentropy} show an improved guarantee for \citet{Kocaoglu_Dimakis_Vishwanath_Hassibi_2017}'s greedy approach of about 0.53 bits, while also showing that \citet{Cicalese2019MinimumEntropyCA} and \citet{li2021}'s algorithms cannot match this guarantee.
\citet{compton2023minimumentropy} also give approaches that guarantee exact MECs, though they require exponential time.

\subsection{Iterative Minimum-Entropy Coupling with a Tabular Posterior}

In some settings, it is desirable to (non-provably) approximate minimum-entropy couplings where one random variable is a vector ranging over such a large number of possible outcomes that the approaches described in \Cref{sec:amec} are inapplicable.
\citet{pmlr-v162-sokota22a} propose an iterative approach to such settings that assumes that this random vector is autoregressively specified.
In this work, we refer to \citet{pmlr-v162-sokota22a}'s algorithm as tabular IMEC (TIMEC).
TIMEC guarantees that the resulting joint distribution is a coupling, supports conditional sampling and likelihood queries for both $X \mid Y$ and $Y \mid X$, where $Y$ is the random vector, and heuristically achieves low entropy.
It can either be defined using the conditional generative process for sampling $Y \mid X$ or the conditional generative process for sampling $X \mid Y$, as both induce the same joint distribution.
We focus on the process for generating $Y \mid X$, which is formalized in \Cref{alg:timec1}, in the main body but include the process for generating $X \mid Y$ in \Cref{alg:timec2} in \Cref{app:inverse}.
\Cref{alg:timec1} works iteratively in two steps: 
\begin{enumerate}[leftmargin=*]
    \item First, it performs an (approximate) MEC between the posterior over $X$ given $Y_{1:j-1}$ (inductively defined via Bayes' Theorem) and the conditional distribution $\nu(Y_j \mid Y_{1:j-1})$.\footnote{Note that we use upper-bound-inclusive indexing, so $Y_{1:0}=()$, $Y_{1:1}=(Y_1)$, $Y_{1:2}=(Y_1, Y_2)$, etc.}
    The joint posterior over $X$ and $Y_j$ given $Y_{1:j-1}$ is assigned to the output of this coupling.
    \item Second, it samples $Y_j$ from the posterior over $Y_j$ given both $X=x$ and $Y_{1:j-1}$ (also inductively defined via Bayes' Theorem). 
\end{enumerate}

\begin{algorithm}[t]
    \caption{Tabular IMEC: $Y \mid X=x$} \label{alg:timec1}
    \begin{algorithmic}
        \Procedure{TIMEC}{$\mu$, $\nu$, $x$}
        \State $\gamma(X) \gets \mu(X)$
            \For{$j=1, \dots,  m$}
                \State $\gamma(X, Y_j \mid Y_{1:j-1}) \gets \text{MEC}(\gamma(X \mid Y_{1:j-1}),$ \State \hspace{38.35mm} $\nu(Y_j \mid Y_{1:j-1}))$
                \State $Y_j \sim \gamma(Y_j \mid x, Y_{1:j-1})$
            \EndFor
            \State return $Y$
        \EndProcedure
    \end{algorithmic}
\end{algorithm}

\subsection{Iterative Minimum-Entropy Coupling with a Factored Posterior}

Unfortunately, requiring approximate MECs over distributions of size $|\mathbb{X}|$ makes TIMEC inapplicable to many settings, such as steganography with large message sizes \citep{witt2023perfectly}.
To ameliorate this issue, \citet{pmlr-v162-sokota22a} also proposed a second approach, which we refer to as factored IMEC (FIMEC)\footnote{\citet{witt2023perfectly} use the name iMEC for this approach.}, in which $X$ is also assumed to be a random vector.
Furthermore, crucially, it is assumed to be factorable.

\begin{assumption}[Factorability] \label{ass:fact}
$X=(X_1, \dots, X_n)$ is a random vector with $\mu(x) = \prod_i \mu(x_i)$ for all $x \in \mathbb{X}$.
\end{assumption}

As with TIMEC, FIMEC guarantees that the resulting distribution is a coupling, supports likelihood queries to both conditionals and the joint distribution, and heuristically achieves low entropy.
It can similarly be defined in terms of either conditional generative process ($X \mid Y$ or $Y \mid X$).
We again focus on the $Y \mid X$ case (\Cref{alg:fimec1}), and defer the $X \mid Y$ case to \Cref{app:inverse}.
The basic structure of \Cref{alg:fimec1} is analogous to that of \Cref{alg:timec1}.
However, rather than performing MECs using $\gamma(X \mid Y_{1:j-1})$, FIMEC uses $\gamma(X_{i^{\ast}} \mid Y_{1:j-1})$, where $X_{i^{\ast}}$ is a component of $X$ with maximum posterior entropy.
The other components $X_i$ for $i \neq i^{\ast}$ are left independent of $Y_j \mid Y_{1:j-1}$.


\begin{algorithm}[t]
    \caption{Factored IMEC: $Y \mid X{=}x$} 
    \begin{algorithmic}
        \Procedure{FIMEC}{$\mu$, $\nu$, $x$}
        \State $\gamma(X) \gets \mu(X)$
            \For{$j=1, \dots,  m$}
                \State $i^{\ast} \gets \arg\max_i \mathcal{H}(\gamma(X_i \mid Y_{1:j-1}))$
                \State $\gamma(X_{i^{\ast}}, Y_j \mid Y_{1:j-1}) \gets \text{MEC}(\gamma(X_{i^{\ast}} \mid Y_{1:j-1}),$
                \hspace{44.16mm}$\nu(Y_j \mid Y_{1:j-1}))$
                \State $\gamma(X, Y_j \mid Y_{1:j-1}) \gets \gamma(X_{i^{\ast}}, Y_j \mid Y_{1:j-1}) \cdot$ \State
                \hspace{26.52mm} $\prod_{i \neq i^{\ast}} \gamma(X_i \mid Y_{1:j-1})$
                \State $Y_j \sim \gamma(Y_j \mid x, Y_{1:j-1})$
            \EndFor
            \State return $Y$
        \EndProcedure
    \end{algorithmic}
    \label{alg:fimec1}
\end{algorithm}

\subsection{Partitions of Sets}

As discussed in the introduction, we will show the IMEC algorithms discussed in the previous two subsections can be unified into a single algorithm using partitions over $\mathbb{X}$. We use the following definitions and notation for partitions.
\begin{definition}[Partition]
A partition $\mathcal{P}$ of a set $\mathbb{X}$ is a collection of blocks $\{\mathbb{B}_1, \dots, \mathbb{B}_{\ell}\}$ where: 
\begin{enumerate}[nosep,leftmargin=*]
    \item Each block is a subset of $\mathbb{X}$.
    \item Each pair of distinct blocks has an empty intersection.
    \item The union of blocks is equal to $\mathbb{X}$.
\end{enumerate}
\end{definition}

\begin{definition}[Block Function]
For a partition $\mathcal{P}$ of a set $\mathbb{X}$, the block function $\mathcal{B}_{\mathcal{P}} \colon \mathbb{X} \to \mathcal{P}$ maps $x$ to the block of the partition of which it is an element.
When $X$ is a random variable, we use $B_{\mathcal{P}} = \mathcal{B}_{\mathcal{P}}(X)$ to denote the block of $\mathcal{P}$, as a random variable, to which $X$ belongs.
\end{definition}

Note that the probability distribution of $B_{\mathcal{P}}$ is defined by \[\mu(B_{\mathcal{P}}=\mathbb{B}) = \mu(X \in \mathbb{B}) = \sum_{x \in \mathbb{B}}\mu(X=x).\]

\section{A Unification of Iterative Minimum-Entropy Coupling}

We are now ready to describe our unification of existing IMEC algorithms.
In this unification, different instances of IMEC are specified using different sets of partitions 
\[\mathfrak{P} \subset \{\mathcal{P} \mid \mathcal{P} \text{ is a partition of } \mathbb{X}\}.\]
Instances of this unified perspective guarantee that the resulting distribution is a coupling, support conditional sampling and likelihood queries for both $X \mid Y$ and $Y \mid X$, and heuristically produce low entropy.
We define this unified perspective on IMEC using the conditional generative process given in \Cref{alg:imec1}, which samples from $Y \mid X$.
(Equivalently, it is defined by the generative process given in \Cref{alg:imec2} in \Cref{app:inverse}, which samples from $X \mid Y$.)
\Cref{alg:imec1} works iteratively in three~steps:
\begin{enumerate}[leftmargin=*]
    \item First, it computes a partition $\mathcal{P} \in \mathfrak{P}$ inducing a maximum-entropy posterior. The entropy induced by a partition $\mathcal{P}$ at iteration $j$ is defined in terms of the probabilities over the blocks of the partition under $\gamma$, given $Y_{1:j-1}$.
    That is,
    \begin{align*}
    &\mathcal{H}(\gamma(B_{\mathcal{P}} \mid Y_{1:j-1}))\\ 
    &= - \sum_{\mathbb{B} \in \mathcal{P}} \gamma(X \in \mathbb{B} \mid Y_{1:j-1}) \log \gamma(X \in \mathbb{B} \mid Y_{1:j-1}).
    \end{align*}
    The intuition behind selecting the maximum-entropy partition is that it heuristically offers the opportunity to reduce the joint entropy by the largest amount.\footnote{A justification is as follows. Recall that \[\max(\mathcal{H}(C), \mathcal{H}(D)) \leq \mathcal{H}(C, D) \leq \mathcal{H}(C) + \mathcal{H}(D),\] where $\mathcal{H}(C, D)$ achieves its upper bound when $C$ and $D$ are independent. Thus, the maximum reduction in joint entropy achievable by performing a coupling is upper bounded by \[\mathcal{H}(C) + \mathcal{H}(D) - \max(\mathcal{H}(C), \mathcal{H}(D)) = \min(\mathcal{H}(C), \mathcal{H}(D)).\]
    Therefore, maximizing $\mathcal{H}(C)$ maximizes an upper bound on the joint entropy reduction.}
    \item Second, it performs an (approximate) MEC between the posterior over the blocks of the chosen partition $\mathcal{P}$ and the conditional distribution $\nu(Y_j \mid Y_{1:j-1})$.
    The joint posterior over the block $B_{\mathcal{P}}$ and $Y_j$ given $Y_{1:j-1}$ is assigned to the output of this coupling.
    \item  Third, it samples $Y_j$ from the posterior over $Y_j$ given both the block $\mathcal{B}_{\mathcal{P}}(x)$ and $Y_{1:j-1}$. 
\end{enumerate}

\begin{algorithm}[t]
    \caption{IMEC (Generic Form): $Y \mid X=x$} \label{alg:imec1}
    \begin{algorithmic}
        \Procedure{IMEC}{$\mu$, $\nu$, $x$, $\mathfrak{P}$}
        \State $\gamma(X) \gets \mu(X)$
            \For{$j=1, \dots,  m$}
                \State $\mathcal{P} \gets \arg\max_{\mathcal{P} \in \mathfrak{P}} \mathcal{H}(\gamma(B_{\mathcal{P}} \mid Y_{1:j-1}))$
                \State $\gamma(B_{\mathcal{P}}, Y_j \mid Y_{1:j-1}) \gets \text{MEC}(\gamma(B_{\mathcal{P}} \mid Y_{1:j-1}),$
                \State \hspace{42.25mm} $\nu(Y_j \mid Y_{1:j-1}))$
                \State $Y_j \sim \gamma(Y_j \mid \mathcal{B}_{\mathcal{P}}(x), Y_{1:j-1})$
            \EndFor
            \State return $Y$
        \EndProcedure
    \end{algorithmic}
\end{algorithm}

\subsection{Theory}

The general form of IMEC possesses the following two properties, which reduce to the results of \citet{pmlr-v162-sokota22a} as a special case.
\begin{restatable}[Coupling]{proposition}{coupling}
IMEC induces a coupling of $\mu$ and $\nu$. 
\label{prop:coupling}
\end{restatable}

\begin{restatable}[Greediness]{proposition}{greediness}  
If the partition of singletons is in $\mathfrak{P}$, IMEC approximately minimizes $\mathcal{H}(X, Y_{1:j})$ subject to $\mu, \nu, \gamma(X, Y_{1:j-1})$ on the $j$th iteration, for each $j$. 
\label{prop:greediness}
\end{restatable}
Proofs for these statements are provided in \Cref{app:coupling} and \Cref{app:greediness}, respectively.

The general form of IMEC also possesses the following runtime guarantee, which says that IMEC can be implemented efficiently whenever maximum-entropy posterior partition computation is efficient.
\begin{restatable}[IMEC Runtime]{proposition}{imec} \label{prop:imec}
Given a polynomial-time function for computing the maximum-entropy posterior partition, IMEC can be implemented in polynomial time in $\max_j |\mathbb{Y}_j|, \max_i |\mathbb{X}_i|, m, n$.
\end{restatable}
We prove \Cref{prop:imec} in \Cref{app:complexity}.

\subsection{Special Cases}

\paragraph{Tabular Posterior} To implement TIMEC using \Cref{alg:imec1}, we can select the partition set $\mathfrak{P}$ to be the set of all partitions of $\mathbb{X}$.
As per \Cref{lemma:trivial-partition}, which is stated and derived in \Cref{app:remarks}, the partition of singletons (or a partition that is equivalent up to measure zero) will always be selected, as it achieves maximum entropy.
Coupling with the partition of singletons is equivalent to coupling over the whole set, which is exactly what TIMEC does.

Using \Cref{prop:imec}, we derive the following runtime guarantee for TIMEC in \Cref{app:complexity}.
\begin{restatable}[TIMEC Runtime]{corollary}{timec} \label{prop:timec}
TIMEC can be implemented in polynomial time in $\max_j |\mathbb{Y}_j|, |\mathbb{X}|, m$.
\end{restatable}

Note that TIMEC's polynomial-time guarantee is in contrast to a direct application of an approximate MEC algorithm, which would require exponential time as a function of the same quantities.

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/factored.pdf}
    \caption{(Left) A set $\mathbb{X}$ of sequences  of length 3; (right) a partition $\mathcal{P}_2$ used by FIMEC.}
    \label{fig:factored-vis}
\end{figure}

\paragraph{Factored Posterior} To implement FIMEC using \Cref{alg:imec1}, we can select the partition set as $\mathfrak{P} = \{\mathcal{P}_1, \dots, \mathcal{P}_n\}$, where for each $i$, 
\[\mathcal{P}_i = \{\mathbb{X}_1 \times \dots \times \mathbb{X}_{i-1} \times \{x_i\}  \times \mathbb{X}_{i+1} \times \dots \times \mathbb{X}_n \mid x_i \in \mathbb{X}_i\}\]
and where $\mathbb{X}_i$ denotes the sample space for $X_i$.
An example is shown in \Cref{fig:factored-vis}.
From this perspective, selecting $\mathcal{P}_i$ on a particular iteration is equivalent to selecting $X_i$ as the component with which to couple.

Using \Cref{prop:imec}, we derive the following runtime guarantee for FIMEC in \Cref{app:complexity}.
\begin{restatable}[FIMEC Runtime]{corollary}{fimec} \label{prop:fimec}
Let \Cref{ass:fact} hold. Then FIMEC can be implemented in polynomial time in $\max_j |\mathbb{Y}_j|, \max_i | \mathbb{X}_i|, m, n$.
\end{restatable}

Note that FIMEC's polynomial-time guarantee is in contrast to a direct application of both an approximate MEC algorithm and TIMEC, which would each require exponential time in the same quantities.

\section{Iterative Minimum-Entropy Coupling with an Autoregressive Posterior}

Building on our unified framework, we now derive a new IMEC algorithm, which we call autoregressive IMEC (ARIMEC).
ARIMEC improves upon the applicability of FIMEC by eliminating the factorability assumption.
We present ARIMEC in two parts.
First, we introduce the \textit{prefix tree partition set}, which allows us to formally define ARIMEC using \Cref{alg:imec1}.
Second, we detail insights to make a practical implementation of ARIMEC.

\subsection{Mathematical Formalization}

\looseness=-1
In the framing of \Cref{alg:imec1}, the defining characteristic of IMEC algorithms is their partition sets. 
Therefore, to develop an IMEC algorithm of maximal applicability, it is essential to choose a partition set compatible with a universal model of distributions (i.e., one capable of representing any distribution). 
The autoregressive model, which decomposes a distribution over vectors into component-wise conditional distributions via the chain rule of probability, is one such universal model.
This section formalizes ARIMEC using a partition set specifically tailored to align with the tree-like output structure inherent in autoregressive models.


In order to define this partition set, which we call the prefix tree partition set, we first define prefixes.


\begin{figure*}[t]
    \centering
    \includegraphics[width=.9\linewidth]{figures/ar3.pdf}
    \caption{(Left) A set $\mathbb{X}$ of sequences of length 2; (middle) the prefix tree for $\mathbb{X}$; (right) the partition induced by the left-most depth one node of the prefix tree.}
    \label{fig:ar-vis}
\end{figure*}


\begin{definition}[Prefix/Extension]
We write $v \sqsubset v'$ to mean that $v$ is a prefix of $v'$ in the substring sense and, equivalently, that $v'$ is an extension of $v$ in the substring sense.
\end{definition}
\begin{definition}[Immediate Prefix/Extension]
We say $v$ is the immediate prefix of $v'$ and, equivalently, that $v'$ is the immediate extension of $v$, if $v \sqsubset v'$ and $v'$ is one character longer than $v$.
\end{definition}

Next, we define the \textit{prefix tree} of a set of vectors.
The prefix tree is a directed graph over prefixes of vectors with edges pointing to immediate extensions, as stated below.
(Note that our usage of the term is graph theoretic and does not pertain to the trie data structure.)

\begin{definition}[Prefix Tree]
The prefix tree for a set of vectors $\mathbb{X}$ is a directed graph $(\mathbb{V}, \mathbb{E})$, where
the vertex set 
\[\mathbb{V}= \{v \sqsubset x \mid x \in \mathbb{X}\}\] is the set of prefixes of elements of $\mathbb{X}$ and the set of edges 
\[\mathbb{E}=\{(v, c) \mid v, c \in \mathbb{V}, c \text{ is an immediate extension of } v\}\] is the set of pairs of vertices and their immediate extensions.
For distinct vertices $v, u$, we use the notation $\mathbb{V}_{v \to u}$ to mean the subset of $\mathbb{V} \setminus \{v\}$ touchable by paths that start from $v$ and contain $u$.
\end{definition}
We can view each vertex $v$ in the prefix tree as partitioning $\mathbb{X}$ in a manner that aligns with the sampling paths of autoregressive models.
This perspective naturally induces what we call the prefix tree partition set, where each partition corresponds to a prefix upon which an autoregressive model could be conditioned.

\begin{definition}[Prefix Tree Partition Set]
Let $(\mathbb{V}, \mathbb{E})$ be the prefix tree for $\mathbb{X}$.
Then the \textbf{prefix tree partition set} is defined as $\mathfrak{P} = \{\mathcal{P}_v \mid v \in \mathbb{V}\}$,
where
\[\mathcal{P}_v = \{\mathbb{B}_{c \sqsubset} \mid (v, c) \in \mathbb{E}\} \cup \{\mathbb{B}_{v \not \sqsubset}\} \cup \{ \mathbb{B}_{v =}\},\]
and where
\begin{itemize}[nosep,leftmargin=*]
    \item $\mathbb{B}_{c\sqsubset}= \{x \in \mathbb{X} \mid c \sqsubset x\}$ denotes the subset of $\mathbb{X}$ that is an extension of the child $c$;
    \item $\mathbb{B}_{v \not \sqsubset}= \{x \in \mathbb{X} \mid v \not \sqsubset x\}$ denotes the subset of $\mathbb{X}$ that does not extend $v$;
    \item $\mathbb{B}_{v =} = \{x \in \mathbb{X} \mid v = x\}$ denotes the (either singleton or empty) subset of $\mathbb{X}$ equal to $v$.
\end{itemize}
For distinct vertices $v, u$, we use the notation $\mathbb{B}_{v \to u}$ to mean the subset of $\mathcal{P}_v \setminus \{\mathbb{B}_{v =}\}$ touchable by paths that start from $v$ and contain $u$.
\end{definition}
A visualization of the prefix tree and one partition that it induces is shown in \Cref{fig:ar-vis}.

Having defined the prefix tree partition set, ARIMEC's formalization is immediate.

\begin{definition}[ARIMEC] \label{def:arimec}
\textbf{ARIMEC} is the instance of \Cref{alg:imec1} in which the set of partitions $\mathfrak{P}$ is selected to be the prefix tree partition set.
\end{definition}

\begin{figure*}[t!]
    \centering
    \includegraphics[width=0.85\textwidth]{figures/arimec_vis.pdf}
    \caption{Visualization of two iterations of ARIMEC.}
    \label{fig:vis_arimec}
\end{figure*}

\begin{figure*}[t!]
    \centering
    \includegraphics[width=0.3\textwidth]{figures/tree.pdf}
    \caption{Path shown down the prefix tree corresponding to the procedure in \Cref{fig:vis_arimec}.}
    \label{fig:arimec_tree}
\end{figure*}

ARIMEC can be thought of as, at each iteration, operating at a working prefix $v$ whose associated partition $\mathcal{P}_v$ maximizes posterior entropy.
As information is gained, the likelihood of one of the blocks (such as some $\mathbb{B}_{c \sqsubset}$) will become large.
As a result, the entropy associated with the working prefix's partition $\mathcal{P}_v$ will become small, causing the (entropy-maximizing) working prefix to change---often to a child of the existing working prefix.
Over iterations, the working prefix will tend to traverse downward in the prefix tree toward the true value of $X$.
However, it is also possible for it to move upward if the probability of $\mathbb{B}_{v \not \sqsubset}$ (for working prefix $v$) becomes large.
This backtracking mechanism allows ARIMEC to recover from cases in which the working prefix deviates from prefixes of $X$.


We provide visual intuition for ARIMEC in \Cref{fig:vis_arimec}, showing example iterations for marginals of length two and its corresponding path down the tree in \Cref{fig:arimec_tree}.

\subsection{Efficient Implementation}

While \Cref{def:arimec} formalizes ARIMEC at a mathematical level, constructing a practical implementation is challenging due to the exponentially large number of nodes in the prefix tree, which makes naive maximum-entropy posterior partition computations intractable.
To address this challenge, we propose a procedure that seeks to prove a maximum-entropy partition by searching over as few partitions as possible, while lazily and (provably) efficiently computing the posterior (and posterior entropy) of each partition that it does search.
This procedure has two components.
The first is a polynomial time algorithm for lazily computing posteriors (and posterior entropies) for particular partitions.
The second is a search procedure for finding a maximum-entropy partition that prunes partitions that are provably not maximum entropy.
In practice, we observe that the procedure is highly efficient, often only requiring the evaluation of one or two nodes to prove a maximum-entropy partition, though we do not formally prove its runtime complexity; see \Cref{app:search} for further details.

\paragraph{Posterior Updates}

The core idea behind our approach to posterior updates is that, given an updated posterior for the partition associated to node $v$, we can immediately derive the posterior of the partition for any adjacent node $u$.
The posterior over $\mathcal{P}_u$ is dictated by two rules.
First, that \[\gamma(\mathbb{B}_{u \to v} \mid Y_{1:j}) = 1 - \gamma(\mathbb{B}_{v \to u} \mid Y_{1:j}),\] by the complement law of probability.
Second, that \[\gamma(\mathbb{B} \mid Y_{1:j}) \propto \gamma(\mathbb{B} \mid Y_{1:j-1})\] for $\mathbb{B} \in \mathcal{P}_u \setminus \mathbb{B}_{u \to v}$, as direct evidence about $B_{\mathcal{P}_v}$ does not differentiate between the elements of $\mathcal{P}_u \setminus \mathbb{B}_{u \to v}$.
These ideas are formalized in \Cref{lem:bayes}.

We can compute the posterior for any partition $\mathcal{P}_u$ in polynomial time by iteratively applying \Cref{lem:bayes} to the partitions along the undirected path from $v$ to $u$, as is stated below and proven in \Cref{app:post}.
\begin{restatable}[Posterior Updates]{proposition}{posterior} \label{prop:posterior}
Assume that the posterior over a partition is updated if and only if its corresponding node is touched and that nodes are touched by traversing edges of the tree (i.e., without jumps).
Let $\mathcal{P}_v$ be a partition whose posterior was updated on iteration $j$.
If $v, u$ are neighbors and $u$ was last visited on iteration $j' \leq j$, then the iteration $j$ posterior for any partition $\mathcal{P}_u$ can be computed in polynomial time in $\max_i |\mathbb{X}_i|, n$.
\end{restatable}

 \paragraph{Maximum-Entropy Posterior Partition Search}

The core idea behind our search procedure is to prune nodes of the prefix tree whose partitions provably cannot be maximal entropy.
To prune nodes, we make use of an upper bound on entropy.
This upper bound---stated formally in \Cref{lemma:ent_ub}---shows, roughly speaking, that the entropy of any distribution with one sufficiently probable element cannot exceed the entropy of the distribution that would divide the remaining mass uniformly.

We can apply this upper bound on large numbers of nodes simultaneously:
If a prefix $v$ is unlikely, then $\gamma(\mathbb{B}_{u \to v} \mid Y_{1:j})$ will be large for every $u$ in the subtree rooted at $v$.
On the other hand, if a prefix $v$ is likely, then $\gamma(\mathbb{B}_{u \to v} \mid Y_{1:j})$ will be large for every $u$ in the complement of the subtree rooted at $v$.
We prove this result, stated in \Cref{prop:maxent} below, in \Cref{app:ent}.

 \begin{restatable}[Maximum-Entropy Partition]{proposition}{maxent} \label{prop:maxent}
Let 
\[\kappa \geq \max_{\mathcal{P} \in \mathfrak{P}} |\{\mathbb{B} \in \mathcal{P} \mid \mu(\mathbb{B}) > 0\}|\] be an upper bound on the number of blocks with positive probability.
Define 
\[\mathcal{U} \colon q \mapsto - q \log q - (1 - q) \log \frac{1 - q}{\kappa - 1}.\]
For any neighbor $u$ of $v$, if 
\[\gamma(\mathbb{B}_{v \to u} \mid y_{1:j}) < 1-1/ \kappa,\]
then, for all $u' \in \mathbb{V}_{v \to u}$, \[\mathcal{H}(\gamma(B_{\mathcal{P}_{u'}} \mid y_{1:j})) \leq \mathcal{U}(\gamma(\mathbb{B}_{u \to v} \mid y_{1:j})).\]
\end{restatable}

Using \Cref{prop:maxent}, we can prove a maximum-entropy partition by searching only over the nodes for which we cannot prove an upper bound that is smaller than a previously observed entropy, as is described in \Cref{alg:maxentpartition}.

\begin{algorithm}[t]
    \caption{Maximum-Entropy Partition Search} \label{alg:maxentpartition}
    \begin{algorithmic}
        \Procedure{MaxEntPartition}{$v, \gamma(\cdot \mid Y_{1:j})$}
        \State $\text{queue} \gets [\mathcal{P}_v]$
        \While{$\text{queue non-empty}$}
        \State $\mathcal{P}_{u} \gets \text{queue.pop}()$
        \If{$\mathcal{H}(\gamma(B_{\mathcal{P}_{u}} \mid Y_{1:j}))$ is max ent so far}
        \State $\text{max ent partition} \gets \mathcal{P}_{u}$
        \EndIf
        \For{each node $u'$ adjacent to $u$}
        \State $q \gets \gamma(\mathbb{B}_{u \to u'} \mid Y_{1:j})$
        \If{$q > 1 - \frac{1}{ \kappa}$ \text{or} $\mathcal{U}(1 - q)  > \text{max ent so far}$}
        \State $\text{queue.append($\mathcal{P}_{u'}$)}$
        \EndIf
        \EndFor
        \EndWhile
        \State return $\text{max ent partition}$
        \EndProcedure
    \end{algorithmic}
\end{algorithm}

\section{Mitigating Entropy Waste via Merging} \label{sec:waste}

One suboptimality of ARIMEC, and more generally of all IMEC algorithms, results from the fact that, on a particular iteration, it may be the case that 
\[\mathcal{H}(\nu(Y_j \mid Y_{1:j-1})) - \mathcal{H}(\gamma(B_{\mathcal{P}} \mid Y_{1:j-1})) > 0.\]
In such a case, IMEC necessarily wastes at least $\mathcal{H}(\nu(Y_j \mid Y_{1:j-1})) - \mathcal{H}(\gamma(B_{\mathcal{P}} \mid Y_{1:j-1}))$ bits of information because $Y_j$ possesses more information than is necessary to encode $B_{\mathcal{P}}$.
This waste can stem from bad hyperparameter selection (i.e., the partitions in $\mathfrak{P}$ are low entropy) or reduced uncertainty about $X$ due to previous approximate MECs.
While the latter is typically desirable (as it indicates we've achieved low conditional entropy), the former can negatively impact performance \citep{witt2023perfectly}. 

To address this, we introduce a technique that we call merging.
At each iteration $j$, after performing a coupling, merging groups the possible realizations of $Y_j$ by the posterior update over $B_{\mathcal{P}}$ they induce.
Instead of sampling a realization of $Y_j$, merging samples one of these groups. If the sampled group contains multiple elements, merging performs an additional coupling between the posterior over that group and the new maximum-entropy partition. This process repeats until the sampled group consists of a single element, at which point iteration $j+1$ begins.

\paragraph{An Example of Merging} To illustrate this process, consider a case in which 
\[\nu(Y_j \mid Y_{1:j-1}) = [1/4, 1/4, 1/2],\] \[\gamma(B_{\mathcal{P}} \mid Y_{1:j-1}) = [1/2, 1/2].\]
Then the following is a minimum-entropy coupling:
\begin{center}
\begin{tabular}{| c | c | c | c|}
\hline
 & $y_1$ & $y_2$ & $y_3$\\
\hline
$b_1$ & 1/4 & 1/4 & 0\\
$b_2$ & 0 & 0 & 1/2\\
\hline
\end{tabular}.
\end{center}
In this coupling, the posterior over $B_{\mathcal{P}}$ remains the same whether $Y_j$ is realized as $y_1$ or $y_2$ (specifically, the probability of $b_1$ is one), indicating wasted entropy over $\{y_1, y_2\}$. Merging post-processes such couplings to yield:
\begin{center}
\begin{tabular}{| c | c | c |}
\hline
 & $\{y_1, y_2\}$ & $\{y_3\}$\\
\hline
$b_1$ & 1/2 & 0\\
$b_2$ & 0 & 1/2\\
\hline
\end{tabular}.
\end{center}
Under merging, if the group $\{y_1, y_2\}$ is sampled, the subsequent coupling is performed between the new maximum-entropy partition and
\[\nu(Y_j \mid Y_{1:j-1}, Y_j \in \{y_1, y_2\}).\] 
If the sampled group has more than two elements, this process may repeat multiple times before proceeding to iteration $j+1$.

\section{Experiments} \label{sec:exp}

To demonstrate the effectiveness of ARIMEC and merging, we perform experiments in two settings: Markov coding games \citep{pmlr-v162-sokota22a} and steganography~\citep{cachin_perfect}.

\subsection{Markov Coding Games}

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/mcg2.pdf}
    \caption{Results for the Markov coding games CodeCart and CodePong using MaxEntRL policies with different temperatures with 95\% bootstrap confidence intervals drawn from 100 games.}
    \label{fig:mcg}
\end{figure}

In a Markov coding game (MCG) \citep{pmlr-v162-sokota22a}, the goal is to communicate messages via the trajectories of a Markov decision process (MDP), while simultaneously achieving a high expected return in the MDP.
Messages are sampled independently from a distribution known to both the player sending them and the player receiving them.
For a more complete description, see \Cref{app:mcg}.

\citet{pmlr-v162-sokota22a} give a principled approach to this setting called MEME that works in two steps.
First, MEME trains a maximum-entropy reinforcement learning (MaxEntRL) \citep{maxentrl} policy for the MDP.
(The intuition is that this policy balances between performing well in the MDP and having high bandwidth through which information can be communicated.)
Second, MEME computes (or approximates) a minimum-entropy coupling between the distribution over messages and, roughly speaking, the distribution over trajectories induced by the MaxEntRL policy.\footnote{To be more precise about the latter distribution requires nuance since environment transitions cannot be correlated with the message. See \citet{pmlr-v162-sokota22a} for details.}
MEME guarantees that the expected return in the MCG is the same as in the MDP; furthermore, at each time step, MEME greedily maximizes the amount of information encoded into the trajectory.
For a more complete description, see 
 \Cref{app:meme}.

Because the second step of MEME requires computing or approximating a MEC, prior to this work, it was only applicable to MCGs whose message distributions had small or factorable supports.
Thus our extension of IMEC to arbitrary distributions also serves as an extension of MEME to arbitrary MCGs.

To illustrate the benefits of MEME's extended applicability, we perform experiments in two MCGs based on Cartpole and Pong \citep{ale}, which we call CodeCart and CodePong, that were previously beyond MEME's applicability.
For these MCGs, the distribution over messages is dictated by GPT-2 \citep{Radford2019LanguageMA} with top-50 sampling.
For each game, we trained two policies using different entropy bonus temperatures that each achieved perfect scores in 100 of 100 games.
As a baseline, we compare against a naive version of MEME that assumes that the message was sampled from a uniform distribution over tokens and uses FIMEC.
Note that this baseline sacrifices MEME's expected return guarantee.

We show the rate at which trajectories are decoded incorrectly for each variant of IMEC in these settings (Figure \ref{fig:mcg}).
While both FIMEC and ARIMEC maintain perfect expected return in the MDP, ARIMEC produces substantially more efficient encodings.

\subsection{Steganography}

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/stego3.pdf}
    \caption{Results for information-theoretic steganography with 95\% bootstrap confidence intervals drawn from 100 samples.}
    \label{fig:stego}
\end{figure}

In steganography, the goal is to encode information (called plaintext) into innocuous-seeming content (called stegotext), such that an adversary would not realize that the innocuous-seeming content contains hidden information.
We consider two kinds of steganography for our experiments.

\paragraph{Information-Theoretic Steganography} The first is information-theoretic steganography \citep{cachin_perfect}, which seeks formal security guarantees.
\citet{witt2023perfectly} proved that this problem can be reduced to minimum-entropy coupling distributions of ciphertext (random bitstrings generated using shared private keys) with distributions of covertext (innocuous content).
For a more complete description, see \Cref{app:its}.

In this setting, \Cref{ass:fact} holds; thus, we would expect FIMEC to perform well relative to ARIMEC.
We show both the resulting joint entropy and the rate at which the ciphertext is decoded incorrectly in Figure \ref{fig:stego}, using 100 tokens sampled from GPT-2 as the covertext.
This error rate can be written as 
$\mathbb{E}_{X \sim \mathcal{X}} \mathbb{E}_{Y \sim \gamma(Y \mid X)} I[X \neq \arg \max_{x} \gamma(x \mid Y)]$.
Interestingly, while FIMEC produces lower joint entropy than ARIMEC, ARIMEC appears to produce a lower error rate.
This could be because ARIMEC focuses on maximizing the certainty of the bytes earlier in the string, while FIMEC focuses on reducing the uncertainty about the most uncertain bytes.

\begin{figure}
    \centering
    \includegraphics[width=.8\linewidth]{figures/encoding.pdf}
    \caption{Results for unencrypted steganography with 95\% bootstrap confidence intervals drawn from 100 samples.}
    \label{fig:encoding}
\end{figure}

\paragraph{Linguistic Steganography} 
The second setting is linguistic steganography, a broader concept than language-based information-theoretic steganography. Unlike the latter, linguistic steganography does not necessarily involve shared private keys. While not using private keys results in less robust security guarantees, it offers two significant advantages. First, applicability is widened to settings in which a private key exchange is not possible. Second, potential information throughput is much higher, as the sender can use realistic priors about plaintext messages, which have significantly lower entropy compared to the uniform distribution of ciphertexts in information-theoretic steganography.

In the specific setting we consider, we aim to encode the output of one language model into that of another language model. The first language model acts as a prior on the plaintext messages that the sender may send, while the second language model serves as an approximate covertext distribution. In practice, one could achieve high throughput by first constructing a desired plaintext message and then translating it into a semantically equivalent message with a high likelihood under the prior.

For our experiments, the covertext distribution is generated by sampling 100 tokens from GPT-2 with the prompt ``Here's an innocuous message:'' and the plaintext message distribution is generated by GPT-2 with the prompt ``Here's a secret message:''. We compared ARIMEC with the correct prior against FIMEC under an assumed uniform prior over tokens. The results of this experiment are shown in \Cref{fig:encoding}. Our findings indicate that ARIMEC substantially outperforms FIMEC in terms of information throughput, reflecting its ability to leverage the prior.

\subsection{Merging}

To evaluate the performance of our merging technique, we consider a setting in which the objective is to transmit 10 bytes of ciphertext via GPT-2 stegotext.
The results of this experiment, which we conducted using FIMEC, are depicted in \Cref{fig:raw_vals}. 
The y-axis represents the joint entropy in bits, while the x-axis shows the dimension of the random vector $X$---i.e., $n$ in our notation.

As discussed in \Cref{sec:waste}, poor choices of partition sets can negatively impact the performance of IMEC. 
In this case, FIMEC's performance significantly decreases (i.e., joint entropy increases) with the number of components, even though the entropy of $X$ is held constant. However, our merging technique substantially reduces IMEC's vulnerability to this issue, as desired.

\begin{figure}
    \centering
    \includegraphics[width=\linewidth]{figures/raw_vals.pdf}
    \caption{Comparing merging and not merging with 95\% bootstrap confidence intervals drawn from 1000 samples.}
    \label{fig:raw_vals}
\end{figure}

\section{Conclusion and Future Work}

In this work, we investigated the problem of computing low-entropy couplings for large-support distributions, making four main contributions.
First, we unify existing algorithms under the formalism of partition sets.
Second, using this unified perspective, we introduce ARIMEC---the first approach to computing low-entropy couplings for large-support distributions that can be applied to arbitrary distributions.
Third, we increase the robustness of IMEC algorithms to the choice of partition set by introducing a merging technique.
Finally, we empirically show the utility of these innovations in MCG and steganography applications.

For future work, there are at least two application directions in which it would be interesting to push further with ARIMEC and merging.
First is linguistic steganography.
This direction is promising because ARIMEC can achieve high throughput rates, as we observed in \Cref{fig:encoding}, and because of the recent proliferation of effective language models.
Thus, there may be real-world settings in which it is applicable.
Second, because ARIMEC is the first IMEC algorithm capable of handling arbitrary discrete distributions, it potentially opens the door to using large-support distributions for other minimum-entropy coupling applications in which the distributions may not be factorable, such as entropic causal inference, random number generation, functional representations, and dimensionality reduction.

\section{Acknowledgements}

This work was supported by ONR grant \#N000142212121.

\bibliography{uai2024-template}
\bibliographystyle{abbrvnat}

\appendix
\newpage
\onecolumn


\section{Inverse Generative Processes} \label{app:inverse}

\begin{algorithm} 
    \caption{Tabular IMEC: $X \mid Y=y$} \label{alg:timec2}
    \begin{algorithmic}
        \Procedure{TIMEC}{$\mu$, $\nu, y$}
        \State $\gamma(X) \gets \mu(X)$        
        \For{$j=1, \dots, m$}
            \State $\gamma(X, Y_j \mid y_{1:j-1}) \gets \text{MEC}(\gamma(X \mid y_{1:j-1}), \nu(Y_j \mid y_{1:j-1}))$
        \EndFor
        \State $X \sim \gamma(X \mid y)$
        \State return $X$
        \EndProcedure
    \end{algorithmic}
\end{algorithm}

\begin{algorithm} 
    \caption{Factored IMEC: $X \mid Y=y$} \label{alg:fimec2}
    \begin{algorithmic}
        \Procedure{FIMEC}{$\mu$, $\nu, y$}
        \State $\gamma(X) \gets \mu(X)$        
        \For{$j=1, \dots, m$}
            \State $i^{\ast} \gets \text{argmax}_i \mathcal{H}(\gamma(X_i \mid y_{1:j-1}))$
            \State $\gamma(X_{i^{\ast}}, Y_j \mid y_{1:j-1}) \gets \text{MEC}(\gamma(X_{i^{\ast}} \mid y_{1:j-1}), \nu(Y_j \mid y_{1:j-1}))$
            \State $\gamma(X, Y_j \mid y_{1:j-1}) \gets \gamma(X_{i^{\ast}}, Y_j \mid y_{1:j-1}) \cdot \left(\prod_{i \neq i^{\ast}} \gamma(X_i \mid y_{1:j-1})\right)$
        \EndFor
        \State $X \sim \gamma(X \mid y)$
        \State return $X$
        \EndProcedure
    \end{algorithmic}
\end{algorithm}

\begin{algorithm} 
    \caption{IMEC (Generic Form): $X \mid Y=y$} \label{alg:imec2}
    \begin{algorithmic}
        \Procedure{IMEC}{$\mu$, $\nu$, $y$, $\mathfrak{P}$}
        \State $\gamma(X) \gets \mu(X)$
            \For{$j=1, \dots,  m$}
                \State $\mathcal{P} \gets \arg\max_{\mathcal{P} \in \mathfrak{P}} \mathcal{H}(\gamma(B_{\mathcal{P}} \mid y_{1:j-1}))$
                \State $\gamma(B_{\mathcal{P}}, Y_j \mid y_{1:j-1}) \gets \text{MEC}(\gamma(B_{\mathcal{P}} \mid y_{1:j-1}), \nu(Y_j \mid y_{1:j-1}))$
            \EndFor
            \State $X \sim \gamma(X \mid y)$
            \State return $X$
        \EndProcedure
    \end{algorithmic}
\end{algorithm}

\section{Theory} \label{app:proofs}

\subsection{Runtime Complexity} \label{app:complexity}

\imec*
\begin{proof}
Let $M= \max_j |\mathbb{Y}_j|$ and $N = \max_i | \mathbb{X}_i|$.

It suffices to show that each of the operations in the main loop requires only polynomial time, as the main loop runs $m$ times.
\begin{itemize}
    \item By assumption, the maximum-entropy posterior partition requires only polynomial time.
    \item Performing an approximate minimum-entropy coupling on distributions of size $O(\max(M, N))$ requires only polynomial time.
    \item Marginalizing a joint distribution for variables of support $O(M), O(N)$ requires only polynomial time.
    \item Sampling from a distribution of support $O(M)$ requires only polynomial time.
\end{itemize}
\end{proof}

\timec*
\begin{proof}
Let $M= \max_j |\mathbb{Y}_j|$. Per \Cref{prop:imec}, it suffices to show that maximum-entropy posterior partition computation is a polynomial time operation.
Per \Cref{lemma:trivial-partition}, the partition of singletons is always maximum entropy.
Thus, since computing the posterior over $X$ is polynomial time in $M, |\mathbb{X}|$, the result follows.
\end{proof}

\fimec*
\begin{proof}
Let $M= \max_j |\mathbb{Y}_j|$ and $N = \max_i | \mathbb{X}_i|$.
Per \Cref{prop:imec}, it suffices to show that maximum-entropy posterior partition computation is a polynomial time operation.
Since computing the posterior over each block---and the entropy of that posterior---is polynomial in $N, M$, and there are only $n$ blocks, the result follows.
\end{proof}

\subsection{Coupling} \label{app:coupling}
\coupling*
\begin{proof}
We proceed by induction on $m$.
For the base case, consider $m=1$.
Then for any $y \in \mathbb{Y}$
\begin{align}
\sum_{x \in \mathbb{X}} \gamma(x, y)
&= \sum_{x \in \mathbb{X}}\mu(x) \gamma(y \mid x) \label{step:chain-rule-base-case}\\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(1)}} \sum_{x \in \mathbb{B}} \mu(x) \gamma(y \mid \mathbb{B}) \label{step:block-sub}\\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(1)}} \gamma(y \mid \mathbb{B}) \sum_{x \in \mathbb{B}} \mu(x)\\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(1)}} \gamma(y \mid \mathbb{B}) \mu(\mathbb{B})\\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(1)}} \gamma(y, \mathbb{B}) \label{step:chain-rule2-base-case}\\
&= \nu(y), \label{step:coupling}
\end{align}
where $\mathcal{P}^{(m)}$ denotes the partition selected at step $m$. 
Step (\ref{step:chain-rule-base-case}) follows from chain rule; step (\ref{step:block-sub}) follows by construction; step (\ref{step:chain-rule2-base-case}) follows by chain rule; step (\ref{step:coupling}) follows by the definition of a coupling.
Now, assume the result holds up to $m = \bar{m}$, and consider $m = \bar{m} + 1$.
Observe, for any $y \in \mathbb{Y}$
\begin{align}
\sum_{x \in \mathbb{X}} \gamma(x, y)
&= \sum_{x \in \mathbb{X}}\mu(x) \gamma(y_{1:\bar{m}} \mid x) \gamma(y_{\bar{m} + 1} \mid x, y_{1:\bar{m}}) \label{step:chain-rule-induction1}\\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(\bar{m} + 1)}} \sum_{x \in \mathbb{B}} \gamma(y_{1:\bar{m}}, x) \gamma(y_{\bar{m} + 1} \mid \mathbb{B}, y_{1:\bar{m}}) \label{step:construction-induction}\\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(\bar{m} + 1)}} \gamma(y_{\bar{m} + 1} \mid \mathbb{B}, y_{1:\bar{m}}) \sum_{x \in \mathbb{B}} \gamma(y_{1:\bar{m}}, x) \\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(\bar{m} + 1)}} \gamma(y_{\bar{m} + 1} \mid \mathbb{B}, y_{1:\bar{m}}) \gamma(y_{1:\bar{m}}, \mathbb{B})\\
&= \sum_{\mathbb{B} \in \mathcal{P}^{(\bar{m} + 1)}} \gamma(y_{\bar{m} + 1}, \mathbb{B}, y_{1:\bar{m}}) \label{step:chain-rule2-induction}\\
&= \nu(y) \label{step:coupling-induction}.
\end{align}
Step (\ref{step:chain-rule-induction1}) follows from chain rule; step (\ref{step:construction-induction}) follows by construction; step (\ref{step:chain-rule2-induction}) follows by chain rule; step (\ref{step:coupling-induction}) follows by definition of a coupling.
\end{proof}

\subsection{Greediness} \label{app:greediness}

\greediness*
\begin{proof}
Consider that performing a coupling with the partition of singletons (or a partition that is equivalent to it up to elements with zero probability) is equivalent to performing a coupling with $\mathbb{X}$ itself.
Then, invoking Lemma \ref{lemma:trivial-partition}, it suffices to show that the statement holds for $\mathbb{X}$.

To see this, first recall
\begin{align*}
\mathcal{H}(X, Y) = \mathcal{H}(Y \mid X) + \mathcal{H}(X)
\end{align*}
Because the entropy of $X$ is fixed (as it is determined by its marginal $\mu$), minimum-entropy coupling is equivalent to minimum-conditional-entropy coupling.
Then, note that, by chain rule, we have
\begin{align*}
\mathcal{H}(Y_{1:j} \mid X) = \sum_{k=1}^j \mathcal{H}(Y_k \mid X, Y_{1:k-1}) = \mathcal{H}(Y_j \mid X, Y_{1:j-1}) + \sum_{k=1}^{j-1} \mathcal{H}(Y_k \mid X, Y_{1:k-1}).
\end{align*}
At iteration $j$, all terms below $j$ have already been determined.
Thus, the rightmost summation term is fixed and minimizing $\mathcal{H}(X, Y_{1:j})$ is reduced to minimizing $\mathcal{H}(Y_j \mid X, Y_{1:j-1})$.
By again invoking the equivalence between minimum-entropy coupling and minimum-conditional-entropy coupling, this is equivalent to minimizing $\mathcal{H}(X, Y_j \mid Y_{1:j-1})$, which is exactly what IMEC minimizes at iteration~$j$.
\end{proof}



\subsection{Condition Satisfaction for Special Cases} \label{app:remarks}

\begin{lemma} \label{lemma:trivial-partition}
Let $\mathfrak{P}$ be the set of all partitions over $\mathbb{X}$.
For any distribution over $\mathbb{X}$, any maximum-entropy partition is equivalent to the partition of singletons up to zero-probability elements.
\end{lemma}
\begin{proof}
Consider a block $\mathbb{B}$ of some partition $\mathcal{P}$ of $\mathbb{X}$.
The entropy that $\mathbb{B}$ contributes is 
\[-\gamma(\mathbb{B}) \log \gamma(\mathbb{B}).\]
The first derivative of this function is
\[- \log \gamma(\mathbb{B}) - 1.\]
The second derivative is
\[- \frac{1}{\gamma(\mathbb{B})}.\]
Since the second derivative is always negative, the contribution of $\mathbb{B}$ to the total entropy is strictly concave.
Thus, further subdividing $\mathbb{B}$ increases its contribution to the total entropy, up to elements with zero probability.
\end{proof}

\subsection{Posterior Updates} \label{app:post}

\begin{restatable}[Posterior Updates]{lemma}{bayes} \label{lem:bayes}
Let $(\mathbb{V}, \mathbb{E})$ be the prefix tree for $\mathbb{X}$.
Assume that the posterior over a partition is updated if and only if its corresponding node is touched and that nodes are touched by traversing edges of the tree (without jumps).
Let $\mathcal{P}_v$ be a partition whose posterior was updated on iteration $j$.
If $v, u$ are neighbors and $u$ was last visited on iteration $j' \leq j$, then
\[\gamma(\mathbb{B}_{u \to v} \mid Y_{1:j}) = 1 - \gamma(\mathbb{B}_{v \to u} \mid Y_{1:j})\]
and, for $\mathbb{B}' \in \mathcal{P}_u$ such that $\mathbb{B}' \neq \mathbb{B}_{u \to v}$,
\[\gamma(\mathbb{B}' \mid Y_{1:j}) \propto \gamma(\mathbb{B}' \mid Y_{1:j'}).\]
\end{restatable}
\begin{proof}
First consider that $\mathbb{B}_{u \to v}, \mathbb{B}_{v \to u}$ are pairs of complementary events.
Thus, their probabilities must sum to one by the complement rule.

Now, consider that, if $u$ was last visited on iteration $j'$, it follows that no element of $\mathbb{V}_{v \to u}$ can have been visited since iteration $j'$.
(This follows because every path from a vertex in $\mathbb{V}_{v \to u}$ to $v$ must touch $u$ by definition of a tree.)
Therefore, every partition updated since $\mathcal{P}_u$ was last updated must correspond to a vertex in $\mathbb{V}_{u \to v}$.
Partitions corresponding to vertices in $\mathbb{V}_{u \to v}$ can only influence the blocks of $\mathcal{P}_u$ via $\mathbb{B}_{v \to u}$.
Thus, because $\mathbb{B}_{v \to u} = \cup_{\mathbb{B} \in \mathcal{P}_u \setminus \mathbb{B}_{u \to v}} \mathbb{B}$, direct evidence about $\mathbb{B}_{v \to u}$ changes the probability of each element of $\mathcal{P}_u \setminus \mathbb{B}_{u \to v}$ by the same factor.
\end{proof}

\posterior*

\begin{proof}
Let $N = \max_i | \mathbb{X}_i|$.
If $u$ is a neighbor of $v$, then, using \Cref{lem:bayes}, the posterior over $\mathcal{P}_u$ can be computed in $O(\max_i |\mathbb{X}_i|)$ time.
If $u$ is not a neighbor of $v$, then we can compute the posterior over $\mathcal{P}_u$ by iteratively applying \Cref{lem:bayes} along the path from $v$ to $u$.
Because path length is upper bounded by $O(n)$, the total time is polynomial in $\max_i |\mathbb{X}_i|, n$.
\end{proof}

\subsection{Entropy Upper Bound} \label{app:ent}

\begin{restatable}[Entropy Upper Bound]{lemma}{entropy_upper_bound}
\label{lemma:ent_ub}
Let $\mu$ be a probability distribution over $\kappa$ elements. Fix any element $x^{\ast}$. Then for any $q$ such that $\frac{1}{\kappa} \leq q \leq \mu(x^{\ast})$, we have
\[\mathcal{H}(\mu) \leq 
\begin{cases}
-q\log q - (1 - q) \log \frac{(1 - q)}{\kappa - 1} & q \in [1/\kappa, 1)\\
0 & q = 1.
\end{cases}\]
\end{restatable}

\begin{proof}
First note that if $\mu(x^{\ast}) = 1$ then $\mathcal{H}(\mu) = 0$ and the upper bound holds trivially.

Next, consider the case in which $\mu(x^{\ast}) < 1$.
We will show that this upper bound holds in the case when $q = \mu(x^{\ast})$. We first observe that the entropy is given by
\begin{align*}
    \mathcal{H}(\mu) & = -\sum_{x} \mu(x) \log \mu(x)= -\mu(x^{\ast}) \log \mu(x^{\ast}) - \sum_{x \neq x^{\ast}} \mu(x) \log \mu(x)
\end{align*}
Now, writing $n = \kappa$ for the number of elements in the support of $\mu$, we can consider another probability distribution $\mu'$ over $n - 1$ values (everything except $x^{\ast}$), which is given by $\mu'(x) = \frac{\mu(x)}{1 - \mu(x^{\ast})}, \forall x \neq x^{\ast}$. Since entropy is maximized by a uniform distribution, we have that $\mathcal{H}(\mu') \leq -\log(\frac{1}{n-1})$.

We observe that
\begin{align*}
    \mathcal{H}(\mu')  & = -\sum_{x \neq x^{\ast}} \mu'(x) \log \mu'(x) \\
           & = -\frac{1}{(1 - \mu(x^{\ast}))} \sum_{x \neq x^{\ast}} \mu(x) \log \mu'(x) \\
           & = -\frac{1}{(1 - \mu(x^{\ast}))} \sum_{x \neq x^{\ast}} \mu(x) \Bigl(\log \mu(x) - \log(1 - \mu(x^{\ast}))  \Bigr) \\
           & = -\frac{1}{(1 - \mu(x^{\ast}))} \left[\sum_{x \neq x^{\ast}}\Bigl( \mu(x) \log \mu(x)\Bigr) - (1 - \mu(x^{\ast}))\log(1 - \mu(x^{\ast}))\right]
\end{align*}
Then, plugging this into the inequality for $\mathcal{H}(\mu')$ gives us that
\begin{align*}
    - \sum_{x \neq x^{\ast}} \mu(x) \log \mu(x) & \leq (1 - \mu(x^{\ast}))\left(-\log\left(\frac{1}{n -1} \right) - \log (1 - \mu(x^{\ast}))\right) \\
      & = - (1 - \mu(x^{\ast}))\log\left(\frac{1 - \mu(x^{\ast})}{n - 1} \right)
\end{align*}
Thus, this gives us that 
\begin{align*}
    \mathcal{H}(\mu) \leq -\mu(x^{\ast}) \log \mu(x^{\ast}) - (1 - \mu(x^{\ast})) \log \left( \frac{1 - \mu(x^{\ast})}{n - 1} \right)
\end{align*}
as desired.

Next, we will show that this upper bound decreases in $q$. We can consider taking the derivative of the upper bound with respect to $q$, which gives us that
\begin{align*}
    D_q \left( -q\log q - (1 - q) \log \frac{(1 - q)}{n-1}\right) = -\log q - 1 + \log \frac{(1 - q)}{n-1} + 1 = -\log q + \log \frac{(1 - q)}{n-1}.
\end{align*}
Setting this equal to zero gives us that
\begin{align*}
    \log q - \log \frac{1 - q}{n - 1} & = 0 \\
         \implies q & = \frac{1}{n}.
\end{align*}
Next, we observe that the second derivative of the upper bound with respect to $q$ is given by
\begin{align*}
    D_q D_q \left( -q\log q - (1 - q) \log \frac{(1 - q)}{n-1}\right) = D_q \left(-\log q + \log \frac{(1 - q)}{n-1}\right)
    = \frac{1}{q(q-1)}.
\end{align*}
Thus, this is negative for all values of $0 < q < 1$, which gives us that the upper bound is decreasing in $q$ on the interval $[\frac{1}{n}, 1)$.
Therefore, since it holds for $q = \mu(x^{\ast})$, it must hold for $q \in [1/n, \mu(x^{\ast})]$.
\end{proof}

\maxent*

\begin{proof}
Observe
\begin{align*}
\gamma(\mathbb{B}_{v \to u} \mid Y_{1:j}) &< 1-1/ \kappa\\
\iff - \gamma(\mathbb{B}_{v \to u} \mid Y_{1:j}) &> -1 + 1/\kappa\\
\iff 1- \gamma(\mathbb{B}_{v \to u} \mid Y_{1:j}) &> 1/\kappa\\
\iff \gamma(\mathbb{B}_{u \to v} \mid Y_{1:j}) &> 1/\kappa.
\end{align*}
Fix any $u' \in \mathbb{V}_{v \to u}$.
Then, $\mathbb{B}_{u \to v} \subset \mathbb{B}_{u' \to v}$.
Therefore, we have $\gamma(\mathbb{B}_{u' \to v} \mid Y_{1:j}) \geq \gamma(\mathbb{B}_{u \to v} \mid Y_{1:j})$.
The bound follows from applying Lemma~\ref{lemma:ent_ub}.
\end{proof}

\section{Visualizations} \label{app:vis}

\begin{figure}[H]
    \centering
    \includegraphics[width=\textwidth]{figures/fimec_vis.pdf}
    \caption{Visualization of two iterations of FIMEC.}
    \label{fig:vis_fimec}
\end{figure}

For comparison to ARIMEC, \Cref{fig:vis_fimec} shows two iterations of FIMEC.

\section{Experiments}

\subsection{Maximum-Entropy Partition Search} \label{app:search}

In \Cref{prop:imec}, we demonstrated that instances of IMEC are efficient if and only if the maximum-entropy posterior partition can be computed efficiently. For ARIMEC, we established in \Cref{prop:posterior} that the posterior of individual nodes can be computed efficiently. However, we did not prove that \Cref{alg:maxentpartition} searches through only a polynomial number of nodes, raising concerns about the practical efficiency of ARIMEC. Fortunately, our empirical observations indicate that the search procedure is highly effective.
To illustrate this, in \Cref{fig:search}, we show the number of nodes the search procedure required, on average, to compute the maximum-entropy posterior partition as a function of the number of nodes in the prefix tree for two distributions: GPT-2 and random bytestrings.
We find that, even as the prefix tree grows very large, the average number of nodes touched per iteration remains manageable.


\begin{figure}
    \centering
    \includegraphics[width=0.6\linewidth]{figures/nodes_searched.pdf}
    \caption{Results for number of nodes touched with 95\% bootstrap confidence intervals drawn from 100 samples.}
    \label{fig:search}
\end{figure}

\subsection{Markov Coding Games} \label{app:mcg}

\citet{pmlr-v162-sokota22a} specify Markov coding games as the following setting:
\begin{quote}
An MCG is a tuple $\langle (\mathcal{S}, \mathcal{A}, \mathcal{T}, \mathcal{R}), \mathcal{M}, \mu, \zeta \rangle$, where $(\mathcal{S}, \mathcal{A}, \mathcal{T}, \mathcal{R})$ is a Markov decision process, $\mathcal{M}$ is a set of messages, $\mu$ is a distribution over $\mathcal{M}$ (i.e., the prior over messages), and $\zeta$ is a non-negative real number we call the message priority.
\textbf{An MCG proceeds in the following steps:}
\begin{enumerate}[leftmargin=*, nosep]
    \item First, a message $M \sim \mu$ is sampled from the prior over messages and revealed to the sender.
    \item Second, the sender uses a message conditional policy $\pi_{\mid M}$, which takes states $s \in \mathcal{S}$ and messages $m \in \mathcal{M}$ as input and outputs distributions over MDP actions $\Delta(\mathcal{A})$, to generate a trajectory $Z \sim (\mathcal{T}, \pi_{\mid M})$ from the MDP.
    \item Third, the sender's terminal MDP trajectory $Z$ is given to the receiver as an observation.
    \item Fourth, the receiver uses a terminal MDP trajectory conditional policy $\pi_{\mid Z}$, which takes terminal trajectories $z \in \mathcal{Z}$ as input and outputs distributions over messages $\Delta(\mathcal{M})$, to estimate the message $\hat{M} \sim \pi_{\mid Z}(Z)$.
\end{enumerate}
The objective of the agents is to maximize the expected weighted sum of the return and the accuracy of the receiver's estimate $\mathbb{E} \left[\mathcal{R}(Z) + \zeta \mathbb{I}[M = \hat{M}] \mid \pi_{\mid M}, \pi_{\mid Z} \right]$.
Optionally, in cases in which a reasonable distance function is available, we allow for the objective to be modified to minimizing the distance between the message and the guess $d(M, \hat{M})$, rather than maximizing the probability that the guess is correct.
\end{quote}

\subsection{MEME} \label{app:meme}

\citet{pmlr-v162-sokota22a} specify MEME as follows:

\begin{quote}
\textbf{Step One: Maximum Entropy Reinforcement Learning} In the first step, MEME uses MaxEnt RL to construct an MDP policy $\pi$.
This policy is an MDP policy, not an MCG policy, and therefore does not depend on the message.
Note that this policy depends on the choice of temperature $\alpha$ used for the MaxEnt RL algorithm.

\textbf{Step Two: Minimum Entropy Coupling} 
In the second step, at execution time, MEME constructs a message-conditional policy online using MECs.
Say that, up to time $t$, the sender is in state $s^t$, history $h^t$ and has played according to the state and message conditional policy $\pi_{\mid M}^{:t}$ so far.
Let 
\[b^t = \mathcal{P}(M \mid h^t, \pi_{\mid M}^{:t})\] be the posterior over the message, conditioned on the history and the historical policy.
MEME performs a MEC between the posterior over the message $b^t$ and distribution over actions $\pi(s^t)$, as determined by the MDP policy.
Let $\nu = \text{MEC}(b^t, \pi(s^t))$ denote joint distribution over messages and actions resulting from the coupling.
Then MEME sets the sender to act according to the message conditional distribution 
\[\pi^t_{\mid M}(s^t, m) = \nu(A^t \mid M=m)\] 
of the coupling distribution $\nu = \text{MEC}(b^t, \pi(s^t))$.

Given the sender's MDP trajectory, MEME's receiver uses the sender's MDP policy and MEC procedure to reconstruct the sender's message conditional policy along the trajectory;
thereafter, the receiver computes the posterior and guesses the maximum a posteriori (MAP) message.
\end{quote}

\subsection{Information-Theoretic Steganography} \label{app:its}

\citet{witt2023perfectly} summarize \citet{cachin_perfect}'s information-theoretic steganography setting as follows:
\begin{quote}
\textbf{Problem Setting} The objects involved in information-theoretic steganography can be divided into two classes: those which are externally specified and those which require algorithmic specification.
Each class contains three objects.
The externally specified objects include the distribution over plaintext messages $\mathcal{M}$, the distribution over covertext $\mathcal{C}$, and the random source generator.
\begin{itemize}[leftmargin=*]
    \item The distribution over plaintext messages $\mathcal{M}$ may be known by the adversary, but is not known by the sender or the receiver.
    However, the sender and receiver are aware of the domain $\mathbb{M}$ over which $\mathcal{M}$ ranges.
    The sampled plaintext message $M$ is explicitly known by the sender, but not to the receiver or the adversary.
    \item The covertext distribution $\mathcal{C}$ is assumed to be known by the sender, the receiver, and the adversary.
    \item The random source generator provides the sender with a mechanism to take random samples from distributions.
    This random source is known to the sender but not to the receiver or adversary.
\end{itemize}

The objects requiring algorithmic specification, which are collectively referred to as a stegosystem, are the key generator, the encoder, and the decoder.
\begin{itemize}[leftmargin=*]
    \item The key generator produces a private key $K$ in the form of a binary string.
    This private key is shared between the sender and receiver over a secure channel prior to the start of the stegoprocess and can be used to coordinate encryption and decryption.
    The key generation process may be known to the adversary, but the realization of the key $K$ is not.
    \item The encoder takes a private key $K$, a plaintext message $M$, and a source of randomness $R$ as input and produces a stegotext $S$ in the space of covertexts $\mathbb{C}$.
    \item The decoder takes a private key $K$ and a stegotext $S$ as input and returns an estimated plaintext message $\hat{M}$.
\end{itemize}
\end{quote}


They specify the following objectives and methodological outline for the setting:
\begin{quote}
\begin{definition} \label{def:perf}
\citep{cachin_perfect} Given covertext distribution $\mathcal{C}$ and plaintext message space $\mathbb{M}$, a stegosystem is $\epsilon$-secure against passive adversaries if the KL divergence between the distribution of covertext $\mathcal{C}$ and the distribution of stegotext $\mathcal{S}$ less than $\epsilon$; i.e., $\text{KL}(\mathcal{C}, \mathcal{S}) < \epsilon$.
It is perfectly secure if the KL divergence is zero; i.e., $\text{KL}(\mathcal{C}, \mathcal{S}) = 0$.
\end{definition}

In other words, a steganographic system is perfectly secure if the distribution of stegotext $\mathcal{S}$ communicated by the sender is exactly the same as the distribution of covertext $\mathcal{C}$.

In addition to security, it is desirable for stegosystems to transmit information efficiently.
Mutual information between messages and stegotexts is one way to quantify efficiency.
\begin{definition} \label{def:mi}
The mutual information $\mathcal{I}(M; S) = \mathcal{H}(M) - \mathcal{H}(M \mid S)$ between the message $M$ and stegotext $S$ is the expected amount of uncertainty in the message $M$ that is removed by conditioning on the stegotext $S$.
\end{definition}

\textbf{Methodological Outline} A common class of stegosystems uses two-step encoding and two-step decoding processes, as described below:
\begin{enumerate}[leftmargin=*]
    \item \label{step:encrypt} The sender uses the private key $K$ to injectively map the plaintext message $M$ into ciphertext $\mathbb{X} = \{0, 1\}^{\ell}$ in such a way that the induced distribution over ciphertext $\mathcal{X}$ is uniformly random, regardless of the distribution of $\mathcal{M}$.\footnote{For example, if $K$ is drawn from a uniform random distribution, $\text{bin}(M)$ denotes a deterministic binarization of $M$, and $\text{XOR}$ represents the component-wise exclusive-or function, then $X=\text{XOR}(\text{bin}(M), K)$ is guaranteed to be distributed uniformly randomly, regardless of the distribution of messages~$\mathcal{M}$. \label{fn:enc}}
    \item \label{step:encode} The sender uses a (potentially stochastic) mapping $f \colon \mathbb{X} \rightsquigarrow \mathbb{C}$ to transform the ciphertext $X$ into stegotext $S$ (which exists in the space of covertexts $\mathbb{C}$).
    \item \label{step:estimate} The receiver estimates the ciphertext $\hat{X}$ from the stegotext $S$.
    \item \label{step:decrypt} The receiver inverts the estimated ciphertext $\hat{X}$ to a plaintext message $\hat{M}$ with private key~$K$.\footnote{For the example in footnote \ref{fn:enc}, the receiver can recover the binarized message $\text{bin}(M)$ using the mapping $X \mapsto \text{XOR}(X, K)$ and invert the binarization to recover the plaintext $M$.}
\end{enumerate}
\end{quote}
Given the definition below, \citet{witt2023perfectly} show the following guarantees:
\begin{quote}
\begin{definition}
We say that an encoding procedure $f \colon \mathbb{X} \rightsquigarrow \mathbb{C}$ is induced by a coupling if there exists $\gamma \in \Gamma(\mathcal{X}, \mathcal{C})$ such that for all $x \in \mathbb{X}, c \in \mathbb{C}, \mathcal{P}(f(x){=}c) = \gamma(C{=}c \mid X{=}x)$. 
\end{definition}
\begin{theorem} \label{prop:coupling}
A steganographic encoding procedure is perfectly secure if and only if it is induced by a coupling.
\end{theorem}

\begin{theorem} \label{prop:mec}
Among perfectly secure encoding procedures, a procedure $f \colon \mathbb{X} \rightsquigarrow \mathbb{C}$ maximizes the mutual information $\mathcal{I}(M; S)$ if and only if $f$ is induced by a minimum entropy coupling.
\end{theorem}
\end{quote}

\end{document}
