\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
    \usepackage{url}
% \def\UrlBreaks{\do\/\do-} % needed to break up long links in bib
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
% \usepackage{caption}

% \captionsetup[table]{labelformat=empty}

% Is this allowed?
\usepackage{tablefootnote}
\usepackage{enumitem}

\usepackage{graphicx}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{ulem}

\usepackage{hyperref}
\usepackage{float}
% \usepackage{subfig}

% \usepackage[top=0.25inch, bottom=0.25inch]{geometry}
% MARGINS HAVE BEEN CHANGED IN THE STYLE FILE


\usepackage{caption}
\usepackage{subcaption}  % <----
% \captionsetup[subfigure]{labelformat=empty}
% Expectation operator: \E typesets a blackboard-bold E; the starred
% declaration places sub/superscripts as limits in display style.
\DeclareMathOperator*{\E}{\mathbb{E}}


\usepackage{color}
% Colored inline text: \JA{...} prints its argument in blue, \ST{...} in red.
% NOTE(review): presumably per-author annotation macros (initials) -- confirm.
\newcommand{\JA}[1]{{\color{blue}#1}}
\newcommand{\ST}[1]{{\color{red}#1}}



% \bibliographystyle{abbrvnat}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theorem-like environments (amsthm). `theorem` is numbered within each
% section; every other environment below reuses the `theorem` counter, so
% theorems, lemmas, definitions, etc. share one running number per section.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Definition-style environments.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Remark-style environments.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

% Manually-numbered theorem environments.
% `innercustomgeneric` is a plain-style theorem whose printed name is whatever
% \customgenericname currently expands to (initially empty).
% \newcustomtheorem{<env>}{<Name>} defines an environment <env> taking one
% mandatory argument; on entry it sets the printed name to <Name> and the
% printed number (\theinnercustomgeneric) to that argument, then opens
% `innercustomgeneric`.
% Usage: \begin{customthm}{4.1} ... \end{customthm}
\theoremstyle{plain}
\newtheorem{innercustomgeneric}{\customgenericname}
\providecommand{\customgenericname}{}
\newcommand{\newcustomtheorem}[2]{%
  \newenvironment{#1}[1]
  {%
   \renewcommand\customgenericname{#2}%
   \renewcommand\theinnercustomgeneric{##1}%
   \innercustomgeneric
  }
  {\endinnercustomgeneric}
}

% Instances: custom-numbered Theorem, Lemma and Corollary environments.
\newcustomtheorem{customthm}{Theorem}
\newcustomtheorem{customlemma}{Lemma}
\newcustomtheorem{customcor}{Corollary}


% Calligraphic-letter shorthands. In the text, \s \times \A is the domain of
% the Q functions (state and action spaces) and \Q denotes an arbitrary
% function on that domain.
% NOTE(review): \TL expands to \mathcal{F}, not \mathcal{T}\mathcal{L} -- confirm intended.
\newcommand{\s}{\mathcal{S}}
\newcommand{\A}{\mathcal{A}}
\newcommand{\M}{\mathcal{M}}
\newcommand{\T}{\mathcal{T}}
\newcommand{\TL}{\mathcal{F}}
\newcommand{\Q}{\mathcal{Q}}
% \overbar{x}: an overline whose bar is shortened by 1.5mu on each side; the
% outer kerns restore the original width of the argument.
\newcommand{\overbar}[1]{\mkern 1.5mu\overline{\mkern-1.5mu#1\mkern-1.5mu}\mkern 1.5mu}



\title{Bounding the Optimal Value Function \\in Compositional Reinforcement Learning\\(Supplementary Material)}

% Add authors
\author[1]{\href{mailto:<jacob.adamczyk001@umb.edu>?Subject=Your UAI 2023 paper}{Jacob~Adamczyk}{}}
% \author[1]{Jacob~Adamczyk}
\author[2]{Volodymyr~Makarenko}
\author[1]{Argenis~Arriojas}
\author[2]{Stas~Tiomkin}
\author[1]{Rahul~V.~Kulkarni}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
Department of Physics\\
University of Massachusetts Boston\\
Boston, MA, USA
}
\affil[2]{%
    Department of Computer Engineering\\
    San Jos\'e State University\\
    San Jos\'e, CA, USA
}
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
\begin{document}
\onecolumn
\maketitle


\section{Introduction}
In the following, we discuss the results of additional experiments in the four room domain. In these experiments, we want to answer the following questions:
\begin{itemize}
    \item How do the optimal policies and value functions compare to those calculated from the zero-shot approximations using the derived bounds?
    \item What are other examples of compositions and functional transformations that can be analyzed using our approach?
    \item Does warmstarting (using the derived bounds for initialization) in the tabular case improve the convergence?

\end{itemize}

To address these questions, we modify OpenAI's frozen lake environment~\citep{openAI} to allow for stochastic dynamics.

In the tabular experiments, numerical solutions for the optimal $Q$ functions were obtained by solving the Bellman backup equations iteratively. Iterations are considered converged once the maximum difference between successive iterates is less than $10^{-10}$.

% The figures in the proceeding sections provide the following information: the transformed (composite) reward function, the zero-shot approximation for the $Q$ function (based on the derived results, using the known solutions for the primitive tasks), and the optimal solution  for the $Q$ function (obtained by directly solving the model with the transformed reward function). The results show that the zero-shot approximation and the derived policy are quite close to the optimal solutions. 

Beyond the motivating example shown in the main text, we have included video files demonstrating a full range of zero-shot compositions with convex weights between the Bottom Left (BL) room and Bottom Right (BR) room subtasks, in both entropy-regularized ($\beta=5$) and standard RL with deterministic dynamics. These videos, along with all code for the above experiments, are made publicly available in a repository at \url{https://github.com/JacobHA/Q-Bounding-in-Compositional-RL}.


\section{Experiments}
\subsection{Function Approximators}

For function approximator experiments (as shown in the main text), we use the DQN implementation from Stable-Baselines3~\citep{stable-baselines3}. We first fully train the subtasks (seen in Fig.~1 of the main text). Then, we perform hyperparameter sweeps for each possible clipping option. Several hyperparameters are kept fixed (Table~\ref{tab:shared}), and we sweep with the range and distribution shown below in Table~\ref{tab:sweep}. Finally, we use the optimal hyperparameters (those which maximize the accumulated reward throughout training). These values are shown in Table~\ref{tab:optimal}.
\clearpage
\begin{center}
    \captionof{table}{Hyperparameters shared by all Deep Q Networks}
    \begin{tabular}{||c c||}
        \hline
        Hyperparameter              & Value     \\ [0.5ex]
        \hline\hline
        Buffer Size                 & 1,000,000 \\
        \hline
        Discount factor, $\gamma$   & 0.99      \\
        \hline
        $\epsilon_{\text{initial}}$ & 1.0       \\
        \hline
        $\epsilon_{\text{final}}$   & 0.05      \\
        \hline
        ``learning starts''         & 5,000     \\
        \hline
        Target Update Interval      & 10,000    \\ [1ex]
        \hline
    \end{tabular}
    \label{tab:shared}
\end{center}

\begin{center}
    \captionof{table}{Hyperparameter Ranges Used for Finetuning}
    \begin{tabular}{||c c c c||}
        \hline
        Hyperparameter        & Sampling Distribution & Min. Value & Max. Value \\ [0.5ex]
        \hline
        Learning Rate         & Log Uniform           & $10^{-4}$  & $10^{-1}$  \\
        \hline
        Batch Size            & Uniform               & $32$       & $256$      \\
        \hline
        Exploration Fraction  & Uniform               & $0.1$      & $0.3$      \\
        \hline
        Polyak Update, $\tau$ & Uniform               & $0.5$      & $1.0$      \\ [1ex]
        \hline
    \end{tabular}
    \label{tab:sweep}
\end{center}



\begin{center}
    \captionof{table}{Hyperparameters used for different clipping methods}
    \begin{tabular}{||c c c c c||}
        \hline
        Hyperparameter        & None                 & Soft                 & Hard                 & Soft-Hard            \\ [0.5ex]
        \hline
        Learning Rate         & $7.825\times10^{-4}$ & $3.732\times10^{-3}$ & $1.457\times10^{-3}$ & $3.184\times10^{-3}$ \\
        \hline
        Batch Size            & 245                  & 247                  & 146                  & 138                  \\
        \hline
        Exploration Fraction  & 0.137                & 0.1075               & 0.1243               & 0.1207               \\
        \hline
        Polyak Update, $\tau$ & 0.9107               & 0.9898               & 0.5545               & 0.7682               \\ [1ex]
        \hline
    \end{tabular}
    \label{tab:optimal}
\end{center}


\subsection{Tabular experiments}
In these experiments we demonstrate, on simple discrete environments, the effect of increasingly stochastic dynamics and increasingly dense rewards. As a proxy for measuring the usefulness or accuracy of the bound $f(Q)$, we calculate the mean of the difference $f\left(Q(s,a)\right)-\widetilde{Q}(s,a)$, as well as the mean Kullback-Leibler (KL) divergence between $\pi$ (the true optimal policy) and $\pi_f$, the policy derived from the bound $f(Q)$. The following experiments are situated in the entropy-regularized formalism (unless $\beta=\infty$ as shown in Fig.~\ref{fig:beta_inf_stoch_expt}) with the uniform prior policy $\pi_0(a|s) = 1/ |\mathcal{A}|$.

\subsubsection{Stochasticity of Dynamics}
In this experiment, we investigate the effect of stochastic dynamics on the bounds. Specifically, we vary the probability that taking an action will result in the intended action. This is equivalent to a slip probability.

\begin{figure}[ht]
    \centering
    \subfloat[Task 1]{{\includegraphics[width=5cm]{figures/subtask_5x5A.pdf} }}
    % \qquad
    \subfloat[Task 2]{{\includegraphics[width=5cm]{figures/subtask_5x5B.pdf} }}
    % \qquad
    \subfloat[Task 3: AND composition]{{\includegraphics[width=5cm]{figures/subtask_5x5C.pdf} }}
    % \qquad
    \caption{Reward functions for a simple maze domain; used for stochasticity experiments. We place reward (whose cost is half the default step penalty of $-1$) at the edges of the room, denoted by an orange diamond. }
    \label{fig:stochastic_desc}
\end{figure}
We notice in the following plots that at near-deterministic dynamics the bound becomes tighter. We also remark that the Kullback-Leibler divergence is lowest in very highly-stochastic environments. This is because for any $\beta>0$, the cost of changing the policy $\pi$ away from the prior policy is not worth it: the dynamics are so stochastic that there will be no considerable difference in trajectories even if significant controls (nearly deterministic choices) are applied via $\pi$.

\begin{figure}[ht]
    \centering
    \subfloat[]{{\includegraphics[width=7cm]{figures/b1.pdf} }}
    % \qquad
    \subfloat[]{{\includegraphics[width=7cm]{figures/b1kl.pdf} }}

    \caption{$\beta=1$ KL divergence between $\pi$ and $\pi_f$ and average difference between optimal $Q$ function and presented bound.}
    \label{fig:b1}
\end{figure}

\begin{figure}[ht]
    \centering
    \subfloat[]{{\includegraphics[width=8cm]{figures/b3.pdf} }}
    % \qquad
    \subfloat[]{{\includegraphics[width=7cm]{figures/b3kl.pdf} }}

    \caption{$\beta=3$ KL divergence between $\pi$ and $\pi_f$ and average difference between optimal $Q$ function and presented bound.}
    \label{fig:b3}
\end{figure}

\begin{figure}[ht]
    \centering
    \subfloat[]{{\includegraphics[width=7cm]{figures/b5.pdf} }}
    % \qquad
    \subfloat[]{{\includegraphics[width=7cm]{figures/b5kl.pdf} }}

    \caption{$\beta=5$ KL divergence between $\pi$ and $\pi_f$ and average difference between optimal $Q$ function and presented bound.}
    \label{fig:b5}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=12cm]{figures/binf.pdf}

    \caption{$\beta=\infty$, standard RL.
        Average difference between optimal $Q$ function and presented bound. Note that we do not plot a KL divergence for this case as $\pi$ is greedy and hence the divergence is always infinite.}
    \label{fig:beta_inf_stoch_expt}
\end{figure}

\clearpage
% \newpage

\subsubsection{Sparsity of Rewards}
In this experiment, we consider an empty environment ($|S|\times |S|$ empty square) with reward $r=0$ everywhere and deterministic dynamics. No other rewards or obstacles are present. We then fix an integer $0<n<|S|$. Drawing randomly (without repetition), we choose $n$ of the states of the environment to grant a reward, each drawn uniformly from $(0, 1)$. We do this again for another copy of the empty environment.

We then compose these two (randomly generated as described) subtasks by using a simple average $F(r^{(1)}, r^{(2)}) = 0.5r^{(1)} + 0.5 r^{(2)}$. We have used $\beta=5$ for all experiments in this subsection.
\begin{figure}[ht]
    \centering
    \subfloat[]{{\includegraphics[width=7cm]{figures/6x6_norm.pdf} }}
    % \qquad
    \subfloat[]{{\includegraphics[width=7cm]{figures/6x6_kld.pdf} }}

    \caption{$6\times 6$ environment. KL divergence between $\pi$ and $\pi_f$ and average difference between optimal $Q$ function and presented bound, with the shaded region representing one standard deviation over 1000 runs.}
    \label{fig:6x6}
\end{figure}


\begin{figure}[ht]
    \centering
    \subfloat[]{{\includegraphics[width=7cm]{figures/10x10_norm.pdf} }}
    % \qquad
    \subfloat[]{{\includegraphics[width=7cm]{figures/10x10_kld.pdf} }}

    \caption{$10\times 10$ environment. KL divergence between $\pi$ and $\pi_f$ and average difference between optimal $Q$ function and presented bound, with the shaded region representing one standard deviation over 1000 runs.}
    \label{fig:10x10}
\end{figure}
Interestingly, we find a somewhat universal behavior, in that there is a certain level of density which makes the bound a poor approximation to the true $Q$ function. We also note that the bound is a better approximation at low densities.


\clearpage
\newpage
\section{Boolean Composition Definitions}
In this section, we explicitly define the action of Boolean operators on subtask reward functions. These definitions are similar to those used by \cite{boolean}.

\begin{definition}[OR Composition]
    Given subtask rewards $\{r^{(1)}, r^{(2)}, \dotsc , r^{(M)} \}$, the OR composition among them is given by the \textit{maximum} over all subtasks, at each state-action pair:
    \begin{equation}
        r^{(\text{OR})}(s,a) = \max_k r^{(k)}(s,a).
    \end{equation}
\end{definition}

\begin{definition}[AND Composition]
    Given subtask rewards $\{r^{(1)}, r^{(2)}, \dotsc , r^{(M)} \}$, the AND composition among them is given by the \textit{minimum} over all subtasks, at each state-action:
    \begin{equation}
        r^{(\text{AND})}(s,a) = \min_k r^{(k)}(s,a).
    \end{equation}
\end{definition}

\begin{definition}[NOT Gate]
    Given a subtask reward function $r$, applying the NOT gate transforms the reward function by negating all rewards (i.e. rewards $\to$ costs):
    \begin{equation}
        r^{(\text{NOT})}(s,a) = - r(s,a).
    \end{equation}
    % where $r_{\text{max}}$ is added to ensure non-negativity of rewards. 
\end{definition}


The proofs in all subsequent sections follow an inductive form based on the Bellman backup equation, whose solution converges to the optimal $Q$ function. This approach is similar to that employed by \cite{Haarnoja2018} and \cite{hunt_diverg}, but extended to all applicable functions rather than only (linear) convex combinations.

\section{Proofs for Standard RL}


Let $X$ be the codomain for the $Q$ function of the primitive task ($Q: \s \times \A \to X \subseteq \mathbb{R}$).

\begin{lemma}[Convex Conditions]\label{thm:convex_cond_std}
    Given a primitive task with discount factor $\gamma$ and a bounded, continuous transformation function $f~:~X~\to~\mathbb{R}$ which satisfies:
    \begin{enumerate}
        \item $f$ is convex on its domain $X$ (for stochastic dynamics);
        \item $f$ is sublinear:
              \begin{enumerate}[label=(\roman*)]
                  \item $f(x+y) \leq f(x) + f(y)$ for all $x,y \in X$
                  \item $f(\gamma x) \leq \gamma f(x)$ for all $x \in X$ % and all %$\lambda \in (0,1)$,
              \end{enumerate}
        \item $f\left( \max_{a} \mathcal{Q}(s,a) \right) \leq \max_{a}~f\left( \mathcal{Q}(s,a) \right)$ for all $\mathcal{Q}: \s \times \A \to \mathbb{R}.$
    \end{enumerate}

    then the optimal action-value function for the transformed rewards, $\widetilde{Q}$, is now related to the optimal action-value function with respect to the original rewards  by:

    \begin{equation}\label{eqn:convex_std}
        f(Q(s,a)) \leq \widetilde{Q}(s,a) \leq f(Q(s,a)) + C(s,a)
    \end{equation}

    where $C$ is the optimal value function for a task with reward
    \begin{equation}\label{eq:std_convex_C_def}
        r_C(s,a) = f(r(s,a)) + \gamma \mathbb{E}_{s'} V_f(s') - f(Q(s,a)).
    \end{equation}

    % for all states $s \in \s$ and actions $a \in \A$.
\end{lemma}



\begin{proof}
    We will prove all inequalities by induction on the number of backup steps, $N$. We start with the lower bound $\widetilde{Q} \ge f(Q)$. The base case, $N=1$, is trivial since $f(r(s,a))=f(r(s,a))$. The inductive hypothesis is $\widetilde{Q}^{(N)}(s,a) \geq f(Q^{(N)}(s,a))$ for some $N \geq 1$.
    In the case of standard RL, the Bellman backup equation for transformed rewards is given by:

    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) = f\left(r(s,a)\right) + \gamma \mathbb{E}_{s' \sim{} p(s'|s,a)} \max_{a'} \widetilde{Q}^{(N)}(s',a')
    \end{equation}
    Using the inductive assumption,
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) \geq f\left(r(s,a)\right) +
        \gamma \mathbb{E}_{s' \sim{} p(s'|s,a)} \max_{a'} f \left({Q}^{(N)}(s',a') \right)
    \end{equation}

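    The third condition of the lemma, $\max_{a} f\left(\mathcal{Q}(s,a)\right) \ge f\left(\max_{a} \mathcal{Q}(s,a)\right)$ (i.e.\ $V_f(s) \ge f(V(s))$), is used on the right-hand side to give: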
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) \geq f\left(r(s,a)\right) + \gamma \mathbb{E}_{s' \sim{} p(s'|s,a)} f \left( \max_{a'} {Q}^{(N)}(s',a') \right)
    \end{equation}
    Since $f$ is convex, we use Jensen's inequality to factor it out of the expectation. Note that this condition on $f$ is only required for stochastic dynamics. The error introduced by swapping these operators is characterized by the ``Jensen's gap'' for the transformation function $f$.
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) \geq f\left(r(s,a)\right) + \gamma f \left( \mathbb{E}_{s' \sim{} p(s'|s,a)}  \max_{a'} {Q}^{(N)}(s',a') \right)
    \end{equation}
    Finally, using both sublinearity conditions
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) \geq f\left(r(s,a) + \gamma \mathbb{E}_{s' \sim{} p(s'|s,a)}  \max_{a'} {Q}^{(N)}(s',a') \right)
        \label{eq:pf:last_line_induction}
    \end{equation}

    where the right-hand side is simply $f(Q^{(N+1)}(s,a))$. Since this inequality holds for all $N$, we take the limit $N \to \infty$ wherein $Q^{(N)}$ converges to the optimal $Q$-function. For the right-hand side of Eq. \eqref{eq:pf:last_line_induction}, we thus have (by continuity of $f$):

    \begin{equation}
        \lim_{N \to \infty} f\left(Q^{(N)}(s,a)\right) =  f\left(\lim_{N \to \infty}Q^{(N)}(s,a)\right) = f(Q(s,a))
    \end{equation}
    where $Q(s,a)$ is the optimal action value function for the primitive task.
    Combined with the limit of the left-hand side, we arrive at the desired inequality:
    \begin{equation}
        \widetilde{Q}(s,a) \geq f\left(Q(s,a)\right).
    \end{equation}

    This completes the proof of the lower bound. To prove the upper bound we again use induction on the backup equation of $\widetilde{Q}^{(N)}$. We wish to show $\widetilde{Q}^{(N)}(s,a) \le f\left(Q(s,a)\right) + C^{(N)}(s,a)$ holds for all $N$, with the definition of $C$ provided in Lemma~\ref{thm:convex_cond_std}.

    Let $f$ satisfy the convex conditions.
    Consider the backup equation for $\widetilde{Q}$.
    Again, the base case ($N=1$) is trivially satisfied with equality. Using the inductive assumption, we find

    \begin{align*}
        \widetilde{Q}^{(N+1)}(s,a) & = f(r(s,a)) + \gamma \E_{s'} \max_{a'} \widetilde{Q}^{(N)}(s',a')
        \\
        % \widetilde{Q}^{(k+1)}_i 
                                   & \le f(r(s,a)) + \gamma \E_{s'} \max_{a'} \left( f(Q(s',a')) + C^{(N)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}_i 
                                   & \le f(r(s,a)) + \gamma \E_{s'} \max_{a'} f(Q(s',a')) + \gamma \E_{s'} \max_{a'}C^{(N)}(s',a')
        \\
        % \widetilde{Q}^{(k+1)}_i 
                                   & = f(Q(s,a)) + \left[ f(r(s,a)) + \gamma \E_{s'} V_f(s') - f(Q(s,a)) \right] + \gamma \E_{s'} \max_{a'}C^{(N)}(s',a')
        \\
        % \widetilde{Q}^{(k+1)}_i 
                                   & = f(Q(s,a)) +  C^{(N+1)}(s,a)
    \end{align*}
\end{proof}

At this point, we verify that $C(s,a) \geq 0$, which ensures the double-sided bounds above are valid.

To do so, we can simply bound the reward function $r_C(s,a)$. Showing that $r_C(s,a) \geq 0$ ensures $C(s,a) \geq \min r_C / (1-\gamma) \geq 0$.
\begin{align*}
    r_C(s,a) & = f(r(s,a)) + \gamma \mathbb{E}_{s'} V_f(s') - f(Q(s,a))     \\
             & \geq f(r(s,a)) + \gamma \mathbb{E}_{s'} f(V(s')) - f(Q(s,a)) \\
             & \geq f(r(s,a)) + f(\gamma \mathbb{E}_{s'} V(s')) - f(Q(s,a)) \\
             & \geq f(r(s,a) + \gamma \mathbb{E}_{s'} V(s')) - f(Q(s,a))    \\
             & \geq 0
\end{align*}
where each line follows from the required conditions in Lemma \ref{thm:convex_cond_std}. A similar proof holds for showing the quantities $\hat{C}$, $D$, $\hat{D}$ are all non-negative.


We now prove the policy evaluation bound for standard RL.

\begin{lemma}
    %Moreover, when evaluating the 
    Consider the value of the policy $\pi_f(s) = \operatorname*{arg\,max}_{a} f(Q(s,a))$ on the transformed task of interest, denoted by $\widetilde{Q}^{\pi_f}(s,a)$. %, the value of this policy 
    The sub-optimality of $\pi_f$ is then upper bounded by:
    \begin{equation}
        \widetilde{Q}(s,a) - \widetilde{Q}^{\pi_f}(s,a) \leq D(s,a)
    \end{equation}
    where $D$ is the value of the policy $\pi_f$ in a task with reward
    \begin{align*}
        r_D(s,a) = \gamma \mathbb{E}_{s',a' \sim{} \pi_f} \biggr[ \max_{a''} \big( f(Q(s',a'')) + C(s',a'') \big) - f(Q(s',a')) \biggr]
        % D(s,a) \xleftarrow{} &\gamma \mathbb{E}_{s' \sim{} p}\mathbb{E}_{a' \sim{} \pi_f} \biggr[ \max_{a} \big( f(Q(s',a')) + \\
        % &\hat{C}(s',a') \big) - f(Q(s,a)) + D(s',a') \biggr]
    \end{align*}
\end{lemma}

% Shorthands used in the policy-evaluation proofs below:
%   \qpif  -- \widetilde{Q}^{\pi_f} evaluated at (s,a)
%   \qpifp -- \widetilde{Q}^{\pi_f} evaluated at (s',a')
%   \fr    -- the transformed reward f(r(s,a))
\def\qpif{\widetilde{Q}^{\pi_f}(s,a)}
\def\qpifp{\widetilde{Q}^{\pi_f}(s',a')}

\def\fr{f\left(r(s,a)\right)}
\begin{proof}
    We will again prove this bound by induction on steps in the Bellman backup equation for the value of $\pi_f$, as given by the following fixed point equation:

    \begin{equation}
        \qpif = \fr + \gamma \E_{s', a' \sim{} \pi_f}\qpifp
    \end{equation}

    We consider the following initial conditions: $ \widetilde{Q}^{\pi_f(0)}(s,a) = \widetilde{Q}(s,a), D^{(0)}(s,a)=0$.
    We note that there is freedom in the choice of initial conditions, as the final statement (regarding the optimal value functions) holds regardless of initialization. As usual, the base case is trivially satisfied. We will now show that the equivalent inequality
    \begin{equation}
        \widetilde{Q}^{\pi_f (N)}(s,a) \ge \widetilde{Q}(s,a) - D^{(N)}(s,a)
    \end{equation}
    holds for all $N$. Similar to the previous proofs, we will subsequently take the limit $N \to \infty$ to recover the desired result.

    To do so, we consider the next step of the Bellman backup, and apply the inductive hypothesis:
    \begin{align}
        \widetilde{Q}^{\pi_f (N+1)}(s,a) & = \fr + \gamma \E_{s', a' \sim{} \pi_f}\left( \widetilde{Q}^{\pi_f (N)}(s',a') \right)                                                                                         \\
                                         & \geq \fr + \gamma \E_{s', a' \sim{} \pi_f}\left( \widetilde{Q}(s',a') - D^{(N)}(s',a') \right)                                                                                 \\
                                         & \geq \fr + \gamma \E_{s', a' \sim{} \pi_f}\left( f\left(Q(s',a')\right) - D^{(N)}(s',a') \right)                                                                               \\
                                         & = \fr + \gamma \E_{s'} \widetilde{V}(s') + \gamma \E_{s', a' \sim{} \pi_f}\left( f\left(Q(s',a')\right) - D^{(N)}(s',a') - \widetilde{V}(s') \right)                           \\
                                         & \geq \widetilde{Q}(s,a) + \gamma \E_{s', a' \sim{} \pi_f}\left( f\left(Q(s',a')\right) - D^{(N)}(s',a') - \max_{a'} \left\{ f\left(Q(s',a')\right) + C(s',a') \right\} \right) \\
                                         & = \widetilde{Q}(s,a) - \gamma \E_{s', a' \sim{} \pi_f}\left( \max_{a'} \left\{ f\left(Q(s',a')\right) + C(s',a') \right\}  - f\left(Q(s',a')\right) + D^{(N)}(s',a')\right)    \\
                                         & = \widetilde{Q}(s,a) - \left( r_D(s,a) + \gamma \E_{s',a' \sim{} \pi_f} D^{(N)}(s',a') \right)                                                                                 \\
                                         & = \widetilde{Q}(s,a) - D^{(N+1)}(s,a)
    \end{align}
    The third and fifth lines follow from the previous bounds (Lemma~\ref{thm:convex_cond_std}). In the limit $N \to \infty$, we can thus see that the fixed point $D$ corresponds to the policy evaluation for $\pi_f$ in an environment with reward function $r_D$.
\end{proof}


Now we prove similar results, but for the ``concave conditions'' presented in the main text.
\begin{lemma}[Concave Conditions]\label{thm:concave_cond_std}
    Given a primitive task with discount factor $\gamma$ and a bounded, continuous transformation function $f~:~X~\to~\mathbb{R}$ which satisfies:
    \begin{enumerate}
        \item $f$ is concave on its domain $X$ (for stochastic dynamics);
        \item $f$ is superlinear:
              \begin{enumerate}[label=(\roman*)]
                  \item $f(x+y) \geq f(x) + f(y)$ for all $x,y \in X$
                  \item $f(\gamma x) \geq \gamma f(x)$ for all $x \in X$ %and all $\lambda \in (0,1)$
              \end{enumerate}
        \item $f\left( \max_{a} \mathcal{Q}(s,a) \right) \geq \max_{a}~f\left( \mathcal{Q}(s,a) \right)$ for all functions $\mathcal{Q}:~\s~\times~\A \to X.$
    \end{enumerate}

    then the optimal action-value functions are now related in the following way:
    \begin{equation}\label{eqn:concave_std}
        f(Q(s,a)) - \hat{C}(s,a) \leq \widetilde{Q}(s,a) \leq f(Q(s,a))
    \end{equation}

    where $\hat{C}$ is the optimal value function for a task with reward
    \begin{equation}
        \hat{r}_C(s,a) = f(Q(s,a)) - f(r(s,a)) - \gamma \E_{s'} V_f(s')
    \end{equation}
    % for all states $s \in \s$ and actions $a \in \A$.
\end{lemma}

\begin{proof}
    The proof of $\widetilde{Q} \le f(Q)$ is the same as that of the lower bound in Lemma~\ref{thm:convex_cond_std}, but with all inequalities reversed. To prove the lower bound involving $\hat{C}$, we use a similar approach:
    \begin{align*}
        \widetilde{Q}^{(N+1)}(s,a) & = f(r(s,a)) + \gamma \E_{s'} \max_{a'} \widetilde{Q}^{(N)}(s',a')
        \\
        % \widetilde{Q}^{(k+1)}_i 
                                   & \ge f(r(s,a)) + \gamma \E_{s'} \max_{a'} \left( f(Q(s',a')) - \hat{C}^{(N)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}_i 
                                   & \ge f(r(s,a)) + \gamma \E_{s'} \left( \max_{a'} f(Q(s',a')) - \max_{a'} \hat{C}^{(N)}(s',a') \right)
        \\
        % \widetilde{Q}^{(k+1)}_i 
                                   & = f(Q(s,a)) - \left[f(Q(s,a)) - f(r(s,a)) - \gamma \E_{s'} V_f(s') + \gamma \E_{s'} \max_{a'} \hat{C}^{(N)}(s',a')\right]
        \\
                                   & = f(Q(s,a)) - \hat{C}^{(N+1)}(s,a)
    \end{align*}
    The second line follows from the inductive hypothesis. The third line follows from the $\max$ of a difference. In the penultimate line, we add and subtract $f(Q)$, and identify the definitions for $V_f$ and the backup equation for $\hat{C}$. In the limit $N \to \infty$, we have the desired result.
\end{proof}


\begin{lemma}

    Consider the value of the policy $\pi_f(s) = \operatorname*{arg\,max}_{a} f(Q(s,a))$ on the transformed task of interest, denoted by $\widetilde{Q}^{\pi_f}(s,a)$. %, the value of this policy 
    The sub-optimality of $\pi_f$ is then upper bounded by:
    \begin{equation}
        \widetilde{Q}(s,a) - \widetilde{Q}^{\pi_f}(s,a) \leq \hat{D}(s,a)
    \end{equation}
    where $\hat{D}$ is the value of the policy $\pi_f$ in a task with reward
    \begin{equation}
        \hat{r}_D = \gamma \mathbb{E}_{s',a' \sim{} \pi_f} \biggr[ V_f(s') - f(Q(s',a')) + \hat{C}(s',a') \biggr]
        % \hat{D}(s,a) \xleftarrow{} &\gamma \mathbb{E}_{s' \sim{} p}\mathbb{E}_{a' \sim{} \pi_f} \biggr[ V_f(s') - f(Q(s',a'))  \\ &+ \hat{C}(s',a') + \hat{D}(s',a') \biggr]
    \end{equation}
\end{lemma}

\begin{proof}
    The proof of this result is similar to that of Lemma 4.2, except now we must employ the corresponding results of Lemma 4.3. Beginning with a substitution of the inductive hypothesis:
    \begin{align}
        \widetilde{Q}^{\pi_f (N+1)}(s,a) & = \fr + \gamma \E_{s', a' \sim{} \pi_f}\left( \widetilde{Q}^{\pi_f (N)}(s',a') \right)                                                                                      \\
                                         & \geq \fr + \gamma \E_{s', a' \sim{} \pi_f}\left( \widetilde{Q}(s',a') - \hat{D}^{(N)}(s',a') \right)                                                                        \\
                                         & \geq \fr + \gamma \E_{s', a' \sim{} \pi_f}\left( f\left(Q(s',a')\right) - \hat{C}(s',a') - \hat{D}^{(N)}(s',a') \right)                                                     \\
                                         & = \fr + \gamma \E_{s'} \widetilde{V}(s') + \gamma \E_{s', a' \sim{} \pi_f}\left( f\left(Q(s',a')\right) - \hat{C}(s',a') - \hat{D}^{(N)}(s',a') - \widetilde{V}(s') \right) \\
                                         & \geq \widetilde{Q}(s,a) + \gamma \E_{s', a' \sim{} \pi_f}\left( f\left(Q(s',a')\right) - \hat{C}(s',a') - \hat{D}^{(N)}(s',a') - V_f(s') \right)                            \\
                                         & = \widetilde{Q}(s,a) - \gamma \E_{s', a' \sim{} \pi_f}\left( V_f(s') - f\left(Q(s',a')\right)+ \hat{C}(s',a') + \hat{D}^{(N)}(s',a')\right)                                 \\
                                         & = \widetilde{Q}(s,a) - \left( \hat{r}_D(s,a) + \gamma \E_{s',a' \sim{} \pi_f} \hat{D}^{(N)}(s',a') \right)                                                                  \\
                                         & = \widetilde{Q}(s,a) - \hat{D}^{(N+1)}(s,a)
    \end{align}

\end{proof}

Now we provide further details on the technical conditions for compositions (rather than transformations) of primitive tasks to satisfy the derived bounds.
\begin{lemma}[Convex Composition of Primitive Tasks]
    Suppose $F:\bigotimes_k X^{(k)} \to \mathbb{R}$ is convex on its domain and is sublinear (separately in each argument), that is:


    \begin{align}
        F(x^{(1)}+y^{(1)},x^{(2)}, \dotsc, x^{(M)}) & \leq F(x^{(1)},x^{(2)},\dotsc, x^{(M)}) + F(y^{(1)},x^{(2)},\dotsc, x^{(M)}) \\
        F(x^{(1)},x^{(2)}+y^{(2)}, \dotsc, x^{(M)}) & \leq F(x^{(1)},x^{(2)},\dotsc, x^{(M)}) + F(x^{(1)},y^{(2)},\dotsc, x^{(M)})
    \end{align}
    and similarly for the remaining arguments.

    \begin{equation}
        F(\gamma x^{(1)}, \dotsc, \gamma  x^{(M)}) \leq \gamma F(x^{(1)}, \dotsc, x^{(M)})
    \end{equation}and also satisfies
    \begin{equation}
        F\left( \max_a  \Q^{(1)}(s,a), \dotsc , \max_a \Q^{(M)}(s,a) \right) \leq \max_a F\left(\Q^{(1)}(s,a), \dotsc , \Q^{(M)}(s,a)\right)
    \end{equation}
    for all functions $\Q^{(k)}:\s \times \A \to \mathbb{R}$.
    Then,
    \begin{equation}
        F(\vec{Q}(s,a)) \le \widetilde{Q}(s,a) \le F(\vec{Q}(s,a)) + C(s,a)
    \end{equation}
    where we use a vector notation to emphasize that the function acts over the set of optimal value functions $\{Q^{(k)}\}$  corresponding to each primitive task, defined by $r^{(k)}$.
\end{lemma}
\begin{proof}
    The proof of this statement is identical to the proof of Lemma \ref{thm:convex_cond_std}, now using the fact that $F$ is a multivariable function $F: X^N \to Y$, with each argument obeying the required conditions. $C$ takes the analogous definition as provided for the original result.
\end{proof}

\begin{lemma}[Concave Composition of Primitive Tasks]\label{thm:compos_concave_std}
    If on the other hand $F$ is concave and superlinear in each argument, and also satisfies

    \begin{equation}
        F\left( \max_a  \Q^{(1)}(s,a), \dotsc , \max_a \Q^{(M)}(s,a) \right) \geq \max_a F\left(\Q^{(1)}(s,a), \dotsc , \Q^{(M)}(s,a)\right)
    \end{equation}
    for all functions $\Q^{(k)}:\s \times \A \to \mathbb{R}$, then


    \begin{equation}
        F(\vec{Q}(s,a)) - \hat{C}(s,a) \le \widetilde{Q}(s,a) \le F(\vec{Q}(s,a)).
    \end{equation}
\end{lemma}
\begin{proof}
    Again, the proof of this statement is identical to the proof of Lemma \ref{thm:concave_cond_std}, now using the fact that $F$ is a multivariable function $F: X^N \to Y$, with each argument obeying the required conditions.
\end{proof}

\subsection{Examples of transformations and compositions}
In this section, we consider the examples of transformations and compositions mentioned in the main text, and discuss the corresponding results in standard RL.
\begin{remark}
    Given the convex composition of subtasks  $r^{(c)} \equiv f(\{r^{(k)}\}) = \sum_k \alpha_k r^{(k)}$ considered by \cite{Haarnoja2018} and \cite{hunt_diverg}, we can use the results of Lemma \ref{thm:compos_concave_std} to bound the optimal $Q$ function by using the optimal $Q$ functions for the primitive tasks:
    \begin{equation}
        Q^{(c)}(s,a) \leq \sum_k \alpha_k Q^{(k)}(s,a)
    \end{equation}
\end{remark}
\begin{proof}
    In standard RL, we need only show that $f( \max_i  x_{1i}, \dotsc , \max_i x_{ni} ) \geq \max_i f(x_{1i}, \dotsc , x_{ni})$:
    \begin{equation}
        \sum_k \alpha_k  \max_i  x^{(k)}_{i} \geq \max_i \sum_k \alpha_k x^{(k)}_i
    \end{equation} which holds given $\alpha_k \geq 0$ for all $k$.
    We also note that in this case the result clearly holds for general $\alpha_k \geq 0$ not necessarily with $\sum_k \alpha_k = 1$ (as assumed in \cite{Haarnoja2018} and \cite{hunt_diverg}).
\end{proof}

\begin{remark}
    Given the AND composition defined above and considered in \cite{boolean}, we have the following result in standard RL:
    \begin{equation}
        Q^{\text{AND}}(s,a) \leq \min_k \left\{Q^{(k)}(s,a)\right\}
    \end{equation}
\end{remark}
\begin{proof}
    We could proceed via induction as in the previous proofs, or simply use the above remark, and prove the necessary conditions on the function $f(\cdot) = \min(\cdot)$.
    The function $\min(\cdot)$ is concave in each argument. It is also straightforward to show that $\min(\cdot)$ is superadditive over all arguments.
\end{proof}


\begin{remark}
    Result for the (hard) OR composition in standard RL:
    \begin{equation}
        Q^{\text{OR}}(s,a) \geq \max_k \left\{Q^{(k)}(s,a)\right\}
    \end{equation}
\end{remark}


\begin{proof}
    The proof is analogous to the (hard) AND result: $\max$ is a convex, subadditive function.

\end{proof}


\begin{remark}
    Result for NOT operation in standard RL:
    \begin{equation}
        Q^{\text{NOT}}(s,a) \geq - Q(s,a)
    \end{equation}
\end{remark}

\begin{proof}
    Since the ``NOT'' gate is a unary function, and we are in the standard RL setting, we must check the conditions of Lemma 4.1 or 4.3. Moreover, since the transformation function applied to the rewards, $f(r)=-r$, is linear, we need only check the final condition: $\max_i\{-x_i\} = -\min_i\{x_i\} \geq -\max_i\{x_i\}$. This is the condition required by the convex conditions.
\end{proof}

\section{Proofs for Entropy-Regularized RL}

Let $X$ be the codomain for the $Q$ function of the primitive task ($Q: \s \times \A \to X \subseteq \mathbb{R}$).
\begin{lemma}[Convex Conditions]
    \label{thm:forward_cond_entropy-regularized}
    Given a bounded, continuous transformation function $f~:~X~\to~\mathbb{R}$ which satisfies:
    \begin{enumerate}
        \item $f$ is convex on its domain $X$ (for stochastic dynamics);
        \item $f$ is sublinear:
              \begin{enumerate}[label=(\roman*)]
                  \item $f(x+y) \leq f(x) + f(y)$ for all $x,y \in X$
                  \item $f(\gamma x) \leq \gamma f(x)$ for all $x \in X$ % and all %$\lambda \in (0,1)$,
              \end{enumerate}
        \item $f\left( \log \E \exp \mathcal{Q}(s,a) \right) \leq \log \E \exp f\left( \mathcal{Q}(s,a) \right)$ for all functions $\mathcal{Q}:~\s~\times~\A \to \mathbb{R}.$

    \end{enumerate}

    then the optimal action-value function for the transformed rewards, $\widetilde{Q}$, is now related to the optimal action-value function with respect to the original rewards by:
    \begin{equation}\label{eq:convex_entropy-regularized}
        f \left( Q(s,a) \right) \leq \widetilde{Q}(s,a) \leq f \left( Q(s,a) \right) + C(s,a)
    \end{equation}

\end{lemma}


\begin{proof}
    We will again prove the result with induction, beginning by writing the backup equation for the optimal soft $Q$ function in the transformed reward environment to prove the lower bound on $\widetilde{Q}$:
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) = f(r(s,a)) + \gamma \mathbb{E}_{s' \sim{} p(s'|s,a)} \frac{1}{\beta} \log \mathbb{E}_{a' \sim{} \pi_0(a'|s')} \exp \left(\beta \widetilde{Q}^{(N)}(s',a')\right)
    \end{equation}
    where $p$ is the dynamics and $\pi_0$ is the prior policy. Applying the inductive assumption,
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) \geq f(r(s,a)) + \gamma \mathbb{E}_{s' \sim{} p(s'|s,a)} \frac{1}{\beta} \log \mathbb{E}_{a' \sim{} \pi_0(a'|s')}\exp \left( \beta f\left(Q^{(N)}(s',a')\right) \right)
    \end{equation}
    Next, using the third condition on $f$ as well as its convexity, we may factor $f$ out of the expectations by Jensen's inequality:
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) \geq f(r(s,a)) + \gamma f\left( \mathbb{E}_{s' \sim{} p(s'|s,a)} \frac{1}{\beta} \log \mathbb{E}_{a' \sim{} \pi_0(a'|s')}\exp \left(\beta Q^{(N)}(s',a')\right) \right)
    \end{equation}
    Finally, using the sublinearity conditions of $f$, we arrive at
    \begin{equation}
        \widetilde{Q}^{(N+1)}(s,a) \geq f \left(r(s,a) + \gamma \mathbb{E}_{s' \sim{} p(s'|s,a)} \frac{1}{\beta} \log \mathbb{E}_{a' \sim{} \pi_0(a'|s')}\exp \left(\beta Q^{(N)}(s',a')\right) \right)
    \end{equation}
    The right hand side is $f \left(Q^{(N+1)}(s,a) \right)$. In the limit $N\to \infty,\ Q^{(N)}(s,a)  \to Q(s,a)$ so the inductive proof for the lower bound is complete.

    Let $f$ satisfy the ``convex conditions''.
    Consider the backup equation for $\widetilde{Q}$. For the initialization (base case) we let $\widetilde{Q}^{(0)}(s,a)=f\left(Q(s,a)\right)$ and $C^{(0)}(s,a)=0$.
    Using the inductive assumption,

    \begin{align*}
        \widetilde{Q}^{(N+1)}(s,a) & = f(r(s,a)) + \frac{\gamma}{\beta} \E_{s' \sim{} p} \log \E_{a' \sim{} \pi_0} \exp \beta \widetilde{Q}^{(N)}(s',a')
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & \leq f(r(s,a)) + \frac{\gamma}{\beta} \E_{s'} \log \E_{a'} \exp \beta \left( f(Q(s',a')) + C^{(N)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & \leq f(r(s,a)) + \frac{\gamma}{\beta} \E_{s'} \left(\log \E_{a'} \exp \beta  f(Q(s',a')) + \beta \max_{a'} C^{(N)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & = f(Q(s,a)) + f(r(s,a)) + \gamma \E_{s'} V_f(s') - f(Q(s,a)) + \gamma \E_{s'} \max_{a'} C^{(N)}(s',a')
        % \\
        %     % \widetilde{Q}^{(k+1)}(s,a) 
        %     &\leq f(Q(s,a)) +  \frac{\gamma}{\beta} \E_{s'} \left(\log \E \exp \beta  f(Q(s',a')) - f(V) \right) - \E_{s'} \left(\min_{a'} \hat{C}^{(k)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & = f(Q(s,a)) + C^{(N+1)}(s,a)
    \end{align*}

    Therefore in the limit $N \to \infty$, we have:
    $\widetilde{Q}(s,a) \leq f(Q(s,a)) + C(s,a)$ as desired. We note that since $f(r(s,a))~+~\gamma~\E_{s'}~V_f(s')~\geq~f(Q(s,a))$, we immediately have $C(s,a) \ge 0$, as is required for the bound to be non-vacuous.
\end{proof}

\begin{lemma}
    % Moreover, when using soft-policy evaluation on $\pi_f \propto \exp \beta f(Q)$, the value of this policy is bounded from below:
    Consider the soft value of the policy $\pi_f \propto \exp \beta f(Q)$ on the transformed task of interest, denoted by $\widetilde{Q}^{\pi_f}(s,a)$. %, the value of this policy 
    The sub-optimality of $\pi_f$ is then upper bounded by:
    \begin{equation}
        \widetilde{Q}(s,a) - \widetilde{Q}^{\pi_f}(s,a) \leq D(s,a)
    \end{equation}
    where $D$ is the value of the policy $\pi_f$
    with reward
    \begin{equation}
        r_D(s,a) = \gamma \mathbb{E}_{s' \sim{} p}
        \left[ \max_{b} \left\{ f\left(Q(s',b)\right) + C(s',b) \right\} -V_f(s') \right]
    \end{equation}


\end{lemma}

\begin{proof}
    To prove the (soft) policy evaluation bound, we use iterations of soft-policy evaluation \cite{haarnoja_SAC} and denote iteration $N$ of the evaluation of $\pi_f$ in the composite environment as $\widetilde{Q}^{\pi_f(N)}$. Beginning with the definitions $\widetilde{Q}^{\pi_f(0)}(s,a) = \widetilde{Q}(s,a)$ (since the evaluation is independent of the initialization), and $D^{(0)}=0$, the $N=0$ step is trivially satisfied. Assuming the inductive hypothesis, we consider the next step of soft policy evaluation:
    As in the previous policy evaluation results, we prove an equivalent result with induction.

    \begin{align*}
        \widetilde{Q}^{\pi_f(N+1)}(s,a) & = f(r(s,a)) + \gamma \E_{s' \sim{} p}\E_{ a'\sim{} \pi_f} \left[\widetilde{Q}^{\pi_f(N)}(s',a') - \frac{1}{\beta} \log \frac{\pi_f(a'|s')}{\pi_0(a'|s')} \right]
        \\
        % Q^{(k)} 
                                        & \geq f(r(s,a)) + \gamma \E_{s',a'} \left[\widetilde{Q}(s',a') - D^{(N)}(s',a') - f(Q(s',a')) + V_f(s') \right]
        \\
        % Q^{(k)}
                                        & = f(r(s,a)) + \gamma \E_{s'}\widetilde{V}(s')  + \gamma \E_{s',a'} \left[\widetilde{Q}(s',a') - D^{(N)}(s',a') - f(Q(s',a'))  + V_f(s') - \widetilde{V}(s')\right] \\
        % Q^{(k)}
                                        & \geq \widetilde{Q}(s,a)  + \gamma \E_{s',a'} \left[f(Q(s',a')) - D^{(N)}(s',a') - f(Q(s',a'))  + V_f(s') - \widetilde{V}(s')\right]                                \\
                                        & \geq \widetilde{Q}(s,a)  + \gamma \E_{s',a'} \left[ - D^{(N)}(s',a') + V_f(s') - \max_{b} \left\{ f\left(Q(s',b)\right) + C(s',b) \right\} \right]                 \\
                                        & \geq  \widetilde{Q}(s,a) -D^{(N+1)}(s,a)
    \end{align*}

    where we have used $\widetilde{Q}(s,a) \geq f(Q(s,a))$ in the fourth line.
    % \JA{double check the last lines; tighten bound? update main text}

    We have also used the fact that $ \widetilde{V}(s) \leq \max_b \left\{ f\left(Q(s,b)\right) + C(s,b)\right\}$ and $\widetilde{Q}(s,a) - f(Q(s,a)) \geq 0$, which both follow from the previously stated bounds.
\end{proof}


\begin{lemma}[Concave Conditions]
    \label{thm:reverse_cond_entropy-regularized}
    Given a bounded, continuous transformation function $f~:~X~\to~\mathbb{R}$ which satisfies:
    \begin{enumerate}
        \item $f$ is concave on its domain $X$ (for stochastic dynamics);
        \item $f$ is superlinear:
              \begin{enumerate}[label=(\roman*)]
                  \item $f(x+y) \geq f(x) + f(y)$ for all $x,y \in X$
                  \item $f(\gamma x) \geq \gamma f(x)$ for all $x \in X$ %and all $\lambda \in (0,1)$
              \end{enumerate}
        \item $f\left( \log \E \exp \mathcal{Q}(s,a) \right) \geq \log \E \exp f\left( \mathcal{Q}(s,a) \right)$ for all functions $\mathcal{Q}:~\s~\times~\A \to \mathbb{R}.$

              % \item $f\left(\frac{1}{\beta}\log\mathbb{E}_{a' \sim{} \pi_0(a'|s')} \exp(\beta x_i)\right) \ge \frac{1}{\beta}\log \mathbb{E}_{a' \sim{} \pi_0(a'|s')} \exp(\beta f(x_i)) \ \forall x_i \in X$
    \end{enumerate}
    then the optimal action-value function for the transformed rewards obeys the following inequality:
    \begin{equation}\label{eq:concave_entropy-regularized}
        f\left( Q(s,a) \right) - \hat{C}(s,a) \leq \widetilde{Q}(s,a) \leq f \left( Q(s,a) \right)
    \end{equation}
\end{lemma}



\begin{proof}
    The proof of the upper bound is the same as the preceding theorem's lower bound with all inequalities reversed.
    For the lower bound involving $\hat{C}$, we again consider the backup equation for $\widetilde{Q}$.
    Using the definitions and inductive assumption as before, we have

    \begin{align*}
        \widetilde{Q}^{(N+1)}(s,a) & = f(r(s,a)) + \frac{\gamma}{\beta} \E_{s' \sim{} p} \log \E_{a' \sim{} \pi_0} \exp \beta \widetilde{Q}^{(N)}(s',a')
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & \geq f(r(s,a)) + \frac{\gamma}{\beta} \E_{s'} \log \E_{a'} \exp \beta \left( f(Q(s',a')) -\hat{C}^{(N)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & \geq f(r(s,a)) + \frac{\gamma}{\beta} \E_{s'} \left(\log \E_{a'} \exp \beta  f(Q(s',a')) - \beta \max_{a'} \hat{C}^{(N)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & = f(Q(s,a)) - \left[f(Q(s,a)) - f(r(s,a)) - \gamma \E_{s'} V_f(s') + \gamma \E_{s'} \max_{a'} \hat{C}^{(N)}(s',a')\right]
        % \\
        %     % \widetilde{Q}^{(k+1)}(s,a) 
        %     &\geq f(Q(s,a)) +  \frac{\gamma}{\beta} \E_{s'} \left(\log \E \exp \beta  f(Q(s',a')) - f(V) \right) - \E_{s'} \left(\max_{a'} C^{(k)}(s',a')\right)
        \\
        % \widetilde{Q}^{(k+1)}(s,a) 
                                   & = f(Q(s,a)) -  \hat{C}^{(N+1)}(s,a)
    \end{align*}


    Therefore in the limit $N \to \infty$, we have:
    $\widetilde{Q}(s,a) \geq f(Q(s,a)) - \hat{C}(s,a)$ as desired.
\end{proof}

\begin{lemma}
    Consider the soft value of the policy $\pi_f \propto \exp \beta f(Q)$ on the transformed task of interest, denoted by $\widetilde{Q}^{\pi_f}(s,a)$. %, the value of this policy 
    The sub-optimality of $\pi_f$ is then upper bounded by:
    \begin{equation}
        \widetilde{Q}(s,a) - \widetilde{Q}^{\pi_f}(s,a) \leq \hat{D}(s,a)
    \end{equation}
    where $\hat{D}$ is the fixed point of
    \begin{equation}
        \hat{D}(s,a) \xleftarrow{} \gamma \mathbb{E}_{s' \sim{} p}\mathbb{E}_{a' \sim{} \pi_f} \left[ \hat{C}(s',a') + \hat{D}(s',a') \right]
    \end{equation}

\end{lemma}


\begin{proof}
    We will show the policy evaluation result by induction, by evaluating $\pi_f \propto \exp(\beta f(Q))$ in the environment with rewards $f(r)$. We shall denote iterations of policy evaluation for $\pi_f$ in the environment with rewards $f(r)$ by $\widetilde{Q}^{\pi_f(N)}(s,a)$.% in order to ease the notation.

    \begin{align*}
        \widetilde{Q}^{\pi_f(N+1)}(s,a) & = f(r(s,a)) + \gamma \E_{s'\sim{}p} \E_{a'\sim{} \pi_f} \left[\widetilde{Q}^{\pi_f(N)}(s',a') - \frac{1}{\beta} \log \frac{\pi_f(a'|s')}{\pi_0(a'|s')} \right]
        \\
        % \widetilde{Q}_{\pi_f}^{(k+1)} 
                                        & \geq f(r(s,a)) + \gamma \E_{s',a'} \left[\widetilde{Q}(s',a')-\hat{D}^{(N)}(s',a') - (f(Q(s',a')) - V_f(s')) \right]
        \\
        % \widetilde{Q}_{\pi_f}^{(k+1)} 
                                        & \geq f(r(s,a)) + \gamma \E_{s',a'} \left[\widetilde{Q}(s',a')-\hat{D}^{(N)}(s',a') - \widetilde{Q}(s',a') -\hat{C}(s',a') + V_f(s') \right]
        \\
                                        & \ge f(r(s,a)) + \gamma \E_{s'} \widetilde{V}(s') - \gamma \E_{s',a'} \left[\hat{D}^{(N)}(s',a') + \hat{C}(s',a') \right]
        \\
        %     &= f(r(s,a)) + \gamma \E_{s'} V(s') - D^{(k+1)}(s,a)
        % \\
                                        & = \widetilde{Q}(s,a) - \hat{D}^{(N+1)}(s,a)
    \end{align*}
    where we have used the inductive assumption and $V_f(s) \ge \widetilde{V}(s)$, which follows from the previously stated bounds.
    Therefore in the limit $N \to \infty$, we have:
    $    \widetilde{Q}^{\pi_f}(s,a) \geq \widetilde{Q}(s,a) - \hat{D}(s,a)
    $ as desired.
\end{proof}

\begin{lemma}[Convex Composition of Primitive Tasks]\label{thm:compos_convex_maxent}
    Suppose $F:X^N \to Y$ is convex on its domain $X^N$ and satisfies all conditions of Lemma 5.1 (Main Text) component-wise. Then,
    \begin{equation}
        F(\vec{Q}(s,a)) \le \widetilde{Q}(s,a) \le F(\vec{Q}(s,a)) + C(s,a)
    \end{equation}
    and
    \begin{equation}
        \widetilde{Q}^{\pi_f}(s,a) \geq \widetilde{Q}(s,a) - D(s,a)
    \end{equation}
    where we use a vector notation to emphasize that the function acts over the set of optimal $\{Q_k\}$ functions corresponding to each subtask, defined by $r_k$.
\end{lemma}
\begin{proof}
    The proof of this statement is identical to the previous proofs, now using the fact that $F$ is a multivariable function $F: X^N \to Y$, with each argument obeying the required conditions.
\end{proof}

\begin{lemma}[Concave Composition of Primitive Tasks]\label{thm:compos_concave_maxent}
    If on the other hand $F$ is concave and satisfies all conditions of Lemma 5.2 (Main Text) component-wise, then
    \begin{equation}
        F(\vec{Q}(s,a)) - \hat{C}(s,a) \le \widetilde{Q}(s,a) \le F(\vec{Q}(s,a))
    \end{equation}
    and
    \begin{equation}
        \widetilde{Q}^{\pi_f}(s,a) \geq \widetilde{Q}(s,a) - \hat{D}(s,a)
    \end{equation}
\end{lemma}
\begin{proof}
    Again, the proof of this statement is identical to the previous proofs, now using the fact that $F$ is a multivariable function $F: X^N \to Y$, with each argument obeying the required conditions.
\end{proof}

\subsection{Examples of Transformations and Compositions}
In this section we consider several examples mentioned in the main text, and show how they are proved with our results in entropy-regularized RL.

\begin{remark}
    Given the convex composition of subtasks  $r^{(c)} \equiv F(\{r^{(k)}\}) = \sum_k \alpha_k r^{(k)}$ considered by \cite{Haarnoja2018} and \cite{hunt_diverg}, we can use the results of Lemma \ref{thm:compos_concave_maxent} to bound the optimal $Q$ function by using the optimal $Q$ functions for the primitive tasks:
    \begin{equation}
        Q^{(c)}(s,a) \leq \sum_k \alpha_k Q^{(k)}(s,a)
    \end{equation}
\end{remark}
\begin{proof}
    In entropy-regularized RL we need to show that the final condition holds (in vectorized form). This is simply H\"older's inequality \cite{hardy1952inequalities} for vector-valued functions in a probability space (with measure defined by $\pi_0$).

\end{proof}

\begin{remark}
    Given the AND composition defined above and considered in \cite{boolean}, we have the following result in entropy-regularized RL:
    \begin{equation}
        Q^{\text{AND}}(s,a) \leq \min_k \left\{Q^{(k)}(s,a)\right\}
    \end{equation}
\end{remark}
\begin{proof}
    The function $\min(\cdot)$ is concave in each argument. It is also straightforward to show that $\min(\cdot)$ is superadditive over all arguments. For the final condition, the $\min$ acts globally over all subtasks:
    \begin{equation}
        \min_k \left\{ \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp\left(\beta \Q^{(k)}(s,a)\right)\right\} \geq \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp\left( \beta \min_k \left\{\Q^{(k)}(s,a)\right\}\right).
    \end{equation}
\end{proof}


\begin{remark}
    Result for the (hard) OR composition in entropy-regularized RL:
    \begin{equation}
        Q^{\text{OR}}(s,a) \geq \max_k \left\{Q^{(k)}(s,a)\right\}
    \end{equation}
\end{remark}


\begin{proof}
    The proof is analogous to the (hard) AND result: $\max$ is a convex, subadditive function.
    For the final condition, the $\max$ again acts globally over all subtasks:
    \begin{equation}
        \max_k \left\{ \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp\left(\beta
        \Q^{(k)}(s,a)\right)\right\} \le \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp\left( \beta \max_k \left\{\Q^{(k)}(s,a)\right\}\right).
    \end{equation}

\end{proof}

\begin{remark}
    Again we consider the NOT operation defined above, now in entropy-regularized RL, which yields the bound:
    \begin{equation}
        Q^{\text{NOT}}(s,a) \geq - Q(s,a)
    \end{equation}
\end{remark}

\begin{proof}
    As in the standard RL case, we need only consider the third condition of either Lemma 5.1 or 5.3.
    In particular, we show
    \begin{equation}
        f\left( \log \E \exp \mathcal{Q}(s,a) \right) \leq \log \E \exp f\left( \mathcal{Q}(s,a) \right)
    \end{equation}
    for all functions $\mathcal{Q}:~\s~\times~\A \to \mathbb{R}$. This follows from
    \begin{align}
        \frac{1}{\E \exp \mathcal{Q}(s,a)} \leq \E \frac{1}{\exp \mathcal{Q}(s,a) }
    \end{align}
    which is given by Jensen's inequality, since the function $x \mapsto 1/x$ is convex on the positive reals.

\end{proof}


\begin{remark}[Linear Scaling]
    \label{thm:scaling}
    Given some $k \in (0,1)$ the function $f(x) = k x$ satisfies the results of the first theorem. Conversely, if  $k \geq 1$, $f(x) = k x$ satisfies the results of the second theorem.
\end{remark}
\begin{proof}
    This result (specifically the third condition of Lemma 5.1, 5.3) follows from the monotonicity of $\ell_p$ norms.
\end{proof}


Since we have already shown the case of $k=-1$ (NOT gate), with the result of Theorem \ref{thm:compos}, the case for all $k \in \mathbb{R}$ has been characterized.

\section{Extension for Error-Prone $Q$-Values}
In this section, we provide some discussion on the case of inexact $Q$-values, as often occurs in practice (discussed at the end of Section 4.1 in the main text). We focus on the case of task transformation in standard RL. The corresponding statements in the settings of composition and entropy-regularized RL follow similarly.

As our starting point, we assume that an ``$\varepsilon$-optimal estimate'' $\overbar{Q}(s,a)$ for a primitive task's exact value function $Q(s,a)$ is known.

\begin{definition}
    An $\varepsilon$-optimal $Q$-function, $\overbar{Q}$, satisfies
    \begin{equation}
        |Q(s,a)-\overbar{Q}(s,a)|\leq \varepsilon
    \end{equation}
    for all $s \in \s, a \in \A$.
\end{definition}

To allow the derived double-sided bounds on the transformed tasks' $Q$-values to carry over to this more general setting, we assume that the transformation function is $L$-Lipschitz continuous. With these assumptions, we prove the following extensions of Lemma 4.1 and 4.3:

\begin{customlemma}{4.1A}[Convex Conditions, Error-Prone]\label{thm:convex_cond_std_err}
    Given a primitive task with discount factor $\gamma$, corresponding $\varepsilon$-optimal value function $\overbar{Q}$, and a bounded, continuous, $L$-Lipschitz transformation function $f~:~X~\to~\mathbb{R}$ which satisfies:
    \begin{enumerate}
        \item $f$ is convex on its domain $X$ (for stochastic dynamics);
        \item $f$ is sublinear:
              \begin{enumerate}[label=(\roman*)]
                  \item $f(x+y) \leq f(x) + f(y)$ for all $x,y \in X$
                  \item $f(\gamma x) \leq \gamma f(x)$ for all $x \in X$ % and all %$\lambda \in (0,1)$,
              \end{enumerate}
        \item $f\left( \max_{a} \mathcal{Q}(s,a) \right) \leq \max_{a}~f\left( \mathcal{Q}(s,a) \right)$ for all $\mathcal{Q}: \s \times \A \to \mathbb{R}.$
    \end{enumerate}

    then the optimal action-value function for the transformed rewards, $\widetilde{Q}$, is now related to the optimal action-value function with respect to the original rewards  by:

    \begin{equation}\label{eqn:convex_std_err}
        f(\overbar{Q}(s,a)) - L \varepsilon \leq \widetilde{Q}(s,a) \leq f(\overbar{Q}(s,a)) + \overbar{C}(s,a)  + \frac{2}{1-\gamma}L \varepsilon
    \end{equation}

    where $\overbar{C}$ is the optimal value function for a task with reward
    \begin{equation}\label{eq:std_convex_C_def_err}
        \overbar{r_C}(s,a) = f(r(s,a)) + \gamma \mathbb{E}_{s'} \overbar{V_f}(s') - f(\overbar{Q}(s,a)).
    \end{equation}
    with $\overbar{V_f}(s)=\max_a f(\overbar{Q}(s,a))$.
    % for all states $s \in \s$ and actions $a \in \A$.
\end{customlemma}

Note that as $\varepsilon \to 0$, the exact result (Lemma 4.1) is recovered. If the function $\overbar{C}$ is not known exactly, one can similarly exchange $\overbar{C}$ for $\overbar{\overbar{C}}$, an $\varepsilon$-optimal estimate for $\overbar{C}$. This consideration loosens the upper-bound by an addition of $\varepsilon$, shown at the end of the proof.

We will make use of a well-known result (cf. proof of Lemma 1 in \cite{barreto_sf}) that bounds the difference in optimal $Q$-values for two tasks with different reward functions.
\begin{lemma}
    Let two tasks, only differing in their reward functions, be given with reward $r_1(s,a)$ and $r_2(s,a)$, respectively. Suppose $|r_1(s,a)-r_2(s,a)|\leq \delta$ for all $s \in \s, a \in \A$. Then, the optimal value functions for the tasks satisfy:
    \begin{equation}
        |Q_1(s,a)-Q_2(s,a)|\leq \frac{\delta}{1-\gamma}
    \end{equation}
    \label{lem:bounded_q_diff}
\end{lemma}

Now we are in a position to prove Lemma \ref{thm:convex_cond_std_err}:
\begin{proof}
    To prove the lower bound, we begin with the original lower bound in Lemma 4.1, for the optimal primitive task $Q$-values:
    \begin{equation}
        \widetilde{Q}(s,a) \geq f(Q(s,a)),
    \end{equation}
    or equivalently
    \begin{align}
        -\widetilde{Q}(s,a) & \leq -f(Q(s,a))                                               \\
        -\widetilde{Q}(s,a) & \leq -f(Q(s,a)) + f(\overbar{Q}(s,a))  -f(\overbar{Q}(s,a))   \\
        -\widetilde{Q}(s,a) & \leq |f(Q(s,a)) - f(\overbar{Q}(s,a))| - f(\overbar{Q}(s,a))  \\
        \widetilde{Q}(s,a)  & \geq -|f(Q(s,a)) - f(\overbar{Q}(s,a))| + f(\overbar{Q}(s,a)) \\
        \widetilde{Q}(s,a)  & \geq -L|Q(s,a) - \overbar{Q}(s,a)| + f(\overbar{Q}(s,a))      \\
        \widetilde{Q}(s,a)  & \geq f(\overbar{Q}(s,a)) - L \varepsilon
    \end{align}

    The final steps follow from the function $f$ being $L$-Lipschitz and the definition of $\varepsilon$-optimality of $\overbar{Q}(s,a)$.

    To prove the upper bound, we take a similar approach, noting that the reward function $r_C$ in Lemma 4.1 must be updated to account for the inexact $Q$-values. Therefore, we must account for the following error propagations:
    \begin{align*}
        Q(s,a)   & \to \overbar{Q}(s,a)    \\
        V_f(s)   & \to \overbar{V_f}(s)    \\
        r_C(s,a) & \to \overbar{r_C}(s,a).
    \end{align*}
    We first find the difference between $r_C$ and $\overbar{r_C}$ to be bounded by $(1+\gamma)L\varepsilon$:
    \begin{align}
        |r_C(s,a)-\overbar{r_C}(s,a)| & = |\gamma \E_{s' \sim{} p } V_f(s') - f(Q(s,a)) - \gamma \E_{s' \sim{} p } \overbar{V_f}(s') + f(\overbar{Q}(s,a))| \\
                                      & \leq \gamma \E_{s'} |V_f(s') - \overbar{V_f}(s')| + |f(Q(s,a)) - f(\overbar{Q}(s,a))|                               \\
                                      & \leq \gamma \E_{s'} \max_{a'} |f(Q(s',a')) - f(\overbar{Q}(s',a'))| + |f(Q(s,a)) - f(\overbar{Q}(s,a))|             \\
                                      & \leq (1+\gamma)L \varepsilon
    \end{align}
    where in the third line we have used the bound $|\max_{x} f(x) - \max_{x} g(x)| \leq \max_{x} |f(x)-g(x)|$.

    Now, applying Lemma \ref{lem:bounded_q_diff} to the reward functions $r_C$ and $\overbar{r_C}$:
    \begin{equation}
        |C(s,a) - \overbar{C}(s,a)| \leq \frac{ (1+\gamma)}{1-\gamma}L \varepsilon
    \end{equation}
    With the same technique as was used above for the lower bound, we find:
    \begin{align}
        \widetilde{Q}(s,a) & \leq f(Q(s,a)) +  C(s,a)                                                                                \\
                           & \leq f(\overbar{Q}(s,a)) + L \varepsilon + C(s,a)                                                       \\
                           & = f(\overbar{Q}(s,a)) + L \varepsilon + \overbar{C}(s,a) - \overbar{C}(s,a) + C(s,a)                    \\
                           & \leq f(\overbar{Q}(s,a)) +  L \varepsilon + |C(s,a) - \overbar{C}(s,a)| + \overbar{C}(s,a)              \\
                           & \leq f(\overbar{Q}(s,a)) + L \varepsilon + \overbar{C}(s,a) + \frac{ (1+\gamma)}{1-\gamma}L \varepsilon \\
                           & = f(\overbar{Q}(s,a)) + \overbar{C}(s,a) + \frac{2}{1-\gamma}L \varepsilon
    \end{align}
    Further extending the result to the case where only an $\varepsilon$-optimal estimate of $\overbar{C}$ is known, denoted by $\overbar{\overbar{C}}$, we find:
    \begin{align}
        \widetilde{Q}(s,a) & \leq f(\overbar{Q}(s,a)) + \overbar{C}(s,a) + \frac{2}{1-\gamma}L \varepsilon                                                            \\
                           & \leq f(\overbar{Q}(s,a)) + \overbar{\overbar{C}}(s,a) + |\overbar{\overbar{C}}(s,a)- \overbar{C}(s,a)| + \frac{2}{1-\gamma}L \varepsilon \\
                           & \leq f(\overbar{Q}(s,a)) + \overbar{\overbar{C}}(s,a) + \varepsilon + \frac{2}{1-\gamma}L \varepsilon                                    \\
                           & = f(\overbar{Q}(s,a)) + \overbar{\overbar{C}}(s,a) + \left(1+ \frac{2}{1-\gamma}L \right)\varepsilon
    \end{align}
\end{proof}

Similarly, Lemma 4.3 from the main text can be extended under the same conditions:
\begin{customlemma}{4.3A}[Concave Conditions, Error-Prone]\label{thm:concave_cond_std_err}
    Given a primitive task with discount factor $\gamma$, corresponding $\varepsilon$-optimal value function $\overbar{Q}$, and a bounded, continuous, $L$-Lipschitz transformation function $f~:~X~\to~\mathbb{R}$ which satisfies:
    \begin{enumerate}
        \item $f$ is concave on its domain $X$ (for stochastic dynamics);
        \item $f$ is superlinear:
              \begin{enumerate}[label=(\roman*)]
                  \item $f(x+y) \geq f(x) + f(y)$ for all $x,y \in X$
                  \item $f(\gamma x) \geq \gamma f(x)$ for all $x \in X$ %and all $\lambda \in (0,1)$
              \end{enumerate}
        \item $f\left( \max_{a} \mathcal{Q}(s,a) \right) \geq \max_{a}~f\left( \mathcal{Q}(s,a) \right)$ for all functions $\mathcal{Q}:~\s~\times~\A \to X.$
    \end{enumerate}

    then the optimal action-value functions are now related in the following way:
    \begin{equation}\label{eqn:concave_std_err}
        f(\overbar{Q}(s,a)) - \overbar{\hat{C}}(s,a)-\frac{2}{1-\gamma}L \varepsilon \leq \widetilde{Q}(s,a) \leq f(\overbar{Q}(s,a)) + L \varepsilon
    \end{equation}

    where $\overbar{\hat{C}}$ is the optimal value function for a task with reward
    \begin{equation}
        \overbar{\hat{r}_C}(s,a) = f(\overbar{Q}(s,a)) - f(r(s,a)) - \gamma \E_{s'\sim{}p} \overbar{V_f}(s')
    \end{equation}
    with $\overbar{V_f}(s)=\max_a f(\overbar{Q}(s,a))$.
    % for all states $s \in \s$ and actions $a \in \A$.
\end{customlemma}

The proof of Lemma \ref{thm:concave_cond_std_err} is the same as that given above for Lemma \ref{thm:convex_cond_std_err}, with all signs flipped.

Finally, we note that both extensions (Lemmas~\ref{thm:convex_cond_std_err} and~\ref{thm:concave_cond_std_err}) hold for the entropy-regularized case. The only differences required to prove the results are showing that Lemma~\ref{lem:bounded_q_diff} and $|V_f(s)-\overbar{V_f}(s)|\leq L\varepsilon$ hold in entropy-regularized RL. Both statements are trivial given that the necessary soft-max operation is $1$-Lipschitz. Similar results can be derived for the case of compositions, when each subtask's $Q$-function is replaced by an $\varepsilon$-optimal estimate thereof.

% \clearpage
% \newpage
\section{Results Applying to Both Entropy-Regularized and Standard RL}

As we have discussed in the main text, an agent with a large library of accessible functions will be able to transform and compose their primitive knowledge in a wider variety of ways. Therefore, we would like to extend $\mathcal{F}$ to encompass as many functions as possible. Below, we will show that the functions $f\in \mathcal{F}$ characterizing the Transfer MDP Library have two closure properties (additivity and function composition), which enable more accessible transfer functions.

First, let $\mathcal{F}^+$ denote the set of functions $f \in \mathcal{F}$ obeying the convex conditions, and similarly let $\mathcal{F}^-$ denote the set of functions obeying the concave conditions.

In standard RL, we have the following closure property for addition of functions.
\begin{theorem}
    Let $f,g \in  \mathcal{F}^+$. Then $f+g \in \mathcal{F}^+$. Similarly, if $f,g \in \mathcal{F}^-$, then $f+g \in \mathcal{F}^-$.
\end{theorem}
\begin{proof}
    Let $f,g \in \mathcal{F}^+$.

    Convexity:
    The sum of two convex functions is convex.

    Subadditive:
    $(f+g)(x+y) = f(x+y)+g(x+y)\leq f(x)+g(x)+f(y)+g(y)=(f+g)(x)+(f+g)(y)$.

    Submultiplicative:
    $(f+g)(\gamma x) = f(\gamma x) + g(\gamma x) \le \gamma f(x) + \gamma g(x) = \gamma(f+g)(x)$.

    The proof for $f,g \in \mathcal{F}^-$ is the same with all signs flipped, except for the additional final condition:
    $(f+g)(\max_i x_i) = f(\max_i x_i) + g(\max_i x_i) = \max_i f(x_i) + \max_i g(x_i) \ge \max_i \left( f(x_i) + g(x_i) \right).$ Although this is not an equality as shown in the main text, the condition still suffices. For the case of a single function (no addition, as seen in the main text), it can never be the case that $f(\max_i x_i)~>~\max_i f(x_i)$, and therefore this case was excluded. (Just as $f(\max_i x_i) \le \max_i f(x_i)$ is automatically satisfied for the convex conditions.)
\end{proof}

\begin{theorem}[Function Composition]
    \label{thm:compos}
    For any reward-mapping functions $f$, $g \in \mathcal{F}^+$ ($\mathcal{F}^-$) with $f$ non-decreasing, the composition $h(x) = f(g(x))$ satisfies $h \in \mathcal{F}^+$ ($\mathcal{F}^-$).
\end{theorem}
\begin{proof}
    Let $f,g \in \mathcal{F}^+$ with $f\colon B \to C$ and $g\colon A \to B$, and let $f$ be non-decreasing. Since $f$ is convex and non-decreasing and $g$ is convex, the composition $f(g(x))$ is convex.
    % (\JA{Reference to book?: (comment) })
    % https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf
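    Additionally, $f(g(x+y)) \leq f(g(x)+g(y)) \leq f(g(x)) + f(g(y))$, where the first inequality uses the subadditivity of $g$ together with the monotonicity of $f$, and the second uses the subadditivity of $f$. Similarly, $f(g(\gamma x)) \leq f(\gamma g(x)) \leq \gamma f(g(x))$.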

    For the standard RL (concave) condition, note that  for all functions $\mathcal{Q}:~\s~\times~\A \to X$:
    \begin{equation}
        f\left( g\left( \max_{a} \mathcal{Q}(s,a)\right) \right) \geq f\left(\max_{a}~g\left( \mathcal{Q}(s,a)\right) \right) \geq \max_{a}~f\left(g\left( \mathcal{Q}(s,a)\right) \right)
    \end{equation}

    For the entropy-regularized condition, we first apply the condition to $g$ (using that $f$ is non-decreasing):
    \begin{equation}
        f\left( g\left( \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp(\beta \Q(s,a))\right) \right) \le f\left( \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp(\beta g(\Q(s,a)) )\right)
    \end{equation}
    Then, applying the condition to $f$, we obtain:
    \begin{equation}
        f\left( g\left( \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp(\beta \Q(s,a))\right) \right) \le  \frac{1}{\beta}\log\mathbb{E}_{a \sim{} \pi_0(a|s)} \exp\left(\beta f\left( g\left(\Q(s,a)\right) \right) \right)
    \end{equation}
    The reversed statement, when $f,g \in \mathcal{F}^-$ with $f$ non-decreasing, has a similar proof and is omitted.
\end{proof}

With this result established, we are now able to concatenate multiple transformations. This allows for multiple gates in Boolean logic statements, for example. As stated in the main text, this ability to compose multiple functions will greatly expand the number of tasks in the Transfer MDP Library which the agent may (approximately) solve.

\nocite{openAI}
\bibliography{uai2023-template.bib}

\end{document}