\setcounter{section}{0}
\renewcommand{\thesection}{\Alph{section}}
\section{Correctness} \label{app:proof}
Here, we prove the correctness of our approach, as stated in \thmref{thm:main}. 
We start by proving the following auxiliary lemmas.

%==========================================================================
\begin{lemma}\label{lem:h2p}
The adaptive hybrid reward function $\Rah{k}$ tends to the adaptive progression reward function $\Rap{k}$ as the number of updates $k$ increases, that is, $\lim_{k \to \infty} \Rah{k} = \Rap{k}$.
\end{lemma}

\begin{proof}
By the definition of the adaptive hybrid reward function $\Rah{k}$ (cf. \eqnref{eqn:rah}), we have $\eta_0 \in [0,1]$ and $\eta_k = \frac{\eta_{k-1}}{\theta}$ with $\theta > 1$.
Hence $\eta_k = \frac{\eta_0}{\theta^k}$, and therefore $\lim_{k \to \infty} \eta_k = 0$.
The first case of \eqnref{eqn:rah}, $-\eta_k \cdot d^k_\varphi(q)$, tends to $0$; and the second case tends to $\max \{\rho^0_\varphi (q, q'), \rho^k_\varphi (q, q')\}$.
Thus, it holds that $\lim_{k \to \infty} \Rah{k} = \Rap{k}$.
\end{proof}




%==========================================================================
\begin{lemma}\label{lem:progress}
Given an episodic MDP $\cM$ and a DFA $\cA_\varphi$ for a co-safe LTL formula $\varphi$, let $\pi^{*}_k$ and $\pi^{*}_{k+1}$ denote the optimal policies of the product MDP $\cM^\otimes = \cM \otimes \cA_\varphi$, maximizing the expected return based on adaptive progression reward functions $\Rap{k}$ and $\Rap{k+1}$, respectively. If a policy exists that achieves a higher expected return than $\pi^{*}_k$ based on $\Rap{k+1}$, then $\pi^{*}_{k+1}$ achieves better task progression than $\pi^{*}_k$, that is, $b(\pi^{*}_{k+1}) < b(\pi^{*}_k)$. 
\end{lemma}

\begin{proof}
For the sake of contradiction, suppose that $b(\pi^{*}_{k+1}) \ge b(\pi^{*}_k)$. 
Let $\tau$ be a path through the product MDP $\cM^\otimes$ under policy $\pi^{*}_{k+1}$. 
For any state $\langle s, q\rangle$ in the path $\tau$, we have $q \in B_i$ where $i \ge b(\pi^{*}_{k+1}) \ge b(\pi^{*}_k) = b_k$.
For every transition $(\langle s, q\rangle, a,\langle s', q'\rangle) \in \tau$, it holds that:
\begin{align*} 
& \Rap{k+1} \left(\langle s, q\rangle, a,\langle s', q'\rangle\right) \\
    =& \max \{\rho^0_\varphi (q, q'), \rho^{k+1}_\varphi (q, q')\} \\
    =& \max \{\rho^0_\varphi (q, q'), \max \{0, d^{k+1}_\varphi(q) - d^{k+1}_\varphi(q') \} \} \\
    =& \max \{\rho^0_\varphi (q, q'), \max \{0, d_\varphi^{k}(q) + \theta - d_\varphi^{k}(q') - \theta \} \} \\
    =& \max \{\rho^0_\varphi (q, q'), \max \{0, d^k_\varphi(q) - d^k_\varphi(q') \} \} \\   
    =& \max \{\rho^0_\varphi (q, q'), \rho^k_\varphi (q, q') \} \\
    =& \Rap{k} \left(\langle s, q\rangle, a,\langle s', q'\rangle\right)   
\end{align*}
Thus, we have $\Vap{k+1}{\pi^{*}_{k+1}}(s_0^{\otimes}) = \Vap{k}{\pi^{*}_{k+1}}(s_0^{\otimes})$, meaning that the expected return stays the same.
Similarly, we can show that $\Vap{k+1}{\pi^{*}_{k}}(s_0^{\otimes}) = \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes})$. 


Since $\pi^{*}_{k}$ is the optimal policy maximizing the expected return based on $\Rap{k}$, we have 
\begin{equation} \label{eqn:c1}
    \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes}) \ge \Vap{k}{\pi^{*}_{k+1}}(s_0^{\otimes}) = 
    \Vap{k+1}{\pi^{*}_{k+1}}(s_0^{\otimes}).
\end{equation}

Given that there exists a policy that achieves a higher expected return than $\pi^{*}_k$ based on $\Rap{k+1}$, 
it holds that
\begin{equation} \label{eqn:c2}
    \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes}) = \Vap{k+1}{\pi^{*}_k}(s_0^{\otimes}) < 
    \Vap{k+1}{\pi^{*}_{k+1}}(s_0^{\otimes}).
\end{equation}

\eqnref{eqn:c1} contradicts \eqnref{eqn:c2}. Thus, we have $b(\pi^{*}_{k+1}) < b(\pi^{*}_k)$. 
\end{proof}


%==========================================================================
Now we are ready to prove \thmref{thm:main} as stated in \sectref{sec:approach} and repeated here. 
\setcounter{theorem}{0}
\begin{theorem}
Given an episodic MDP $\cM$ and a DFA $\cA_\varphi$ corresponding to a co-safe LTL formula $\varphi$, there exists an optimal policy $\pi^*$ of the product MDP $\cM^\otimes = \cM \otimes \cA_\varphi$ that maximizes the expected return based on a reward function $R^{\otimes} \in \{\Rap{k}, \Rah{k}\}$ for some $k \in \Nset$, where the task progression for policy $\pi^*$ matches the best possible task progression $b^*$ across all feasible policies in the product MDP $\cM^\otimes$, that is, $b^* = b(\pi^*)$. 
\end{theorem}

\begin{proof}
Without loss of generality, we focus on the adaptive progression reward function $\Rap{k}$, as \lemref{lem:h2p} shows that $\lim_{k \to \infty} \Rah{k} = \Rap{k}$. 

Let $\pi^{*}_k$ denote an optimal policy of the product MDP $\cM^\otimes$ that maximizes the expected return based on $\Rap{k}$.
Suppose that $b(\pi^{*}_k) > b^*$. 
There exists a policy $\pi$ in the product MDP that achieves the best possible task progression $b^*$, where 
$\Vap{k}{\pi}(s_0^{\otimes}) \le \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes})$.
%
If $\Vap{k}{\pi}(s_0^{\otimes}) = \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes})$, then $\pi$ is the desired optimal policy $\pi^*$ that maximizes the expected return based on $\Rap{k}$ while achieving the best possible task progression $b^*$. The theorem is thus proved. 

Otherwise, when $\Vap{k}{\pi}(s_0^{\otimes}) < \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes})$, we proceed to prove the theorem as follows. 
Let the difference in expected returns be denoted by 
$\sigma = \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes}) - \Vap{k}{\pi}(s_0^{\otimes}) > 0$.
Consider a worst-case scenario where policy $\pi$ reaches a state with the best possible task progression only at the end of an episode. 
Formally, there is only one path $\tau$ of length $|\tau|=H$ through the product MDP $\cM^\otimes$ under policy $\pi$ that ends with a transition $(\langle s, q\rangle, a, \langle s', q'\rangle)$ where $q \in B_i$, $q' \in B_j$, and $i > j = b^*$.
Based on the definition of the adaptive progression reward function, we have 
$\Rap{k+1} \left(\langle s, q\rangle, a,\langle s', q'\rangle\right) = \Rap{k} \left(\langle s, q\rangle, a,\langle s', q'\rangle\right) + \theta$. 
Thus, $\Vap{k+1}{\pi}(s_0^{\otimes}) = \Vap{k}{\pi}(s_0^{\otimes}) + p \cdot \gamma^{H-1} \cdot \theta$, 
where $p$ is the probability of path $\tau$ and $\gamma$ is the MDP's discount factor. 
Following the argument in \lemref{lem:progress}, it holds that 
$\Vap{k+1}{\pi^{*}_{k}}(s_0^{\otimes}) = \Vap{k}{\pi^{*}_{k}}(s_0^{\otimes})$. 
When the hyperparameter value $\theta$ is sufficiently large, more precisely,
$\theta > \frac{\sigma}{p \cdot \gamma^{H-1}}$,
we have $\Vap{k+1}{\pi}(s_0^{\otimes}) > \Vap{k+1}{\pi^{*}_{k}}(s_0^{\otimes})$. 
Let $\pi^{*}_{k+1}$ denote an optimal policy of the product MDP $\cM^\otimes$ that maximizes the expected return based on $\Rap{k+1}$.
If $\Vap{k+1}{\pi}(s_0^{\otimes}) = \Vap{k+1}{\pi^{*}_{k+1}}(s_0^{\otimes})$, then $\pi$ is the desired optimal policy $\pi^*$ and the theorem is thus proved. 
Otherwise, following \lemref{lem:progress}, it holds that $b(\pi^{*}_{k+1}) < b(\pi^{*}_k)$, meaning that the task progression for $\pi^{*}_{k+1}$ has improved compared to that of policy $\pi^{*}_k$. 
Since a task progression value is bounded by the state partition size of DFA $\cA_\varphi$, it takes only a finite number of updates before an optimal policy yielding $b^*$ is learned.


In conclusion, there exists an optimal policy $\pi^*$ for the product MDP $\cM^\otimes$ that achieves the best possible task progression $b^*$ while maximizing the expected return based on $\Rap{k}$ for some $k \in \Nset$, which is an adaptive progression reward function updated in a finite number of rounds with a sufficiently large hyperparameter value $\theta$.
\end{proof}