\begin{algorithm}[htbp]
\caption{Self-Taught Principle Learning (STaPLe)}\label{alg:STaPLe}
\begin{algorithmic}[1]
\Require 
  Dataset $\displaystyle \mathcal{D}=\{(x_i,y^G_i)\}_{i=1}^n$, pretrained LM parameters $\theta^{(0)}$, 
  number of EM iterations $T$,  
  number of principle samples $N$,  
  similarity threshold $\tau$,  
  similarity function $f(\cdot,\cdot)$; (optional) embedding model $EMB$, clustering algorithm $\mathcal{C}$, and cluster representative scheme $\mathcal{R}$
\For{$t=0,\dots,T-1$}
  \State \textbf{E-step:} initialize $\mathcal{D}'\leftarrow\varnothing$.
  \For{each $(x_i,y^G_i)\in\mathcal{D}$}
    \State Sample initial response \;$y_i^1\sim \pi_{\theta^{(t)}}(\cdot\mid x_i)$.
    \If{$f(y_i^1,y^G_i)<\tau$}  \Comment{needs refinement}
      \State Draw principles 
        $\{z_i^{(j)}\}_{j=1}^N \sim p_{\theta^{(t)}}\bigl(z\mid x_i,y_i^1,y^G_i\bigr)$.
      \For{$j=1,\dots,N$}
        \State Generate critique 
          $c_i^{(j)}\leftarrow\mathrm{Critique}\bigl(y_i^1,\,z_i^{(j)}\bigr)$.
        \State Sample refinement 
          $y_i^{2,(j)}\sim \pi_{\theta^{(t)}}\bigl(\cdot\mid x_i,y_i^1,z_i^{(j)},c_i^{(j)}\bigr)$.
      \EndFor
      \State $j^*\leftarrow\arg\!\max_j\,f\bigl(y_i^{2,(j)},y^G_i\bigr)$
      \State $(z_i,\,y_i^2)\leftarrow\bigl(z_i^{(j^*)},\,y_i^{2,(j^*)}\bigr)$
      \If{$f(y_i^2,y^G_i)>f(y_i^1,y^G_i)$}
        \State Add trajectory $(x_i,\,y_i^1,\,z_i,\,y_i^2)$ to $\mathcal{D}'$.
      \EndIf
    \EndIf
  \EndFor
  \State \textbf{(Optional):} Cluster the principles into a smaller set, producing the augmented dataset $\widetilde{\mathcal{D}}$:
    \Statex \hspace{\algorithmicindent}
           Clusters $C \gets \mathcal C\bigl(EMB(\{z_i\})\bigr)$
    \Statex 
    \hspace{\algorithmicindent} Assign cluster representatives (e.g.,\ medoids)
           $\widetilde Z \gets \mathcal R(C)$
    \Statex \hspace{\algorithmicindent} Augment dataset
           $\widetilde{\mathcal{D}} \gets \mathrm{Rep}(\mathcal D',\,\widetilde Z)$
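    \State \textbf{M-step:} (if the optional clustering step was applied, sum over $\widetilde{\mathcal{D}}$ in place of $\mathcal{D}'$)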
    \[
      \theta^{(t+1)}
        \;\leftarrow\;
      \arg\max_{\theta}
      \sum_{(x,y^1,z,y^2)\in\mathcal D'} 
        \log p_{\theta}\bigl(y^2,\,z \mid x,\,y^1\bigr).
    \]
\EndFor
\Ensure Final LM parameters $\theta^{(T)}$
\end{algorithmic}
\end{algorithm}
