\documentclass{article}
\usepackage{amsmath}


% if you need to pass options to natbib, use, e.g.:
     %\PassOptionsToPackage{numbers, compress}{natbib}
     \PassOptionsToPackage{round,semicolon}{natbib}
% before loading neurips_2025


% ready for submission
%\usepackage{neurips_2025}


% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
     \usepackage[preprint]{neurips_2025}


% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2025}


% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2025}


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
% Hyperlink appearance: color the link text itself (instead of drawing boxes
% around links), with citations in cyan and internal links in blue.
\hypersetup{
    colorlinks=true,  % canonical lowercase boolean value, as documented by hyperref
    citecolor=cyan,
    linkcolor=blue,
}
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
%\usepackage{xcolor}         % colors
\usepackage[dvipsnames]{xcolor}
\usepackage{comment}
\usepackage{amsmath}
\usepackage{amsthm}

\usepackage[toc,page,header]{appendix}
\usepackage{minitoc}
% Make both the part number and the part label expand to nothing.
% NOTE(review): presumably so that \part headings used with minitoc's
% per-part table of contents print without a "Part I" prefix -- confirm.
\renewcommand{\thepart}{}
\renewcommand{\partname}{}

\usepackage[pdftex]{graphicx}

% \footnotescript: custom font-size switch passing a size of 8.3 and a
% baseline skip of 9.5 to the kernel's \@setfontsize.
% \makeatletter/\makeatother are required because \@setfontsize is an
% internal (@-named) kernel macro.
\makeatletter
\newcommand*{\footnotescript}{\@setfontsize\footnotescript{8.3}{9.5}}
\makeatother

% Theorem environment printed as "Theorem"; it is declared with its own
% counter (no shared counter and no parent counter such as section).
\newtheorem{theorem}{Theorem}

\usepackage{algorithm}
\usepackage{algpseudocode}

\usepackage{amssymb}

\usepackage{footnote}
\makesavenoteenv{figure}

\usepackage[most]{tcolorbox}

\definecolor{ultralightgray}{gray}{0.97}

% promptbox: breakable box with a light-gray background and no visible frame
% (zero rule width, square corners) whose contents are set in monospace at
% \normalsize. The optional argument takes extra tcolorbox keys; they are
% appended after the defaults below.
\newtcolorbox{promptbox}[1][]{
  enhanced,
  breakable,
  colback=ultralightgray,
  colframe=ultralightgray,
  boxrule=0pt,
  arc=0pt,
  left=3pt, right=3pt,
  top=3pt, bottom=3pt,
  % \normalfont\ttfamily replaces the obsolete LaTeX 2.09 switch \tt, which
  % the standard classes define as exactly this expansion.
  fontupper=\normalfont\ttfamily\normalsize,
  #1
}

\colorlet{lighttan}{Tan!8!white}

% constitutionbox: breakable box with a light-tan background, zero rule width
% and square corners, whose contents are set in monospace at \normalsize.
% The optional argument takes extra tcolorbox keys; they are appended after
% the defaults below.
\newtcolorbox{constitutionbox}[1][]{
  enhanced,
  breakable,
  colback=lighttan,
  colframe=ultralightgray,
  boxrule=0pt,
  arc=0pt,
  left=3pt, right=3pt,
  top=3pt, bottom=3pt,
  % \normalfont\ttfamily replaces the obsolete LaTeX 2.09 switch \tt, which
  % the standard classes define as exactly this expansion.
  fontupper=\normalfont\ttfamily\normalsize,
  #1
}


% examplebox: breakable box with a light-gray background and a 1pt
% ForestGreen frame with square corners; contents use the normal document
% font at \normalsize. The optional argument takes extra tcolorbox keys,
% appended after the defaults below.
\newtcolorbox{examplebox}[1][]{
  % behaviour
  enhanced,
  breakable,
  % colours and frame
  colback=ultralightgray,
  colframe=ForestGreen,
  boxrule=1pt,
  arc=0pt,
  % inner padding
  left=3pt,
  right=3pt,
  top=3pt,
  bottom=3pt,
  % text font
  fontupper=\normalfont\normalsize,
  % caller-supplied overrides
  #1
}


\title{Latent Principle Discovery for \\ Language Model Self-Improvement}


% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.


\author{%
  Keshav Ramji\thanks{Correspondence to \texttt{keshav.ramji@ibm.com}.}\hspace{1mm}, Tahira Naseem, Ramón Fernandez Astudillo \\
  IBM Research AI\\
}


\begin{document}
\doparttoc
\faketableofcontents

\maketitle


\begin{abstract}
When language model (LM) users aim to improve the quality of the model's generations, it is crucial to specify concrete behavioral attributes that the model should strive to reflect. However, curating such principles across many domains, even non-exhaustively, requires a labor-intensive annotation process. To automate this process, we propose eliciting the latent attributes that guide model reasoning towards human-preferred responses by explicitly modeling them in a self-correction setting. Our approach mines new principles from the LM itself and compresses the discovered elements to an interpretable set via clustering. Specifically, we employ an approximation of posterior-regularized Monte Carlo Expectation-Maximization to both identify a condensed set of the most effective latent principles and teach the LM to strategically invoke them in order to intrinsically refine its responses. We demonstrate that bootstrapping our algorithm over multiple iterations enables smaller language models (7--8B parameters) to self-improve, achieving +8--10\% in AlpacaEval win-rate, an average of +0.3 on MT-Bench, and +19--23\% in principle-following win-rate on IFEval. We also show that clustering the principles yields interpretable and diverse model-generated constitutions while retaining model performance. The gains our method achieves highlight the potential of automated, principle-driven post-training recipes toward continual self-improvement.
\end{abstract}

\input{Sections/introduction}
\input{Sections/related_work}
\input{Sections/algorithm}
\input{Sections/results}
\input{Sections/discussion}
\input{Sections/conclusion}
\input{Sections/acknowledgements}

\bibliographystyle{abbrvnat}
%\bibliography{refs}
\begin{thebibliography}{58}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Abdin et~al.(2024)Abdin, Aneja, Behl, Bubeck, Eldan, Gunasekar, Harrison, Hewett, Javaheripi, Kauffmann, Lee, Lee, Li, Liu, Mendes, Nguyen, Price, de~Rosa, Saarikivi, Salim, Shah, Wang, Ward, Wu, Yu, Zhang, and Zhang]{phi4}
M.~Abdin, J.~Aneja, H.~Behl, S.~Bubeck, R.~Eldan, S.~Gunasekar, M.~Harrison, R.~J. Hewett, M.~Javaheripi, P.~Kauffmann, J.~R. Lee, Y.~T. Lee, Y.~Li, W.~Liu, C.~C.~T. Mendes, A.~Nguyen, E.~Price, G.~de~Rosa, O.~Saarikivi, A.~Salim, S.~Shah, X.~Wang, R.~Ward, Y.~Wu, D.~Yu, C.~Zhang, and Y.~Zhang.
\newblock Phi-4 technical report, 2024.
\newblock URL \url{https://arxiv.org/abs/2412.08905}.

\bibitem[Bai et~al.(2022{\natexlab{a}})Bai, Jones, Ndousse, Askell, Chen, DasSarma, Drain, Fort, Ganguli, Henighan, Joseph, Kadavath, Kernion, Conerly, El-Showk, Elhage, Hatfield-Dodds, Hernandez, Hume, Johnston, Kravec, Lovitt, Nanda, Olsson, Amodei, Brown, Clark, McCandlish, Olah, Mann, and Kaplan]{bai2022traininghelpfulharmlessassistant}
Y.~Bai, A.~Jones, K.~Ndousse, A.~Askell, A.~Chen, N.~DasSarma, D.~Drain, S.~Fort, D.~Ganguli, T.~Henighan, N.~Joseph, S.~Kadavath, J.~Kernion, T.~Conerly, S.~El-Showk, N.~Elhage, Z.~Hatfield-Dodds, D.~Hernandez, T.~Hume, S.~Johnston, S.~Kravec, L.~Lovitt, N.~Nanda, C.~Olsson, D.~Amodei, T.~Brown, J.~Clark, S.~McCandlish, C.~Olah, B.~Mann, and J.~Kaplan.
\newblock Training a helpful and harmless assistant with reinforcement learning from human feedback, 2022{\natexlab{a}}.
\newblock URL \url{https://arxiv.org/abs/2204.05862}.

\bibitem[Bai et~al.(2022{\natexlab{b}})Bai, Kadavath, Kundu, Askell, Kernion, Jones, Chen, Goldie, Mirhoseini, McKinnon, Chen, Olsson, Olah, Hernandez, Drain, Ganguli, Li, Tran-Johnson, Perez, Kerr, Mueller, Ladish, Landau, Ndousse, Lukosuite, Lovitt, Sellitto, Elhage, Schiefer, Mercado, DasSarma, Lasenby, Larson, Ringer, Johnston, Kravec, Showk, Fort, Lanham, Telleen-Lawton, Conerly, Henighan, Hume, Bowman, Hatfield-Dodds, Mann, Amodei, Joseph, McCandlish, Brown, and Kaplan]{bai2022constitutionalaiharmlessnessai}
Y.~Bai, S.~Kadavath, S.~Kundu, A.~Askell, J.~Kernion, A.~Jones, A.~Chen, A.~Goldie, A.~Mirhoseini, C.~McKinnon, C.~Chen, C.~Olsson, C.~Olah, D.~Hernandez, D.~Drain, D.~Ganguli, D.~Li, E.~Tran-Johnson, E.~Perez, J.~Kerr, J.~Mueller, J.~Ladish, J.~Landau, K.~Ndousse, K.~Lukosuite, L.~Lovitt, M.~Sellitto, N.~Elhage, N.~Schiefer, N.~Mercado, N.~DasSarma, R.~Lasenby, R.~Larson, S.~Ringer, S.~Johnston, S.~Kravec, S.~E. Showk, S.~Fort, T.~Lanham, T.~Telleen-Lawton, T.~Conerly, T.~Henighan, T.~Hume, S.~R. Bowman, Z.~Hatfield-Dodds, B.~Mann, D.~Amodei, N.~Joseph, S.~McCandlish, T.~Brown, and J.~Kaplan.
\newblock Constitutional {AI}: Harmlessness from {AI} feedback, 2022{\natexlab{b}}.
\newblock URL \url{https://arxiv.org/abs/2212.08073}.

\bibitem[Chen et~al.(2024{\natexlab{a}})Chen, Feng, Liu, Yao, Prabhakar, Heinecke, Ho, Mui, Savarese, Xiong, and Wang]{chen2024languagemodelshiddenreasoners}
H.~Chen, Y.~Feng, Z.~Liu, W.~Yao, A.~Prabhakar, S.~Heinecke, R.~Ho, P.~Mui, S.~Savarese, C.~Xiong, and H.~Wang.
\newblock Language models are hidden reasoners: Unlocking latent reasoning capabilities via self-rewarding, 2024{\natexlab{a}}.
\newblock URL \url{https://arxiv.org/abs/2411.04282}.

\bibitem[Chen et~al.(2024{\natexlab{b}})Chen, Wen, Nag, Luo, Yin, Li, Li, and Wang]{chen-etal-2024-iteralign}
X.~Chen, H.~Wen, S.~Nag, C.~Luo, Q.~Yin, R.~Li, Z.~Li, and W.~Wang.
\newblock {I}ter{A}lign: Iterative constitutional alignment of large language models.
\newblock In K.~Duh, H.~Gomez, and S.~Bethard, editors, \emph{Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)}, pages 1423--1433, Mexico City, Mexico, June 2024{\natexlab{b}}. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2024.naacl-long.78}.
\newblock URL \url{https://aclanthology.org/2024.naacl-long.78/}.

\bibitem[Chen et~al.(2024{\natexlab{c}})Chen, Deng, Yuan, Ji, and Gu]{SPIN}
Z.~Chen, Y.~Deng, H.~Yuan, K.~Ji, and Q.~Gu.
\newblock Self-play fine-tuning converts weak language models to strong language models.
\newblock In \emph{Proceedings of the 41st International Conference on Machine Learning}, ICML'24. JMLR.org, 2024{\natexlab{c}}.

\bibitem[Cui et~al.(2024)Cui, Yuan, Ding, Yao, He, Zhu, Ni, Xie, Xie, Lin, Liu, and Sun]{cui2024ultrafeedbackboostinglanguagemodels}
G.~Cui, L.~Yuan, N.~Ding, G.~Yao, B.~He, W.~Zhu, Y.~Ni, G.~Xie, R.~Xie, Y.~Lin, Z.~Liu, and M.~Sun.
\newblock {UltraFeedback}: Boosting language models with scaled {AI} feedback, 2024.
\newblock URL \url{https://arxiv.org/abs/2310.01377}.

\bibitem[Dayan(1990)]{dayan1990reinforcement}
P.~Dayan.
\newblock Reinforcement comparison.
\newblock In D.~S. Touretzky, J.~L. Elman, T.~J. Sejnowski, and G.~E. Hinton, editors, \emph{Proceedings of the 1990 Connectionist Models Summer School}, pages 45--51, San Mateo, CA, 1990. Morgan Kaufmann.

\bibitem[DeepSeek-AI(2025)]{deepseekai2025deepseekr1incentivizingreasoningcapability}
DeepSeek-AI.
\newblock {DeepSeek-R1}: Incentivizing reasoning capability in {LLMs} via reinforcement learning, 2025.
\newblock URL \url{https://arxiv.org/abs/2501.12948}.

\bibitem[DeepSeek-AI et~al.(2025)DeepSeek-AI, Liu, Feng, Xue, Wang, Wu, Lu, Zhao, Deng, Zhang, Ruan, Dai, Guo, Yang, Chen, Ji, Li, Lin, Dai, Luo, Hao, Chen, Li, Zhang, Bao, Xu, Wang, Zhang, Ding, Xin, Gao, Li, Qu, Cai, Liang, Guo, Ni, Li, Wang, Chen, Chen, Yuan, Qiu, Li, Song, Dong, Hu, Gao, Guan, Huang, Yu, Wang, Zhang, Xu, Xia, Zhao, Wang, Zhang, Li, Wang, Zhang, Zhang, Tang, Li, Tian, Huang, Wang, Zhang, Wang, Zhu, Chen, Du, Chen, Jin, Ge, Zhang, Pan, Wang, Xu, Zhang, Chen, Li, Lu, Zhou, Chen, Wu, Ye, Ye, Ma, Wang, Zhou, Yu, Zhou, Pan, Wang, Yun, Pei, Sun, Xiao, Zeng, Zhao, An, Liu, Liang, Gao, Yu, Zhang, Li, Jin, Wang, Bi, Liu, Wang, Shen, Chen, Zhang, Chen, Nie, Sun, Wang, Cheng, Liu, Xie, Liu, Yu, Song, Shan, Zhou, Yang, Li, Su, Lin, Li, Wang, Wei, Zhu, Zhang, Xu, Xu, Huang, Li, Zhao, Sun, Li, Wang, Yu, Zheng, Zhang, Shi, Xiong, He, Tang, Piao, Wang, Tan, Ma, Liu, Guo, Wu, Ou, Zhu, Wang, Gong, Zou, He, Zha, Xiong, Ma, Yan, Luo, You, Liu, Zhou, Wu, Ren, Ren, Sha, Fu, Xu, Huang, Zhang, Xie, Zhang, Hao,
  Gou, Ma, Yan, Shao, Xu, Wu, Zhang, Li, Gu, Zhu, Liu, Li, Xie, Song, Gao, and Pan]{deepseekai2025deepseekv3technicalreport}
DeepSeek-AI, A.~Liu, B.~Feng, B.~Xue, B.~Wang, B.~Wu, C.~Lu, C.~Zhao, C.~Deng, C.~Zhang, C.~Ruan, D.~Dai, D.~Guo, D.~Yang, D.~Chen, D.~Ji, E.~Li, F.~Lin, F.~Dai, F.~Luo, G.~Hao, G.~Chen, G.~Li, H.~Zhang, H.~Bao, H.~Xu, H.~Wang, H.~Zhang, H.~Ding, H.~Xin, H.~Gao, H.~Li, H.~Qu, J.~L. Cai, J.~Liang, J.~Guo, J.~Ni, J.~Li, J.~Wang, J.~Chen, J.~Chen, J.~Yuan, J.~Qiu, J.~Li, J.~Song, K.~Dong, K.~Hu, K.~Gao, K.~Guan, K.~Huang, K.~Yu, L.~Wang, L.~Zhang, L.~Xu, L.~Xia, L.~Zhao, L.~Wang, L.~Zhang, M.~Li, M.~Wang, M.~Zhang, M.~Zhang, M.~Tang, M.~Li, N.~Tian, P.~Huang, P.~Wang, P.~Zhang, Q.~Wang, Q.~Zhu, Q.~Chen, Q.~Du, R.~J. Chen, R.~L. Jin, R.~Ge, R.~Zhang, R.~Pan, R.~Wang, R.~Xu, R.~Zhang, R.~Chen, S.~S. Li, S.~Lu, S.~Zhou, S.~Chen, S.~Wu, S.~Ye, S.~Ye, S.~Ma, S.~Wang, S.~Zhou, S.~Yu, S.~Zhou, S.~Pan, T.~Wang, T.~Yun, T.~Pei, T.~Sun, W.~L. Xiao, W.~Zeng, W.~Zhao, W.~An, W.~Liu, W.~Liang, W.~Gao, W.~Yu, W.~Zhang, X.~Q. Li, X.~Jin, X.~Wang, X.~Bi, X.~Liu, X.~Wang, X.~Shen, X.~Chen, X.~Zhang, X.~Chen, X.~Nie, X.~Sun,
  X.~Wang, X.~Cheng, X.~Liu, X.~Xie, X.~Liu, X.~Yu, X.~Song, X.~Shan, X.~Zhou, X.~Yang, X.~Li, X.~Su, X.~Lin, Y.~K. Li, Y.~Q. Wang, Y.~X. Wei, Y.~X. Zhu, Y.~Zhang, Y.~Xu, Y.~Xu, Y.~Huang, Y.~Li, Y.~Zhao, Y.~Sun, Y.~Li, Y.~Wang, Y.~Yu, Y.~Zheng, Y.~Zhang, Y.~Shi, Y.~Xiong, Y.~He, Y.~Tang, Y.~Piao, Y.~Wang, Y.~Tan, Y.~Ma, Y.~Liu, Y.~Guo, Y.~Wu, Y.~Ou, Y.~Zhu, Y.~Wang, Y.~Gong, Y.~Zou, Y.~He, Y.~Zha, Y.~Xiong, Y.~Ma, Y.~Yan, Y.~Luo, Y.~You, Y.~Liu, Y.~Zhou, Z.~F. Wu, Z.~Z. Ren, Z.~Ren, Z.~Sha, Z.~Fu, Z.~Xu, Z.~Huang, Z.~Zhang, Z.~Xie, Z.~Zhang, Z.~Hao, Z.~Gou, Z.~Ma, Z.~Yan, Z.~Shao, Z.~Xu, Z.~Wu, Z.~Zhang, Z.~Li, Z.~Gu, Z.~Zhu, Z.~Liu, Z.~Li, Z.~Xie, Z.~Song, Z.~Gao, and Z.~Pan.
\newblock {DeepSeek-V3} technical report, 2025.
\newblock URL \url{https://arxiv.org/abs/2412.19437}.

\bibitem[D'Oosterlinck et~al.(2024)D'Oosterlinck, Xu, Develder, Demeester, Singh, Potts, Kiela, and Mehri]{doosterlinck2024anchoredpreferenceoptimizationcontrastive}
K.~D'Oosterlinck, W.~Xu, C.~Develder, T.~Demeester, A.~Singh, C.~Potts, D.~Kiela, and S.~Mehri.
\newblock Anchored preference optimization and contrastive revisions: Addressing underspecification in alignment, 2024.
\newblock URL \url{https://arxiv.org/abs/2408.06266}.

\bibitem[Dubois et~al.(2024)Dubois, Galambosi, Liang, and Hashimoto]{dubois2024length}
Y.~Dubois, B.~Galambosi, P.~Liang, and T.~B. Hashimoto.
\newblock Length-controlled {AlpacaEval}: A simple way to debias automatic evaluators.
\newblock \emph{arXiv preprint arXiv:2404.04475}, 2024.

\bibitem[Findeis et~al.(2025)Findeis, Kaufmann, H{\"u}llermeier, Albanie, and Mullins]{findeis2025inverse}
A.~Findeis, T.~Kaufmann, E.~H{\"u}llermeier, S.~Albanie, and R.~D. Mullins.
\newblock Inverse constitutional {AI}: Compressing preferences into principles.
\newblock In \emph{The Thirteenth International Conference on Learning Representations}, 2025.
\newblock URL \url{https://openreview.net/forum?id=9FRwkPw3Cn}.

\bibitem[Fr\"{a}nken et~al.(2024)Fr\"{a}nken, Zelikman, Rafailov, Gandhi, Gerstenberg, and Goodman]{franken}
J.-P. Fr\"{a}nken, E.~Zelikman, R.~Rafailov, K.~Gandhi, T.~Gerstenberg, and N.~D. Goodman.
\newblock Self-supervised alignment with mutual information: Learning to follow principles without preference labels.
\newblock In A.~Globerson, L.~Mackey, D.~Belgrave, A.~Fan, U.~Paquet, J.~Tomczak, and C.~Zhang, editors, \emph{Advances in Neural Information Processing Systems}, volume~37, pages 61328--61371. Curran Associates, Inc., 2024.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2024/file/70d638f3177d2f0bbdd9f400b43f0683-Paper-Conference.pdf}.

\bibitem[Ganchev et~al.(2010)Ganchev, Gra{\c{c}}a, Gillenwater, and Taskar]{pr-latent-var-models}
K.~Ganchev, J.~Gra{\c{c}}a, J.~Gillenwater, and B.~Taskar.
\newblock Posterior regularization for structured latent variable models.
\newblock \emph{Journal of Machine Learning Research}, 11\penalty0 (67):\penalty0 2001--2049, 2010.
\newblock URL \url{http://jmlr.org/papers/v11/ganchev10a.html}.

\bibitem[Granite~Team(2024)]{granite}
{Granite Team, IBM}.
\newblock Granite 3.0 language models.
\newblock \url{https://www.rivista.ai/wp-content/uploads/2024/10/paper-1.pdf}, Oct. 2024.

\bibitem[{Granite Team and IBM}(2024)]{ibm-granite-3.1-8b-instruct}
{Granite Team and IBM}.
\newblock Granite-3.1-8b-instruct.
\newblock \url{https://huggingface.co/ibm-granite/granite-3.1-8b-instruct}, Dec. 2024.
\newblock Release Date: December 18, 2024.

\bibitem[Grattafiori et~al.(2024)]{grattafiori2024llama3herdmodels}
A.~Grattafiori et~al.
\newblock The {Llama} 3 herd of models, 2024.
\newblock URL \url{https://arxiv.org/abs/2407.21783}.

\bibitem[Guan et~al.(2025)Guan, Joglekar, Wallace, Jain, Barak, Helyar, Dias, Vallone, Ren, Wei, Chung, Toyer, Heidecke, Beutel, and Glaese]{guan2025deliberativealignmentreasoningenables}
M.~Y. Guan, M.~Joglekar, E.~Wallace, S.~Jain, B.~Barak, A.~Helyar, R.~Dias, A.~Vallone, H.~Ren, J.~Wei, H.~W. Chung, S.~Toyer, J.~Heidecke, A.~Beutel, and A.~Glaese.
\newblock Deliberative alignment: Reasoning enables safer language models, 2025.
\newblock URL \url{https://arxiv.org/abs/2412.16339}.

\bibitem[Head et~al.(2020)Head, Kumar, Nahrstaedt, Louppe, and Shcherbatyi]{head2020scikitoptimize}
T.~Head, M.~Kumar, H.~Nahrstaedt, G.~Louppe, and I.~Shcherbatyi.
\newblock {scikit-optimize}: Sequential model-based optimization in {Python}, Sept.~4 2020.
\newblock URL \url{https://doi.org/10.5281/zenodo.4014775}.

\bibitem[Huang et~al.(2025)Huang, Block, Foster, Rohatgi, Zhang, Simchowitz, Ash, and Krishnamurthy]{huang2025selfimprovement}
A.~Huang, A.~Block, D.~J. Foster, D.~Rohatgi, C.~Zhang, M.~Simchowitz, J.~T. Ash, and A.~Krishnamurthy.
\newblock Self-improvement in language models: The sharpening mechanism.
\newblock In \emph{The Thirteenth International Conference on Learning Representations}, 2025.
\newblock URL \url{https://openreview.net/forum?id=WJaUkwci9o}.

\bibitem[Huang et~al.(2023)Huang, Gu, Hou, Wu, Wang, Yu, and Han]{huang-etal-2023-large}
J.~Huang, S.~Gu, L.~Hou, Y.~Wu, X.~Wang, H.~Yu, and J.~Han.
\newblock Large language models can self-improve.
\newblock In H.~Bouamor, J.~Pino, and K.~Bali, editors, \emph{Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, pages 1051--1068, Singapore, Dec. 2023. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2023.emnlp-main.67}.
\newblock URL \url{https://aclanthology.org/2023.emnlp-main.67/}.

\bibitem[Katsis et~al.(2025)Katsis, Rosenthal, Fadnis, Gunasekara, Lee, Popa, Shah, Zhu, Contractor, and Danilevsky]{katsis2025mtragmultiturnconversationalbenchmark}
Y.~Katsis, S.~Rosenthal, K.~Fadnis, C.~Gunasekara, Y.-S. Lee, L.~Popa, V.~Shah, H.~Zhu, D.~Contractor, and M.~Danilevsky.
\newblock {MTRAG}: A multi-turn conversational benchmark for evaluating retrieval-augmented generation systems, 2025.
\newblock URL \url{https://arxiv.org/abs/2501.03468}.

\bibitem[Kim et~al.(2024)Kim, Suk, Longpre, Lin, Shin, Welleck, Neubig, Lee, Lee, and Seo]{prometheus}
S.~Kim, J.~Suk, S.~Longpre, B.~Y. Lin, J.~Shin, S.~Welleck, G.~Neubig, M.~Lee, K.~Lee, and M.~Seo.
\newblock Prometheus 2: An open source language model specialized in evaluating other language models, 2024.
\newblock URL \url{https://arxiv.org/abs/2405.01535}.

\bibitem[Kojima et~al.(2022)Kojima, Gu, Reid, Matsuo, and Iwasawa]{kojima}
T.~Kojima, S.~S. Gu, M.~Reid, Y.~Matsuo, and Y.~Iwasawa.
\newblock Large language models are zero-shot reasoners.
\newblock In S.~Koyejo, S.~Mohamed, A.~Agarwal, D.~Belgrave, K.~Cho, and A.~Oh, editors, \emph{Advances in Neural Information Processing Systems}, volume~35, pages 22199--22213. Curran Associates, Inc., 2022.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2022/file/8bb0d291acd4acf06ef112099c16f326-Paper-Conference.pdf}.

\bibitem[Kumar et~al.(2025)Kumar, Zhuang, Agarwal, Su, Co-Reyes, Singh, Baumli, Iqbal, Bishop, Roelofs, Zhang, McKinney, Shrivastava, Paduraru, Tucker, Precup, Behbahani, and Faust]{kumar2025training}
A.~Kumar, V.~Zhuang, R.~Agarwal, Y.~Su, J.~D. Co-Reyes, A.~Singh, K.~Baumli, S.~Iqbal, C.~Bishop, R.~Roelofs, L.~M. Zhang, K.~McKinney, D.~Shrivastava, C.~Paduraru, G.~Tucker, D.~Precup, F.~Behbahani, and A.~Faust.
\newblock Training language models to self-correct via reinforcement learning.
\newblock In \emph{The Thirteenth International Conference on Learning Representations}, 2025.
\newblock URL \url{https://openreview.net/forum?id=CjwERcAU7w}.

\bibitem[Kwon et~al.(2023)Kwon, Li, Zhuang, Sheng, Zheng, Yu, Gonzalez, Zhang, and Stoica]{kwon2023efficient}
W.~Kwon, Z.~Li, S.~Zhuang, Y.~Sheng, L.~Zheng, C.~H. Yu, J.~E. Gonzalez, H.~Zhang, and I.~Stoica.
\newblock Efficient memory management for large language model serving with {PagedAttention}.
\newblock In \emph{Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, 2023.

\bibitem[Li et~al.(2023)Li, Zhang, Dubois, Taori, Gulrajani, Guestrin, Liang, and Hashimoto]{alpaca_eval}
X.~Li, T.~Zhang, Y.~Dubois, R.~Taori, I.~Gulrajani, C.~Guestrin, P.~Liang, and T.~B. Hashimoto.
\newblock {AlpacaEval}: An automatic evaluator of instruction-following models.
\newblock \url{https://github.com/tatsu-lab/alpaca_eval}, 5 2023.

\bibitem[Lin(2004)]{lin-2004-rouge}
C.-Y. Lin.
\newblock {ROUGE}: A package for automatic evaluation of summaries.
\newblock In \emph{Text Summarization Branches Out}, pages 74--81, Barcelona, Spain, July 2004. Association for Computational Linguistics.
\newblock URL \url{https://aclanthology.org/W04-1013/}.

\bibitem[Liu et~al.(2025)Liu, Wang, Xu, Ma, Ruan, Li, Liu, and Wu]{liu2025inferencetimescalinggeneralistreward}
Z.~Liu, P.~Wang, R.~Xu, S.~Ma, C.~Ruan, P.~Li, Y.~Liu, and Y.~Wu.
\newblock Inference-time scaling for generalist reward modeling, 2025.
\newblock URL \url{https://arxiv.org/abs/2504.02495}.

\bibitem[Loshchilov and Hutter(2019)]{loshchilov2018decoupled}
I.~Loshchilov and F.~Hutter.
\newblock Decoupled weight decay regularization.
\newblock In \emph{International Conference on Learning Representations}, 2019.
\newblock URL \url{https://openreview.net/forum?id=Bkg6RiCqY7}.

\bibitem[Madaan et~al.(2023)Madaan, Tandon, Gupta, Hallinan, Gao, Wiegreffe, Alon, Dziri, Prabhumoye, Yang, Gupta, Majumder, Hermann, Welleck, Yazdanbakhsh, and Clark]{madaan}
A.~Madaan, N.~Tandon, P.~Gupta, S.~Hallinan, L.~Gao, S.~Wiegreffe, U.~Alon, N.~Dziri, S.~Prabhumoye, Y.~Yang, S.~Gupta, B.~P. Majumder, K.~Hermann, S.~Welleck, A.~Yazdanbakhsh, and P.~Clark.
\newblock Self-refine: Iterative refinement with self-feedback.
\newblock In A.~Oh, T.~Naumann, A.~Globerson, K.~Saenko, M.~Hardt, and S.~Levine, editors, \emph{Advances in Neural Information Processing Systems}, volume~36, pages 46534--46594. Curran Associates, Inc., 2023.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2023/file/91edff07232fb1b55a505a9e9f6c0ff3-Paper-Conference.pdf}.

\bibitem[{Meta}(2024)]{meta-llama-3.1-8b-instruct}
{Meta}.
\newblock Llama-3.1-8b-instruct.
\newblock \url{https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct}, July 2024.
\newblock Release Date: July 23, 2024.

\bibitem[OpenAI(2024)]{openai2024gpt4ocard}
OpenAI.
\newblock {GPT-4o} system card, 2024.
\newblock URL \url{https://arxiv.org/abs/2410.21276}.

\bibitem[{OpenAI}(2025)]{openai2025o3_o4mini_systemcard}
{OpenAI}.
\newblock {OpenAI o3 and o4-mini System Card}.
\newblock System card, OpenAI, Apr. 2025.
\newblock URL \url{https://cdn.openai.com/pdf/2221c875-02dc-4789-800b-e7758f3722c1/o3-and-o4-mini-system-card.pdf}.

\bibitem[Patel et~al.(2024)Patel, Hofmarcher, Leoveanu-Condrei, Dinu, Callison-Burch, and Hochreiter]{patel2024largelanguagemodelsselfimprove}
A.~Patel, M.~Hofmarcher, C.~Leoveanu-Condrei, M.-C. Dinu, C.~Callison-Burch, and S.~Hochreiter.
\newblock Large language models can self-improve at web agent tasks, 2024.
\newblock URL \url{https://arxiv.org/abs/2405.20309}.

\bibitem[Petridis et~al.(2024)Petridis, Wedin, Yuan, Wexler, and Thain]{petridis-etal-2024-constitutionalexperts}
S.~Petridis, B.~Wedin, A.~Yuan, J.~Wexler, and N.~Thain.
\newblock {C}onstitutional{E}xperts: Training a mixture of principle-based prompts.
\newblock In L.-W. Ku, A.~Martins, and V.~Srikumar, editors, \emph{Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, pages 574--582, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2024.acl-short.52}.
\newblock URL \url{https://aclanthology.org/2024.acl-short.52/}.

\bibitem[Phan et~al.(2023)Phan, Hoffman, Dohan, Douglas, Le, Parisi, Sountsov, Sutton, Vikram, and Saurous]{TRICE}
D.~Phan, M.~D. Hoffman, D.~Dohan, S.~Douglas, T.~A. Le, A.~Parisi, P.~Sountsov, C.~Sutton, S.~Vikram, and R.~A. Saurous.
\newblock Training chain-of-thought via latent-variable inference.
\newblock In \emph{Proceedings of the 37th International Conference on Neural Information Processing Systems}, NIPS '23, Red Hook, NY, USA, 2023. Curran Associates Inc.

\bibitem[Qu et~al.(2024)Qu, Zhang, Garg, and Kumar]{RISE}
Y.~Qu, T.~Zhang, N.~Garg, and A.~Kumar.
\newblock Recursive introspection: Teaching language model agents how to self-improve.
\newblock In A.~Globerson, L.~Mackey, D.~Belgrave, A.~Fan, U.~Paquet, J.~Tomczak, and C.~Zhang, editors, \emph{Advances in Neural Information Processing Systems}, volume~37, pages 55249--55285. Curran Associates, Inc., 2024.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2024/file/639d992f819c2b40387d4d5170b8ffd7-Paper-Conference.pdf}.

\bibitem[Qwen(2025)]{qwen2025qwen25technicalreport}
Qwen.
\newblock Qwen2.5 technical report, 2025.
\newblock URL \url{https://arxiv.org/abs/2412.15115}.

\bibitem[Ramji et~al.(2024)Ramji, Lee, Astudillo, Sultan, Naseem, Munawar, Florian, and Roukos]{ramji2024selfrefinementlanguagemodelsexternal}
K.~Ramji, Y.-S. Lee, R.~F. Astudillo, M.~A. Sultan, T.~Naseem, A.~Munawar, R.~Florian, and S.~Roukos.
\newblock Self-refinement of language models from external proxy metrics feedback, 2024.
\newblock URL \url{https://arxiv.org/abs/2403.00827}.

\bibitem[Ruan et~al.(2025)Ruan, Band, Maddison, and Hashimoto]{ruan2025reasoninglearnlatentthoughts}
Y.~Ruan, N.~Band, C.~J. Maddison, and T.~Hashimoto.
\newblock Reasoning to learn from latent thoughts, 2025.
\newblock URL \url{https://arxiv.org/abs/2503.18866}.

\bibitem[{Sentence Transformers}(2021)]{sentence-transformers-all-MiniLM-L6-v2}
{Sentence Transformers}.
\newblock {all-MiniLM-L6-v2}.
\newblock \url{https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2}, 2021.

\bibitem[Stiennon et~al.(2020)Stiennon, Ouyang, Wu, Ziegler, Lowe, Voss, Radford, Amodei, and Christiano]{tldr}
N.~Stiennon, L.~Ouyang, J.~Wu, D.~Ziegler, R.~Lowe, C.~Voss, A.~Radford, D.~Amodei, and P.~F. Christiano.
\newblock Learning to summarize with human feedback.
\newblock In H.~Larochelle, M.~Ranzato, R.~Hadsell, M.~Balcan, and H.~Lin, editors, \emph{Advances in Neural Information Processing Systems}, volume~33, pages 3008--3021. Curran Associates, Inc., 2020.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2020/file/1f89885d556929e98d3ef9b86448f951-Paper.pdf}.

\bibitem[Sun et~al.(2023)Sun, Shen, Zhou, Zhang, Chen, Cox, Yang, and Gan]{dromedary}
Z.~Sun, Y.~Shen, Q.~Zhou, H.~Zhang, Z.~Chen, D.~Cox, Y.~Yang, and C.~Gan.
\newblock Principle-driven self-alignment of language models from scratch with minimal human supervision.
\newblock In A.~Oh, T.~Naumann, A.~Globerson, K.~Saenko, M.~Hardt, and S.~Levine, editors, \emph{Advances in Neural Information Processing Systems}, volume~36, pages 2511--2565. Curran Associates, Inc., 2023.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2023/file/0764db1151b936aca59249e2c1386101-Paper-Conference.pdf}.

\bibitem[Sun et~al.(2024)Sun, Shen, Zhang, Zhou, Chen, Cox, Yang, and Gan]{sun2024salmon}
Z.~Sun, Y.~Shen, H.~Zhang, Q.~Zhou, Z.~Chen, D.~D. Cox, Y.~Yang, and C.~Gan.
\newblock {SALMON}: Self-alignment with instructable reward models.
\newblock In \emph{The Twelfth International Conference on Learning Representations}, 2024.
\newblock URL \url{https://openreview.net/forum?id=xJbsmB8UMx}.

\bibitem[Sutton(1984)]{sutton1984temporal}
R.~S. Sutton.
\newblock \emph{Temporal Credit Assignment in Reinforcement Learning}.
\newblock Ph.D. dissertation, University of Massachusetts, Amherst, Amherst, MA, 1984.

\bibitem[Sutton(2019)]{sutton2019bitter}
R.~S. Sutton.
\newblock The bitter lesson.
\newblock \url{https://www.cs.utexas.edu/~eunsol/courses/data/bitter_lesson.pdf}, 2019.

\bibitem[von Neumann(1951)]{vonNeumann1951RandomDigits}
J.~von Neumann.
\newblock Various techniques used in connection with random digits.
\newblock 1951.
\newblock URL \url{https://mcnp.lanl.gov/pdf_files/InBook_Computing_1961_Neumann_JohnVonNeumannCollectedWorks_VariousTechniquesUsedinConnectionwithRandomDigits.pdf}.
\newblock J.\ Res.\ Natl.\ Bur.\ Stand.\ Appl.\ Math.\ Series, vol.\ 3, pp.\ 36--38 (1955).

\bibitem[Wei and Tanner(1990)]{mc-em}
G.~C.~G. Wei and M.~A. Tanner.
\newblock A {Monte Carlo} implementation of the {EM} algorithm and the poor man's data augmentation algorithms.
\newblock \emph{Journal of the American Statistical Association}, 85\penalty0 (411):\penalty0 699--704, 1990.
\newblock ISSN 01621459, 1537274X.
\newblock URL \url{http://www.jstor.org/stable/2290005}.

\bibitem[Wei et~al.(2023)Wei, Wang, Schuurmans, Bosma, Ichter, Xia, Chi, Le, and Zhou]{wei2023chainofthoughtpromptingelicitsreasoning}
J.~Wei, X.~Wang, D.~Schuurmans, M.~Bosma, B.~Ichter, F.~Xia, E.~Chi, Q.~Le, and D.~Zhou.
\newblock Chain-of-thought prompting elicits reasoning in large language models, 2023.
\newblock URL \url{https://arxiv.org/abs/2201.11903}.

\bibitem[Yang et~al.(2018)Yang, Qi, Zhang, Bengio, Cohen, Salakhutdinov, and Manning]{yang-etal-2018-hotpotqa}
Z.~Yang, P.~Qi, S.~Zhang, Y.~Bengio, W.~Cohen, R.~Salakhutdinov, and C.~D. Manning.
\newblock {H}otpot{QA}: A dataset for diverse, explainable multi-hop question answering.
\newblock In E.~Riloff, D.~Chiang, J.~Hockenmaier, and J.~Tsujii, editors, \emph{Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, pages 2369--2380, Brussels, Belgium, Oct.-Nov. 2018. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/D18-1259}.
\newblock URL \url{https://aclanthology.org/D18-1259/}.

\bibitem[Zelikman et~al.(2022)Zelikman, Wu, Mu, and Goodman]{zelikman}
E.~Zelikman, Y.~Wu, J.~Mu, and N.~Goodman.
\newblock {STaR}: Bootstrapping reasoning with reasoning.
\newblock In S.~Koyejo, S.~Mohamed, A.~Agarwal, D.~Belgrave, K.~Cho, and A.~Oh, editors, \emph{Advances in Neural Information Processing Systems}, volume~35, pages 15476--15488. Curran Associates, Inc., 2022.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2022/file/639a9a172c044fbb64175b5fad42e9a5-Paper-Conference.pdf}.

\bibitem[Zhan et~al.(2025)Zhan, Azmat, Horesh, Li, and Yurochkin]{zhan2025sprialigninglargelanguage}
H.~Zhan, M.~Azmat, R.~Horesh, J.~J. Li, and M.~Yurochkin.
\newblock {SPRI}: Aligning large language models with context-situated principles, 2025.
\newblock URL \url{https://arxiv.org/abs/2502.03397}.

\bibitem[Zhang et~al.(2024)Zhang, Madaan, Gao, Zheng, Mishra, Yang, Tandon, and Alon]{zhang2024incontextprinciplelearningmistakes}
T.~Zhang, A.~Madaan, L.~Gao, S.~Zheng, S.~Mishra, Y.~Yang, N.~Tandon, and U.~Alon.
\newblock In-context principle learning from mistakes, 2024.
\newblock URL \url{https://arxiv.org/abs/2402.05403}.

\bibitem[Zheng et~al.(2023)Zheng, Chiang, Sheng, Zhuang, Wu, Zhuang, Lin, Li, Li, Xing, Zhang, Gonzalez, and Stoica]{mt-bench}
L.~Zheng, W.-L. Chiang, Y.~Sheng, S.~Zhuang, Z.~Wu, Y.~Zhuang, Z.~Lin, Z.~Li, D.~Li, E.~Xing, H.~Zhang, J.~E. Gonzalez, and I.~Stoica.
\newblock Judging {LLM}-as-a-judge with {MT-Bench} and {Chatbot Arena}.
\newblock In A.~Oh, T.~Naumann, A.~Globerson, K.~Saenko, M.~Hardt, and S.~Levine, editors, \emph{Advances in Neural Information Processing Systems}, volume~36, pages 46595--46623. Curran Associates, Inc., 2023.
\newblock URL \url{https://proceedings.neurips.cc/paper_files/paper/2023/file/91f18a1287b398d378ef22505bf41832-Paper-Datasets_and_Benchmarks.pdf}.

\bibitem[Zhou et~al.(2023)Zhou, Lu, Mishra, Brahma, Basu, Luan, Zhou, and Hou]{zhou2023instructionfollowingevaluationlargelanguage}
J.~Zhou, T.~Lu, S.~Mishra, S.~Brahma, S.~Basu, Y.~Luan, D.~Zhou, and L.~Hou.
\newblock Instruction-following evaluation for large language models, 2023.
\newblock URL \url{https://arxiv.org/abs/2311.07911}.

\bibitem[Zhou et~al.(2020)Zhou, Hu, Zhang, Liang, Sun, Xiong, and Tang]{ELV}
W.~Zhou, J.~Hu, H.~Zhang, X.~Liang, M.~Sun, C.~Xiong, and J.~Tang.
\newblock Towards interpretable natural language understanding with explanations as latent variables.
\newblock In \emph{Proceedings of the 34th International Conference on Neural Information Processing Systems}, NIPS '20, Red Hook, NY, USA, 2020. Curran Associates Inc.
\newblock ISBN 9781713829546.

\end{thebibliography}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newpage
\input{Sections/appendix}


\end{document}