\documentclass{article}

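% natbib options must be passed before the style file is loaded, in case the
% style loads natbib itself (options passed afterwards are silently ignored).
\PassOptionsToPackage{numbers}{natbib}
% Agents4Science 2025 style
\usepackage{agents4science_2025}

% Common packages
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}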
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{amsthm}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{hyperref}
\newcommand{\sym}{\operatorname{sym}} % sym(A) := (A + A^T)/2
% Fallbacks in case checklist macros are unavailable in the style
\providecommand{\involvementTODO}[1][]{\textcolor{red}{\textbf{[TODO]}}}
\providecommand{\answerTODO}[1][]{\textcolor{red}{\textbf{[TODO]}}}
\providecommand{\justificationTODO}[1][]{\textcolor{red}{\textbf{[TODO]}}}

% Title and authors (anonymized at submission)
\title{Theoretical Foundations of Wasserstein Policy Optimization}

\author{%
  Anonymous Authors \\
  Anonymous Institution \\
  \texttt{anonymous@example.com}
}

\begin{document}

\maketitle

\begin{abstract}
We revisit Wasserstein Policy Optimization (WPO) as policy transport on action densities followed by a projection onto a parametric manifold. Evolving policies by a 2-Wasserstein gradient flow and projecting in the Fisher/KL inner product yields a covariant natural step with a mixed-derivative cross-term. We make this projection-based view explicit, prove baseline invariance (via a constrained G\'ateaux variation) and parameterization covariance (via Fisher pullbacks), and delineate when the step coincides with natural policy gradient (affine-in-action exponential families) versus when it departs (mixtures, squashings). For Gaussian policies we give mean and covariance updates, including a full-covariance Cholesky implementation that preserves SPD. We extend to c-Wasserstein dynamics to obtain principled stability via convex conjugates and state precise energy inequalities in the frozen-critic regime. Assumptions and weak-form conditions are spelled out, and connections to classic PG/DPG/NPG are established.
\end{abstract}

\section{Introduction}
Wasserstein Policy Optimization (WPO) offers a principled link between optimal transport and policy updates through Wasserstein gradient flows. Despite elegant foundations, theoretical questions remain: the precise projection from infinite-dimensional flows to parametric updates; invariance to baselines and parameterizations; and stability when action-value gradients are large. We present a self-contained treatment addressing these points with label-anchored derivations, and we identify the conditions under which the WPO update coincides with, or departs from, classic policy gradient/natural gradient directions.

\section{Background}
On the 2-Wasserstein manifold, steepest descent of a functional $\mathcal{J}[\pi]$ follows the continuity equation with velocity $v = -\nabla_{\mathbf{a}}(\delta\mathcal{J}/\delta\pi)$ \citep{ambrosio2008gradient,benamou2000computational}. Taking $\mathcal{J}[\pi] = -\mathbb{E}[Q^\pi]$ yields $\partial_t \pi = -\nabla_{\mathbf{a}}\!\cdot(\pi\,\nabla_{\mathbf{a}} Q^\pi)$. Projecting this infinite-dimensional flow to a parametric family $\{\pi_\theta\}$ leads to the natural-gradient form $\Delta\theta = F_{\theta\theta}^{-1}\,\mathcal{F}_{t\theta}$, where the cross term is $\mathbb{E}[\nabla_\theta \nabla_{\mathbf{a}}\log\pi\,\nabla_{\mathbf{a}}Q^\pi]$ under mild regularity and boundary conditions. We relate to natural policy gradient \citep{kakade2001natural} and to neural ODE views of transport \citep{chen2018neural}.

\paragraph{Weak form and boundary conditions.} We interpret the continuity equation in weak form, with no-flux boundary condition $(\pi v)\cdot n=0$ on bounded action domains or vanishing flux at infinity on $\mathbb R^d$. Under $C^1$/$C^2$ regularity and dominated convergence, integration by parts is valid and boundary terms vanish; all energy identities and projections below are stated in this weak-form sense.

% theorem environments
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{assumption}{Assumption}

\section{Preliminaries and Notation}
Let $\mathcal{A} \subseteq \mathbb{R}^d$ denote the action domain and $\mathcal{S}$ the state space. Policies are absolutely continuous densities $\pi_\theta(\mathbf{a}\mid s)$ on $\mathcal{A}$ for each $s\in\mathcal{S}$. We write expectations as $\mathbb{E}_{s\sim d^\pi,\,\mathbf{a}\sim\pi_\theta(\cdot\mid s)}[\cdot]$ for a fixed reference weighting $d^\pi(s)$ (e.g., discounted occupancy). In line with the policy gradient theorem, our first-order variations operate in a semi-gradient setting where $d^\pi$ and $Q^\pi$ are treated as fixed.

\paragraph{Notation.} $\mathcal S$ states, $\mathcal A\subseteq\mathbb R^d$ actions; $d^\pi$ discounted occupancy; $\pi_\theta(\mathbf a\mid s)$ policy density; $Q^\pi(s,\mathbf a)$ critic; $F_{\theta\theta}$ Fisher; $\sym(X)=\tfrac12(X+X^\top)$; $\mathbb S_{++}^d$ SPD cone. We use bold $\mathbf a$ for vectors and $a$ for 1D examples.

Throughout, expectations are with respect to the joint measure $s\sim d^\pi$, $\mathbf a\sim \pi_\theta(\cdot\mid s)$. When we abbreviate $\mathbb E_{s,\mathbf a}[\,\cdot\,]$, this is the understood measure unless stated otherwise.

The Fisher matrix is
\begin{equation}\label{eq:Fisher}
F_{\theta\theta} \;=\; \mathbb{E}_{s\sim d^\pi,\,\mathbf{a}\sim\pi_\theta(\cdot\mid s)}\big[\nabla_\theta\log\pi_\theta(\mathbf{a}\mid s)\,\nabla_\theta\log\pi_\theta(\mathbf{a}\mid s)^\top\big].
\end{equation}
We equip density variations with the inner product
\begin{equation}\label{eq:inner}
\langle f, g \rangle \;=\; \mathbb{E}_{s\sim d^\pi}\Big[ \int_{\mathcal{A}} \frac{f(\mathbf{a},s)\,g(\mathbf{a},s)}{\pi_\theta(\mathbf{a}\mid s)}\; d\mathbf{a} \Big],
\end{equation}
for which the parametric tangent directions are $\pi\,\nabla_\theta\log\pi$. Throughout, we assume sufficient smoothness (C$^1$/C$^2$), dominated convergence conditions to exchange expectation and differentiation, and vanishing boundary terms so that integration by parts is valid.

\section{Derivation of WPO}
\subsection{Problem Setup}
We consider continuous control with policy $\pi_\theta(\mathbf{a}\mid s)$ and critic $Q^\pi(s,\mathbf{a})$. We adopt a reference state distribution $d^\pi(s)$ (e.g., discounted occupancy) and study policy updates that arise as projections of Wasserstein gradient flows on action densities.

\subsection{From Wasserstein Flow to Parametric Update}
Let $\mathcal{J}[\pi] = -\mathbb{E}_{s\sim d^\pi,\mathbf{a}\sim\pi(\cdot\mid s)}[Q^\pi(s,\mathbf{a})]$. We adopt a per-state time rescaling and work with the rescaled flow whose velocity uses $v=\nabla_{\mathbf a} Q^\pi$ (absorbing the $d^\pi$ factor). The 2-Wasserstein flow is
\begin{equation}\label{eq:w2flow}
\partial_t \pi \;=\; -\nabla_{\mathbf{a}}\!\cdot\big(\pi\,\nabla_{\mathbf{a}} Q^\pi\big).
\end{equation}
We note that embedding the discounted state weighting $d^\pi(s)$ into $\delta\mathcal J/\delta\pi$ scales the per-state velocity by $d^\pi(s)$. This is a per-state time reparameterization and preserves weak-form energy identities, but a Fisher-projected finite step may change unless additional conditions (e.g., state-wise collinearity of velocities or constant scaling) hold. To remove this ambiguity, we adopt the convention $\delta\mathcal J/\delta\pi=-Q^\pi$ and carry $d^\pi(s)$ only as an outer expectation in all cross terms and Fisher quantities.
We project to $\{\pi_\theta\}$ by minimizing, in the Fisher metric, the discrepancy between $\partial_t\pi$ and $\delta\pi_\theta = \pi\,(\nabla_\theta\log\pi)\cdot\Delta\theta$, leading to $F_{\theta\theta}\,\Delta\theta = \mathcal{F}_{t\theta}$ with
\begin{equation}\label{eq:Fttheta}
\mathcal{F}_{t\theta} = \mathbb{E}_{s\sim d^\pi,\mathbf{a}\sim\pi}\big[\nabla_\theta\nabla_{\mathbf{a}}\log \pi_\theta(\mathbf{a}\mid s)\;\nabla_{\mathbf{a}}Q^\pi(s,\mathbf{a})\big].
\end{equation}

\subsection{Energy Dissipation}
Under suitable regularity, and in the frozen-critic regime, the rescaled gradient flow \eqref{eq:w2flow} dissipates $\mathcal{J}$:
\begin{lemma}[Energy dissipation]\label{lem:diss}
Along solutions $\pi_t$ of \eqref{eq:w2flow}, one has $\tfrac{d}{dt}\,\mathcal{J}[\pi_t] \le 0$ with
\( \tfrac{d}{dt}\,\mathcal{J}[\pi_t] = -\,\mathbb{E}_{s\sim d^\pi,\mathbf{a}\sim\pi_t}\big[\|\nabla_{\mathbf{a}} Q^\pi(s,\mathbf{a})\|^2\big] \).
For the c-Wasserstein flow with velocity $\nabla c^*\big(-\nabla_{\mathbf{a}}(\delta\mathcal J/\delta\pi)\big)=\nabla c^*(\nabla_{\mathbf{a}}Q^\pi)$ one has
\( \tfrac{d}{dt}\,\mathcal{J}[\pi_t] = -\,\mathbb{E}_{s,\mathbf{a}}\big[\langle \nabla c^*(\nabla_{\mathbf{a}}Q^\pi),\,\nabla_{\mathbf{a}}Q^\pi \rangle\big] \le 0 \)
by monotonicity of $\nabla c^*$ (normalized so that $\nabla c^*(0)=0$). See Appendix~\ref{app:energy} for weak-form derivations and function-space assumptions.
\end{lemma}

% (Boundary/decay conditions and frozen-critic scope are detailed in Appendix~\ref{app:assumptions-scope}.)

\subsection{Functional Derivative of $\mathcal{J}$}
For discounted RL, under the semi-gradient convention (holding $d^\pi$ and $Q^\pi$ fixed), we adopt the rescaled functional derivative compatible with \eqref{eq:w2flow}:
\begin{equation}\label{eq:functional_derivative}
\frac{\delta \mathcal{J}}{\delta \pi}(s,\mathbf{a}) \;=\; -\, Q^\pi(s,\mathbf{a}),\quad d^\pi(s)=(1-\gamma)\sum_t \gamma^t\,\Pr(s_t=s).
\end{equation}
This corresponds to absorbing $d^\pi(s)$ via a per-state time reparameterization. Under the alternative convention that embeds $d^\pi(s)$ inside $\delta\mathcal J/\delta\pi$, the Fisher-projected finite step can differ unless state-wise collinearity or constant scaling holds; our adopted convention avoids this ambiguity while preserving the weak-form energy identities.

\subsection{KL/Fisher Projection and Normal Equations}
Equivalently, minimizing $\|\partial_t\pi - \delta\pi_\theta\|^2$ in the Fisher norm \eqref{eq:inner} yields the normal equations $F_{\theta\theta}\,\Delta\theta=\mathcal{F}_{t\theta}$. The cross term satisfies
\begin{align}
\mathcal{F}_{t\theta} &= \int \nabla_\theta\log\pi_\theta(\mathbf{a}\mid s)\,\partial_t\pi_\theta(\mathbf{a}\mid s)\,d\mathbf{a} \\
&= -\int \nabla_\theta\log\pi_\theta\; \nabla_{\mathbf{a}}\!\cdot\big(\pi_\theta\,\nabla_{\mathbf{a}}Q^\pi\big)\,d\mathbf{a} \\
&= \int \big(\nabla_{\mathbf{a}}\nabla_\theta\log\pi_\theta\big)^\top (\pi_\theta\,\nabla_{\mathbf{a}}Q^\pi)\,d\mathbf{a} \\
&= \mathbb{E}_{\mathbf{a}\sim\pi_\theta}\big[\nabla_\theta\nabla_{\mathbf{a}}\log\pi_\theta\;\nabla_{\mathbf{a}}Q^\pi\big],
\end{align}
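where the outer expectation over $s\sim d^\pi$ is suppressed for brevity (the identity holds per state and is then averaged, as in \eqref{eq:Fttheta}). The third equality is integration by parts in $\mathbf{a}$: under $C^2$ regularity and the stated boundary conditions, the boundary terms vanish and the derivatives $\nabla_\theta$ and $\nabla_{\mathbf{a}}$ commute.
The natural-gradient step is $\Delta\theta = F_{\theta\theta}^{-1}\,\mathcal{F}_{t\theta}$.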

\subsection{Projection Metric: Fisher vs. $W_2$}
We project the $W_2$ flow in the Fisher/KL inner product on densities, which induces a covariant natural step on the parametric manifold and yields simple estimators. A $W_2$-parametric projection would instead align the parametric velocity with the $W_2$ tangent metric via a velocity potential on parameters.

\emph{Operator-level relation (per state).} Let $\mathcal{T}_\theta$ denote the parametric tangent span $\{\pi\,\nabla_\theta\log\pi\,u: u\in\mathbb R^{\dim\theta}\}$. The Fisher projector and the $W_2$-tangent projector are two positive self-adjoint operators on $\mathcal{T}_\theta$ induced by different inner products. They coincide when $\mathcal{T}_\theta$ is invariant and the per-state velocity is collinear with the tangent directions (e.g., exponential families with sufficient statistics affine in $\mathbf a$ under a compatible parameterization, such as Gaussian means with fixed covariance). In general they differ by a positive operator on $\mathcal{T}_\theta$. We favor Fisher projection for statistical stability, parameterization covariance, and practicality.

% (A formal statement and proof sketch are provided in Appendix~\ref{app:fisher-w2}.)

\subsection{Stability via c-Wasserstein}
For convex $c$, the c-Wasserstein flow applies $\nabla c^*$ to the $W_2$ descent direction, i.e., it uses the velocity $v=\nabla c^*\big(-\nabla_{\mathbf{a}}(\delta\mathcal J/\delta\pi)\big)=\nabla c^*(\nabla_{\mathbf{a}}Q^\pi)$, so that the quadratic cost $c(u)=\tfrac12\|u\|_2^2$ (for which $\nabla c^*(x)=x$) recovers \eqref{eq:w2flow}. The corresponding flow is $\partial_t\pi = -\nabla_{\mathbf{a}}\!\cdot\big(\pi\,\nabla c^*(-\nabla_{\mathbf{a}}(\delta\mathcal J/\delta\pi))\big) = -\nabla_{\mathbf{a}}\!\cdot\big(\pi\,\nabla c^*(\nabla_{\mathbf{a}}Q^\pi)\big)$. If $c^*$ is even, then $\nabla c^*(-x)=-\nabla c^*(x)$ and one may equivalently write $v=-\nabla c^*\big(\nabla_{\mathbf{a}}(\delta\mathcal J/\delta\pi)\big)$. For a separable choice $c(u)=\tfrac{1}{4}\sum_i |u_i|^4$, one has $c^*(x)=\tfrac{3}{4}\sum_i |x_i|^{4/3}$ and the elementwise cube-root map $\nabla c^*(x)_i=\operatorname{sign}(x_i)|x_i|^{1/3}$, controlling large action-gradients while preserving descent/ascent directions. A rotationally invariant alternative $c(u)=\tfrac{1}{4}\|u\|_2^4$ yields $\nabla c^*(x)=\|x\|_2^{-2/3}x$ (radial shrinkage). These shrinkage maps are direction-preserving, monotone, and H\"older-continuous, ensuring the energy identity holds and often allowing larger stable steps than heuristic gradient clipping under the same frozen-critic assumptions.

\begin{theorem}[Energy decay for c-Wasserstein flows]\label{thm:cw-energy}
Under Assumption~\ref{ass:standing}, let $c$ be proper, convex, l.s.c., with differentiable $c^*$, monotone $\nabla c^*$, and $\nabla c^*(0)=0$. The flow
\[
\partial_t\pi\;=\;-\,\nabla_{\mathbf a}\!\cdot\Big(\pi\,\nabla c^*\big(-\nabla_{\mathbf a}(\delta\mathcal J/\delta\pi)\big)\Big)
\]
satisfies the intrinsic energy identity
\[
\frac{d}{dt}\mathcal J[\pi_t]
= -\,\mathbb E_{s,\mathbf a}\Big[\big\langle \nabla c^*\!\big(-\nabla_{\mathbf a}(\delta\mathcal J/\delta\pi)\big),\,-\nabla_{\mathbf a}(\delta\mathcal J/\delta\pi)\big\rangle\Big]\le 0,
\]
with equality if $\nabla_{\mathbf a}(\delta\mathcal J/\delta\pi)=0$ almost everywhere, and only if this holds when $\nabla c^*$ is strictly monotone. Specializing to $\delta\mathcal J/\delta\pi=-Q^\pi$ gives
\( \tfrac{d}{dt}\mathcal J[\pi_t] = -\,\mathbb E[\langle \nabla c^*(\nabla_{\mathbf a}Q^\pi),\,\nabla_{\mathbf a}Q^\pi\rangle]\le 0 \);
if in addition $c^*$ is even (hence $\nabla c^*$ odd), the velocity can equivalently be written as
\( -\nabla c^*\big(\nabla_{\mathbf a}(\delta\mathcal J/\delta\pi)\big) \).
For $c(u)=\tfrac{1}{4}\sum_i |u_i|^4$ one has elementwise shrinkage $\nabla c^*(x)_i=\mathrm{sign}(x_i)|x_i|^{1/3}$; for $c(u)=\tfrac{1}{4}\|u\|_2^4$ one has radial shrinkage $\nabla c^*(x)=\|x\|_2^{-2/3}x$.
\end{theorem}

\noindent Practical note: the induced map is direction-preserving and monotone, which yields energy decay under frozen critics. Unlike heuristic gradient clipping, it retains a variational interpretation and often permits larger stable steps in ill-conditioned regimes. See Theorem~\ref{thm:cw-energy} and Appendix~\ref{app:energy}.

\subsection{Estimators and variance reduction}
With a reparameterization $\mathbf a=f_\theta(\epsilon,s)$ and a frozen critic, an unbiased single-sample estimator of the cross term is
\[
\widehat g_\theta(\epsilon,s)
= \nabla_\theta\nabla_{\mathbf a}\log\pi_\theta\!\big(f_\theta(\epsilon,s)\mid s\big)\,
\widehat{\nabla_{\mathbf a}Q}(s,f_\theta(\epsilon,s)).
\]
Mixed derivatives are computed efficiently via JVP/VJP (one forward- and one reverse-mode pass suffice in most autodiff systems). The critic action-gradient $\nabla_{\mathbf a}Q$ comes from differentiating the critic with respect to its action input. A score-aligned control variate subtracts $\beta(s)\,\nabla_\theta\log\pi_\theta(\mathbf a\mid s)$, where the scalar $\beta(s)$ is chosen by least squares to minimize the total (trace) variance; since $\mathbb E_{\mathbf a\sim\pi_\theta}[\nabla_\theta\log\pi_\theta\mid s]=0$, this leaves $\mathbb E[\widehat g_\theta]$ unchanged. (We write $\beta$ rather than $c$ to avoid a clash with the transport cost $c$ and its conjugate $c^*$.) The optimal per-state coefficient is
\begin{equation}\label{eq:cv_opt}
 \beta^\star(s) \;=\; \frac{\mathbb E\big[(\nabla_\theta\log\pi_\theta)^\top\,\widehat g_\theta\,\big|\,s\big]}{\mathbb E\big[\|\nabla_\theta\log\pi_\theta\|^2\,\big|\,s\big]}\,,
\end{equation}
estimated online via running moments.

\subsection{Gaussian Parameter Updates}
For Gaussian policies, 1D mean/variance updates are
\begin{align}
\Delta\mu &= \mathbb{E}[\nabla_a Q],\\
\Delta(\sigma^2) &= 2\,\mathbb{E}[(a-\mu)\,\nabla_a Q].
\end{align}
With diagonal covariance, updates act coordinate-wise. For full covariance $\Sigma\in\mathbb S_{++}^d$, the Fisher-natural step simplifies to $\Delta\Sigma = M + M^\top = 2\,\sym(M)$ with $M=\mathbb{E}_{s\sim d^\pi,\,\mathbf a\sim\pi_\theta}[\,(\nabla_{\mathbf a}Q)(\mathbf a-\mu)^\top\,]$. Sampling formulas follow directly from the expectation definitions. The full-covariance update entails $\mathcal{O}(d^3)$ algebra (linear solves), whereas diagonal updates are $\mathcal{O}(d)$. In practice, full covariance helps under strong anisotropy/ill-conditioning; diagonal is preferable when samples or compute are limited.

\paragraph{Full-covariance via Cholesky.}
Write $\Sigma = L L^\top$ with $L$ lower-triangular and $\mathrm{diag}(L)>0$. To implement the natural step stably, solve for $\Delta L$ in the triangular Sylvester equation
\[
L\,\Delta L^\top + \Delta L\,L^\top \;=\; S,\qquad S \,=\, M+M^\top,
\]
by forward substitution \emph{by columns} $j=1,\dots,d$:
\begin{align*}
(2L_{jj})(\Delta L)_{jj} &= S_{jj} - 2\sum_{k<j} L_{jk}(\Delta L)_{jk},\\
(\Delta L)_{ij} &= \frac{ S_{ij}
-\sum_{k<j} \big( L_{ik}(\Delta L)_{jk}+(\Delta L)_{ik}L_{jk}\big)
- L_{ij}(\Delta L)_{jj}
}{L_{jj}},\quad i=j+1,\dots,d.
\end{align*}
Then update $L \leftarrow L + \eta\,\Delta L$ with a line search to keep $\mathrm{diag}(L)>0$. This realizes $\Delta\Sigma=\eta\,S$ to first order in $\eta$ (the exact increment is $\eta\,S+\eta^2\,\Delta L\,\Delta L^\top$) while preserving $\Sigma\in\mathbb S_{++}^d$. In practice, shrink $\eta$ (backtracking) until $\min_i (L+\eta\Delta L)_{ii} > 0$; this prevents loss of SPD under large steps.

% (omitted: implementation sampling/estimation; focus on derivations)

% (Algorithmic details for the Cholesky SPD update are given in Appendix~\ref{app:cholesky-algo}.)

\paragraph{Deterministic limit.} As $\Sigma\to 0$, the mean update reduces to the deterministic policy-gradient direction evaluated at the mean, while the covariance update vanishes. The Fisher metric provides a well-defined limit.

\paragraph{Quadratic toy.} If locally $\nabla_{\mathbf a}Q^\pi(s,\mathbf a)=H(s)(\mathbf a-\mu)$ with $H(s)\succeq0$, then per state $M=H(s)\,\Sigma$ and $\Delta\Sigma=H(s)\,\Sigma+\Sigma\,H(s)$, while $\Delta\mu=\mathbb E[\nabla_{\mathbf a}Q^\pi]=H(s)\,\mathbb E[\mathbf a-\mu]=0$ because the action-gradient vanishes at $\mathbf a=\mu$, consistent with the deterministic limit.

\subsection{Main Results}
We state the primary results under the hypotheses above; proofs are provided in the appendix. Scope: results are stated with fixed critic $Q^\pi$ and state weighting $d^\pi$ (semi-gradient convention). When both evolve, additional residual terms appear.

\begin{assumption}[Standing hypotheses]\label{ass:standing}
(i) $\log\pi_\theta(\mathbf a\mid s)\in C^2$ in $(\mathbf a,\theta)$ with mixed partials commuting almost everywhere, and $\nabla_\theta\nabla_{\mathbf a}\log\pi_\theta\in L^1(d^\pi\otimes \pi_\theta)$; (ii) $Q^\pi(\cdot,s)\in C^1$ with $\nabla_{\mathbf a}Q^\pi\in L^2(\pi_\theta(\cdot\mid s))$ uniformly in $s$; (iii) either $\mathcal A=\mathbb R^d$ with vanishing flux at infinity or a no-flux boundary on $\partial\mathcal A$; (iv) dominated convergence justifies swapping expectations and derivatives. All energy statements hold in the frozen-critic regime.
\end{assumption}

\begin{theorem}[Projection to Natural Gradient]\label{thm:projection}
Let $\partial_t\pi$ satisfy \eqref{eq:w2flow} for $\mathcal{J}[\pi] = -\mathbb{E}[Q^\pi]$. With the inner product \eqref{eq:inner}, the Galerkin orthogonality conditions $\langle \partial_t\pi - \delta\pi_\theta,\, \pi\,\nabla_\theta\log\pi\rangle = 0$ yield the normal equations $F_{\theta\theta}\,\Delta\theta = \mathcal{F}_{t\theta}$ with $\mathcal{F}_{t\theta}$ in \eqref{eq:Fttheta}. Hence $\Delta\theta = F_{\theta\theta}^{-1}\,\mathcal{F}_{t\theta}$.
\end{theorem}
\noindent Finite steps $\Delta\theta=\eta\,F_{\theta\theta}^{-1}\,\mathcal F_{t\theta}$ need not monotonically decrease $\mathcal J$ unless $\eta$ is sufficiently small; a global line search (Armijo/backtracking) can ensure descent. Locally, if $g(\theta):=\mathcal F_{t\theta}$ is $L$-Lipschitz and $F_{\theta\theta}\succeq m I$ in a neighborhood, then choosing $\eta\le m/L$ yields a guaranteed decrease for sufficiently small neighborhoods.
\begin{corollary}[Normal equations]\label{cor:normal-equations}
Under Assumption~\ref{ass:standing}, the Fisher--Galerkin projection yields $F_{\theta\theta}\,\Delta\theta=\mathcal F_{t\theta}$ with $\mathcal F_{t\theta}=\mathbb E[\nabla_\theta\nabla_{\mathbf a}\log\pi\,\nabla_{\mathbf a}Q^\pi]$.
\end{corollary}

\begin{proposition}[Baseline Invariance]\label{prop:baseline}
For any baseline $b(s)$, replacing $Q^\pi$ by $A^\pi = Q^\pi - b(s)$ leaves both the PDE and the parametric cross term $\mathcal{F}_{t\theta}$ unchanged. Equivalently, the constrained first variation under per-state normalization satisfies $\delta\mathcal{J}/\delta\pi = -\big(Q^\pi - b(s)\big)$.
\end{proposition}

\noindent Caution: action-dependent adjustments $b(s,\mathbf a)$ are not baselines in this sense; they alter $\nabla_{\mathbf a}Q$ and hence change both the PDE velocity and the projected update.

\begin{theorem}[Parameterization Covariance]\label{thm:covariance}
Let $\phi=\phi(\theta)$ be a local diffeomorphism with Jacobian $J=\partial\phi/\partial\theta$ and pullback Fisher $F_\phi = J^{-\top}F_\theta J^{-1}$. Define $g_\theta = \mathbb{E}[\nabla_\theta\nabla_{\mathbf{a}}\log\pi\,\nabla_{\mathbf{a}}Q]$ and $g_\phi = J^{-\top}g_\theta$. Then the natural-gradient step is covariant: $\Delta\phi = F_\phi^{-1}g_\phi = J\,F_\theta^{-1}g_\theta = J\,\Delta\theta$.
\end{theorem}

\begin{lemma}[Gaussian Family]\label{lem:gaussian}
For a 1D Gaussian policy with fixed variance, $\Delta\mu = \mathbb{E}[\nabla_a Q]$. With log-standard-deviation $\lambda=\log\sigma$, $\Delta\lambda = \sigma^{-2}\,\mathbb{E}[(a-\mu)\nabla_a Q]$, i.e., to first order $\Delta(\sigma^2) = 2\sigma^2\,\Delta\lambda = 2\,\mathbb{E}[(a-\mu)\nabla_a Q]$. For full covariance $\Sigma$, with $M=\mathbb{E}[(\nabla_{\mathbf{a}}Q)(\mathbf{a}-\mu)^\top]$ and $G=\sym(\Sigma^{-1} M \Sigma^{-1})$, the affine-invariant Fisher yields $\Delta\Sigma = 2\,\Sigma G\,\Sigma$, which reduces to $\Delta\Sigma = M+M^\top$.
\end{lemma}

\begin{proposition}[c-Wasserstein Gradient Flow]\label{prop:cw}
For convex $c$ with conjugate $c^*$, replacing $\nabla_{\mathbf{a}}Q$ by $\nabla c^*(\nabla_{\mathbf{a}}Q)$ defines the c-Wasserstein gradient flow of $\mathcal{J}$ (in the sense of \citealp{ambrosio2008gradient}). Under standard regularity, $\mathcal{J}$ decreases along solutions. The elementwise cube-root arises from $c^*(x)=\tfrac{3}{4}|x|^{4/3}$.
\end{proposition}

\begin{proposition}[Alignment with NPG for Gaussian means]\label{prop:equiv-npg}
Let $\pi_\theta(\cdot\mid s)=\mathcal N(\mu_\theta(s),\Sigma)$ with fixed $\Sigma$. Suppose $\nabla_{\mathbf a}Q^\pi(s,\mathbf a)\approx H(s)\,(\mathbf a-\mu_\theta(s))$ in a neighborhood of $\mu_\theta(s)$ with $H(s)\succeq 0$. Then the WPO mean step $\Delta\theta_\mu=F_{\theta_\mu}^{-1}\,\mathbb E[\nabla_{\theta_\mu}\nabla_{\mathbf a}\log\pi\,\nabla_{\mathbf a}Q^\pi]$ and the NPG mean step $F_{\theta_\mu}^{-1}\,\mathbb E[\nabla_{\theta_\mu}\log\pi\,A^\pi]$ are collinear for each $s$, and hence globally after averaging over $d^\pi$.
\end{proposition}

% (moved detailed statements above)


\section{Theoretical Discussion}
\paragraph{Equivalence vs. departure.} Proposition~\ref{prop:equiv-npg} formalizes alignment with natural policy gradient for Gaussian means with fixed covariance under a local quadratic assumption on $Q^\pi$. More generally, in exponential families with sufficient statistics affine in action and compatible parameterizations, the WPO and NPG directions can be collinear per state. Departures arise for non-Gaussian families (e.g., mixtures, tanh-squashed Gaussians) through the mixed derivative $\nabla_\theta\nabla_{\mathbf{a}}\log\pi$, which reflects policy-manifold geometry and can rotate the update relative to NPG.

\paragraph{Non-Gaussian example.} For a two-component Gaussian mixture with shared covariance, component responsibilities $\phi_i(\mathbf a,s)=\frac{\rho_i\,\mathcal N(\mathbf a\mid\mu_i,\Sigma)}{\sum_k \rho_k\,\mathcal N(\mathbf a\mid\mu_k,\Sigma)}$ enter $\nabla_\theta\nabla_{\mathbf a}\log\pi$ and weight the cross-moment $\mathbb E[\phi_i(\mathbf a,s)\,\nabla_{\mathbf a}Q\,(\mathbf a-\mu_i)^\top]$. These curvature terms change the step direction even after Fisher preconditioning.

\section{Related Work}
Policy gradient methods include REINFORCE \citep{williams1992simple}, natural policy gradient \citep{kakade2001natural,pascanu2013revisiting}, deterministic policy gradients \citep{silver2014deterministic}, and successors such as DDPG \citep{lillicrap2015continuous}. Wasserstein geometry has informed optimization and learning \citep{ambrosio2008gradient,benamou2000computational}, and several works explored Wasserstein in RL: robust formulations \citep{abdullah2019wassersteinrobustreinforcementlearning}, natural gradients \citep{moskovitz2020efficient}, and policy optimization views \citep{zhang2018policy}. MPO \citep{abdolmaleki2018maximum} relates as a policy iteration method with KL-regularized exponentiated targets. Score-based transport (SVGD) \citep{liu2016svgd} and its policy variants differ in using score-driven particle flows rather than density flows plus projection. For covariance updates, affine-invariant geometry on SPD matrices \citep{absil2008optimization} and second-order approximations like K-FAC \citep{martens2015kfac} provide complementary perspectives.

\section{Limitations}
Our energy and dissipation statements rely on the frozen-critic regime and matching state weighting in the projection metric; fully coupled policy–critic dynamics introduce residual terms we do not analyze. Non-smooth architectures may require weak-form interpretations. We do not analyze the variance/bias of mixed-derivative estimators beyond qualitative remarks. The Fisher vs. $W_2$ parametric projection gap is only closed under affine-in-action exponential families. Extending equivalence results beyond Gaussians and characterizing regularity under which c-Wasserstein squashing preserves optimality are important directions.

\section{Conclusion}
We clarified the WPO foundation with label-anchored derivations: Fisher–Galerkin projection to a parametric update, baseline invariance, parameterization covariance, c-Wasserstein stability, and Gaussian family updates. Future work includes extending equivalence conditions beyond Gaussians and analyzing stronger regularity requirements for non-smooth architectures.

\appendix
\section{Derivations and Assumptions}\label{app:deriv}
\subsection{Convention (Rescaled Flow)}
We work throughout with a per-state time-rescaled flow: the Eulerian velocity uses $v=\nabla_{\mathbf a} Q^\pi$ and the functional derivative is $\delta\mathcal J/\delta\pi=-Q^\pi$. The state weighting $d^\pi(s)$ enters only as an outer expectation over $s$. All projection inner products and Fisher expectations use the same $d^\pi(s)$, so the projected direction is unambiguous under this convention. Using the unrescaled convention instead would multiply the fiberwise energy identities by $d^\pi(s)^2$ inside the state integral and, as noted in the main text, can change the Fisher-projected step unless state-wise collinearity of velocities or constant scaling holds.
\subsection{Energy Dissipation (Proofs)}\label{app:energy}
\paragraph{Weak form.} Fix a state $s$. A curve $t\mapsto \pi_t(\cdot\mid s)$ is a weak solution of the continuity equation $\partial_t \pi = -\nabla_{\mathbf a}\!\cdot(\pi v)$ if, for every $\varphi\in C_c^\infty(\mathcal A)$, the map $t\mapsto \int \varphi\,\pi_t\,d\mathbf a$ is absolutely continuous and
\[
\frac{d}{dt}\int \varphi\,\pi_t\,d\mathbf a 
\;=\; \int \nabla_{\mathbf a}\varphi(\mathbf a)\cdot v_t(\mathbf a)\,\pi_t(\mathbf a)\,d\mathbf a.
\]
Assume either vanishing flux at infinity or a no-flux boundary condition on $\partial\mathcal A$.

\paragraph{Function spaces.} Assume $Q^\pi(\cdot,s)\in C^1$, $\nabla_{\mathbf a}Q^\pi\in L^2(\pi_t(\cdot\mid s))$ uniformly in $t$, and $\pi_t v_t\in L^1$. Fubini/Tonelli applies to interchange the $s$ and $\mathbf a$ integrals under $d^\pi(s)\,\pi_t(\mathbf a\mid s)$.

\paragraph{W$_2$ case.} With the rescaled convention $v=\nabla_{\mathbf a}Q^\pi$ and $\delta\mathcal J/\delta\pi=-Q^\pi$, take $\varphi=\delta\mathcal J/\delta\pi$ as a test function and use the chain rule (justified by dominated convergence under the $L^2$ bounds):
\[
\frac{d}{dt}\,\mathcal J[\pi_t] 
= \int \frac{\delta\mathcal J}{\delta\pi}\,\partial_t\pi_t\,d\mathbf a
= -\int \frac{\delta\mathcal J}{\delta\pi}\,\nabla_{\mathbf a}\!\cdot(\pi_t\,v_t)\,d\mathbf a
= \int \nabla_{\mathbf a}\Big(\frac{\delta\mathcal J}{\delta\pi}\Big)\cdot v_t\,\pi_t\,d\mathbf a,
\]
where the boundary term vanishes. Substituting $\nabla_{\mathbf a}(\delta\mathcal J/\delta\pi)=-\nabla_{\mathbf a}Q^\pi$ and $v_t=\nabla_{\mathbf a}Q^\pi$ gives
\( \frac{d}{dt}\,\mathcal J[\pi_t] = -\int \pi_t\,\|\nabla_{\mathbf a}Q^\pi\|^2\,d\mathbf a. \)
Taking the expectation over $s\sim d^\pi$ yields Lemma~\ref{lem:diss}.

\paragraph{c-Wasserstein case.} Let $c$ be proper, convex, l.s.c., with conjugate $c^*$ differentiable and $\nabla c^*$ monotone. With $v_t=\nabla c^*(-\nabla_{\mathbf a}\,\delta\mathcal J/\delta\pi)$ the same argument yields
\[
\frac{d}{dt}\,\mathcal J[\pi_t] 
= \int \nabla_{\mathbf a}\Big(\frac{\delta\mathcal J}{\delta\pi}\Big)\cdot \nabla c^*\!\Big(-\nabla_{\mathbf a}\Big(\frac{\delta\mathcal J}{\delta\pi}\Big)\Big)\,\pi_t\,d\mathbf a
= -\int \langle \nabla c^*(\nabla_{\mathbf a}Q^\pi),\,\nabla_{\mathbf a}Q^\pi\rangle\,\pi_t\,d\mathbf a \;\le\; 0,
\]
with equality only when $\nabla_{\mathbf a}Q^\pi=0$ almost everywhere (or when $\nabla c^*(\cdot)=0$ at that argument).
\subsection{Assumptions and Boundary Conditions}\label{app:assumptions-scope}
We assume $C^1$/$C^2$ smoothness as required and vanishing boundary terms in integration by parts; interchange of expectation and differentiation follows from dominated convergence under integrable bounds. We assume either (i) $\mathcal A=\mathbb R^d$ with tails making the flux vanish at infinity, or (ii) bounded $\mathcal A$ with no-flux boundary condition $(\pi\,v)\cdot n=0$ on $\partial\mathcal A$. We assume $Q^\pi(\cdot,s)\in C^1$ in $\mathbf a$ with $\nabla_{\mathbf a}Q^\pi$ locally Lipschitz (uniformly in $s$), ensuring the weak formulation and energy identity.
\par
\noindent Sufficient boundary/decay conditions include: (i) bounded $\mathcal A$ with $(\pi v)\cdot n=0$; or (ii) $\mathcal A=\mathbb R^d$ and $\pi(\mathbf a\mid s)\,\|\nabla_{\mathbf a} Q^\pi(s,\mathbf a)\| \to 0$ as $\|\mathbf a\|\to\infty$ (uniformly in $s$).
\par
\noindent Scope of descent (frozen critic). Energy decay statements apply to the proxy functional with $Q^\pi$ and $d^\pi$ held fixed during the inner flow/projection step (semi-gradient setting). They do not by themselves imply monotone improvement of the true return when the critic or occupancy evolves between steps.

\subsection{Fisher–Galerkin Projection}
Minimizing the Fisher-weighted squared error between $\partial_t\pi$ and $\delta\pi_\theta$ yields the orthogonality conditions $\langle \partial_t\pi-\delta\pi_\theta,\,\pi\,\nabla_{\theta_k}\log\pi\rangle=0$ for each coordinate $k$, i.e., $F_{\theta\theta}\,\Delta\theta = \mathcal{F}_{t\theta}$ with $F_{\theta\theta}=\mathbb{E}[\nabla_\theta\log\pi\,\nabla_\theta\log\pi^\top]$. Using the continuity equation and integrating by parts in $\mathbf a$ (boundary terms vanish),
\[
\mathcal F_{t\theta_k} 
= \mathbb E\Big[\int \nabla_{\theta_k}\log\pi\,\partial_t\pi\,d\mathbf a\Big]
= -\mathbb E\Big[\int \nabla_{\theta_k}\log\pi\,\nabla_{\mathbf a}\!\cdot(\pi\,\nabla_{\mathbf a}Q)\,d\mathbf a\Big]
= \mathbb E\Big[\int \big(\nabla_{\mathbf a}\nabla_{\theta_k}\log\pi\big)^\top(\pi\,\nabla_{\mathbf a}Q)\,d\mathbf a\Big],
\]
which gives $\mathcal F_{t\theta}=\mathbb E[\nabla_\theta\nabla_{\mathbf a}\log\pi\,\nabla_{\mathbf a}Q]$. Commutation of $\nabla_{\theta}$ with the integral follows by dominated convergence if $\nabla_\theta\nabla_{\mathbf a}\log\pi\in L^1$ uniformly.

\subsection{G\'ateaux Variation and Baselines}
Consider the Lagrangian $\mathcal L(\pi,\lambda)= -\mathbb E_s\mathbb E_{\mathbf a\sim\pi(\cdot\mid s)}[Q^\pi(s,\mathbf a)] + \mathbb E_s\big[\lambda(s)(\int \pi(\mathbf a\mid s)\,d\mathbf a - 1)\big]$. With $Q^\pi$ held fixed (frozen critic), the first variation in a direction $h$ with $\int h\,d\mathbf a=0$ per state is $\delta \mathcal L(\pi;h)= -\mathbb E_s\int Q^\pi(s,\mathbf a)\,h(\mathbf a,s)\,d\mathbf a$. Because admissible directions integrate to zero per state, the functional derivative is determined only up to an additive function of $s$, namely $\delta\mathcal J/\delta\pi = -Q^\pi+\lambda(s)$. Replacing $Q^\pi$ by $Q^\pi-b(s)$ amounts to replacing $\lambda(s)$ by $\lambda(s)+b(s)$; since $\nabla_{\mathbf a}b(s)=0$, both the Eulerian velocity and the cross term $\mathcal F_{t\theta}$ are unchanged.

\subsection{Gaussian Updates}
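For a 1D Gaussian, $\partial_\mu\log\pi = (a-\mu)/\sigma^2$ and $\partial_\lambda\log\pi = (a-\mu)^2/\sigma^2 - 1$ with $\lambda=\log\sigma$. Fisher blocks are $F_{\mu\mu}=1/\sigma^2$, $F_{\lambda\lambda}=2$, $F_{\mu\lambda}=0$. The mixed derivatives are $\partial_\mu\nabla_a\log\pi=1/\sigma^2$ and $\partial_\lambda\nabla_a\log\pi=2(a-\mu)/\sigma^2$, so the cross terms are $g_\mu=\tfrac{1}{\sigma^2}\mathbb{E}[\nabla_a Q]$ and $g_\lambda=\tfrac{2}{\sigma^2}\mathbb{E}[(a-\mu)\nabla_a Q]$. The natural step therefore gives $\Delta\mu=\mathbb{E}[\nabla_a Q]$ and $\Delta\lambda=\tfrac{1}{\sigma^2}\mathbb{E}[(a-\mu)\nabla_a Q]$.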

For full covariance, work on $\mathrm{SPD}(d)$ with the affine-invariant inner product $\langle U,V\rangle_\Sigma=\mathrm{tr}(\Sigma^{-1}U\,\Sigma^{-1}V)$. Let $M=\mathbb E[(\nabla_{\mathbf a}Q)(\mathbf a-\mu)^\top]$. A standard calculation gives the Riemannian gradient $\mathrm{grad}\,\mathcal J(\Sigma)=\sym(\Sigma^{-1}M\,\Sigma^{-1})$. The natural step is obtained by pulling $\mathrm{grad}\,\mathcal J$ back to Euclidean coordinates (shown at unit step size, with the step size and sign convention absorbed into the learning rate), i.e.,
\[
\Delta\Sigma 
= 2\,\Sigma\,\sym(\Sigma^{-1}M\,\Sigma^{-1})\,\Sigma 
= M+M^\top,
\]
which matches the Fisher-preconditioned update.

\subsection{Cholesky Triangular Sylvester Solve}
Linearizing $\Sigma=L L^\top$ yields $\Delta\Sigma= L\,\Delta L^\top+\Delta L\,L^\top$. Given a symmetric target $S$ and a lower-triangular $L$ with positive diagonal, the lower-triangular recursion stated in the main text solves $L\,\Delta L^\top+\Delta L\,L^\top=S$ uniquely by forward substitution (the proof is by induction on columns). Hence, to first order in the step size $\eta$, the realized increment of $\Sigma$ equals $\eta S=\eta(M+M^\top)$; the exact increment is $\eta S+\eta^2\,\Delta L\,\Delta L^\top$. For SPD preservation, a sufficient condition is to backtrack $\eta$ until $\min_i (L+\eta\Delta L)_{ii}>0$; for example, if $\eta\,\|L^{-1}\Delta L\|_\infty<1$, the updated diagonal remains positive.
\paragraph{Algorithm (Cholesky SPD update).}\label{app:cholesky-algo}
Given $L$ and a symmetric target $S=M+M^\top$:
\begin{enumerate}
  \item Solve $L\,\Delta L^\top + \Delta L\,L^\top = S$ for lower-triangular $\Delta L$ by forward substitution (the column-wise recursion stated in the main text).
  \item Choose $\eta>0$ by backtracking (e.g., an Armijo line search), shrinking $\eta$ until $\min_i (L+\eta\Delta L)_{ii}>0$.
  \item Update $L\leftarrow L+\eta\Delta L$ and return $\Sigma\leftarrow L L^\top$.
\end{enumerate}
Cost: $\mathcal O(d^3)$ per update; diagonal $\Sigma$ reduces to $\mathcal O(d)$.

\subsection{Parameterization Covariance}
Let $\phi=\phi(\theta)$ be a local diffeomorphism with Jacobian $J=\partial\phi/\partial\theta$. The Fisher transforms as $F_\phi=\mathbb E[\nabla_\phi\log\pi\,\nabla_\phi\log\pi^\top]=J^{-\top}F_\theta J^{-1}$. The cross term obeys $g_\phi=\mathbb E[\nabla_\phi\nabla_{\mathbf a}\log\pi\,\nabla_{\mathbf a}Q]=J^{-\top}g_\theta$ by the chain rule $\nabla_\phi=J^{-\top}\nabla_\theta$. Therefore $\Delta\phi=F_\phi^{-1}g_\phi=J\,F_\theta^{-1}g_\theta=J\,\Delta\theta$.

\subsection{c-Wasserstein}
Assume $c$ is proper, convex, l.s.c., with conjugate $c^*$ differentiable and $\nabla c^*$ monotone. The c-Wasserstein flow uses velocity $\nabla c^*(-\nabla_{\mathbf{a}} \delta\mathcal{J}/\delta\pi)$. The elementwise cube-root mapping used in the main text arises from the \emph{separable} choice $c(u)=\tfrac{1}{4}\sum_i |u_i|^4$, whose conjugate has $c^*(x)=\tfrac{3}{4}\sum_i |x_i|^{4/3}$ and $\nabla c^*(x)_i=\operatorname{sign}(x_i)|x_i|^{1/3}$. This map preserves the sign of each coordinate (so $\langle x,\nabla c^*(x)\rangle\ge 0$), although it does not in general preserve the direction of $x$, and it is H\"older-continuous with exponent $1/3$, supporting the stated stability. A rotationally invariant, direction-preserving alternative $c(u)=\tfrac{1}{4}\|u\|_2^4$ yields $\nabla c^*(x)=\|x\|_2^{-2/3}x$.

\section{Proofs of Main Results}
\begin{proof}[Proof of Theorem~\ref{thm:projection}]
Projecting $\partial_t\pi$ onto $\delta\pi_\theta$ in the Fisher inner product yields normal equations $\langle \partial_t\pi - \delta\pi_\theta,\, \pi\,\nabla_\theta\log\pi\rangle=0$, which rearrange to $F_{\theta\theta}\,\Delta\theta=\mathcal{F}_{t\theta}$. Integration by parts under the stated regularity gives the cross term in \eqref{eq:Fttheta}.
\end{proof}

\begin{proof}[Proof of Proposition~\ref{prop:baseline}]
Constrained variation with per-state normalization introduces a Lagrange multiplier $\lambda(s)$, yielding $\delta\mathcal{J}/\delta\pi=-\,Q^\pi+\lambda(s)$. Any baseline $b(s)$ can be absorbed into $\lambda$, so both the PDE and $\mathcal{F}_{t\theta}$ are invariant to $Q^\pi\mapsto Q^\pi-b(s)$.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{thm:covariance}]
Under the pullback metric, $F_\phi^{-1}=J\,F_\theta^{-1}J^\top$ and $g_\phi=J^{-\top}g_\theta$. Thus $F_\phi^{-1}g_\phi=J\,F_\theta^{-1}g_\theta=J\,\Delta\theta$.
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lem:gaussian}]
Use $\nabla_a\log\pi=-(a-\mu)/\sigma^2$, together with $\nabla_\theta\nabla_a\log\pi=(\nabla_\theta\mu)/\sigma^2$ for mean parameters and $\partial_\lambda\nabla_a\log\pi=2(a-\mu)/\sigma^2$ for the log-standard-deviation $\lambda=\log\sigma$, to compute $g_\theta$ and Fisher blocks; apply $\Delta\theta=F^{-1}g$ to obtain the stated updates. For full $\Sigma$, the affine-invariant Fisher yields $\Delta\Sigma=M+M^\top$.
\end{proof}

\begin{proof}[Proof of Proposition~\ref{prop:cw}]
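For convex $c$, $c^*$ is convex and $\nabla c^*$ is monotone; if in addition $\nabla c^*(0)=0$ (as for the even costs considered here), monotonicity gives $\langle x,\nabla c^*(x)\rangle\ge 0$ for all $x$. Replacing $\nabla_{\mathbf{a}}Q$ by $\nabla c^*(\nabla_{\mathbf{a}}Q)$ corresponds to the Eulerian form of the c-Wasserstein gradient flow, and the inner product $\langle\nabla_{\mathbf{a}}Q,\nabla c^*(\nabla_{\mathbf{a}}Q)\rangle$ remains nonnegative, preserving descent/ascent while taming large gradients.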
\end{proof}

\begin{proof}[Proof of Proposition~\ref{prop:equiv-npg}]
\emph{Proof sketch.} For fixed covariance, the mean-block Fisher is $F_{\theta_\mu\theta_\mu}=\mathbb E[\nabla_{\theta_\mu}\log\pi\,\nabla_{\theta_\mu}\log\pi^\top]$. Under the local quadratic model $\nabla_{\mathbf a}Q^\pi\approx H(s)(\mathbf a-\mu)$ and Gaussian $\pi$, one has $\mathbb E[\nabla_{\mathbf a}Q^\pi\,(\mathbf a-\mu)^\top]=H(s)\,\Sigma$. Using $\nabla_{\theta_\mu}\nabla_{\mathbf a}\log\pi=(\partial\mu/\partial\theta_\mu)\,\Sigma^{-1}$, the WPO cross term becomes $g_{\theta_\mu}=\mathbb E[(\partial\mu/\partial\theta_\mu)\,\Sigma^{-1} H(s)\,\Sigma]$. The NPG mean direction with advantage $A^\pi$ linearized around $\mu$ yields the same factor $H(s)$ multiplying $\nabla_{\theta_\mu}\log\pi$, so after preconditioning by $F_{\theta_\mu\theta_\mu}^{-1}$ both steps are collinear. Averaging over $d^\pi$ preserves collinearity.\qedhere
\end{proof}

\paragraph{Acknowledgments} Omitted for double-blind review.

% (end-of-main marker removed)

\section*{Responsible AI Statement}
This is a theoretical contribution. It contains no human or animal subjects, no personal or sensitive data, and no deployed systems. All results are formal statements with explicit assumptions and proofs; we discuss limitations, scope, and failure modes where applicable. The work adheres to the Agents4Science Code of Ethics (and the NeurIPS Code of Ethics in spirit): we avoid prohibited practices, dual-use concerns, and undisclosed human subject data; environmental impact is negligible as no large-scale compute is used.

\section*{Reproducibility Statement}
All claims are formal theorems, propositions, or lemmas, each with clearly stated assumptions and proofs. Key equations are label-anchored for unambiguous cross-referencing (e.g., \eqref{eq:w2flow}, \eqref{eq:Fisher}, \eqref{eq:inner}, \eqref{eq:Fttheta}). Weak-form and boundary assumptions are made explicit; derivations are given both in the main text and in the appendix. No datasets or empirical experiments are involved. A minimal prototype used in separate notes mirrors the Gaussian mean/covariance updates but is not required to verify the theoretical results.

% Bibliography
\begingroup
\small
\bibliographystyle{plainnat}
\bibliography{wpo_refs}
\par
\endgroup

\newpage
\section*{Agents4Science AI Involvement Checklist}
\begin{enumerate}
    \item \textbf{Hypothesis development}: Hypothesis development includes the process by which you came to explore this research topic and research question. This can involve the background research performed by either researchers or by AI. This can also involve whether the idea was proposed by researchers or by AI.
    
    Answer: \involvementA{} % Developed by authors; no AI ideation
    
    Explanation: The theoretical questions, hypotheses, and scope (projection of Wasserstein flows, invariance properties, and c-Wasserstein stability) were developed by the authors without AI assistance beyond standard search/reading tools.
    
    \item \textbf{Experimental design and implementation}: This category includes design of experiments that are used to test the hypotheses, coding and implementation of computational methods, and the execution of these experiments.
    
    Answer: \involvementNA{} % No experiments/code artifacts in this theory paper
    
    Explanation: The paper presents mathematical derivations and proofs only; there are no empirical experiments or released implementations.
    
    \item \textbf{Analysis of data and interpretation of results}: This category encompasses any process to organize and process data for the experiments in the paper. It also includes interpretations of the results of the study.
    
    Answer: \involvementNA{}
    
    Explanation: No datasets were used; results are theoretical statements with formal proofs and stated assumptions.
    
    \item \textbf{Writing}: This includes any processes for compiling results, methods, etc. into the final paper form. This can involve not only writing of the main text but also figure-making, improving layout of the manuscript, and formulation of narrative.
    
    Answer: \involvementA{}
    
    Explanation: Writing and editing were performed by the authors using LaTeX. No generative AI systems were used to draft or edit the scientific content.
    
    \item \textbf{Observed AI Limitations}: What limitations have you found when using AI as a partner or lead author?
    
    Description: Not applicable (no AI systems were used in ideation, analysis, coding, or writing).
\end{enumerate}

\newpage
\section*{Agents4Science Paper Checklist}
\begin{enumerate}
\item {\bf Claims}
    \item[] Question: Are the claims made in the abstract and introduction supported by the results?
    \item[] Answer: \answerYes{}
    \item[] Justification: Claims (projection to natural gradient, baseline invariance, parameterization covariance, Gaussian/covariance updates, and c-Wasserstein energy decay) are supported by Theorems/Propositions with proofs and stated assumptions (see Theorem~\ref{thm:projection}, Proposition~\ref{prop:baseline}, Theorem~\ref{thm:covariance}, Lemma~\ref{lem:gaussian}, Theorem~\ref{thm:cw-energy}).

\item {\bf Limitations}
    \item[] Question: Did you describe the limitations of your work?
    \item[] Answer: \answerYes{}
    \item[] Justification: A dedicated Limitations section enumerates scope and assumptions (e.g., frozen-critic regime, smoothness/weak-form conditions, and when equivalences fail).

    \item[] Question: Did you discuss any potential negative societal impacts of your work?
    \item[] Answer: \answerNA{}
    \item[] Justification: The work is purely theoretical and does not involve deployment, datasets, or application domains with direct societal impact.

    \item[] Question: Did you discuss the failure modes of your method?
    \item[] Answer: \answerYes{}
    \item[] Justification: The paper discusses conditions where alignment with NPG holds vs. fails (e.g., mixtures/squashing), boundary/regularity requirements, and stability caveats beyond the frozen-critic setting.

\item {\bf Reproducibility}
    \item[] Question: Is your code and data publicly available?
    \item[] Answer: \answerNA{}
    \item[] Justification: No code or datasets are released; the paper contains only theoretical results, and none of them depend on code or data.

    \item[] Question: Does the paper provide sufficient details to reproduce the main results (either in the text or as supplemental material)?
    \item[] Answer: \answerYes{}
    \item[] Justification: Full derivations with assumptions, weak-form statements, and proofs are provided in the main text and appendix.

\item {\bf Experimental setting/details}
    \item[] Question: Does the paper specify all the training and test details (e.g., data splits, hyperparameters, how they were chosen, type of optimizer, etc.) necessary to understand the results?
    \item[] Answer: \answerNA{}
    \item[] Justification: Not applicable; there are no experiments.

\item {\bf Experiment statistical significance}
    \item[] Question: Does the paper report error bars suitably and correctly defined or other appropriate information about the statistical significance of the experiments?
    \item[] Answer: \answerNA{}
    \item[] Justification: Not applicable; there are no experiments.

\item {\bf Experiments compute resources}
    \item[] Question: For each experiment, does the paper provide sufficient information on the computer resources (type of compute workers, memory, time of execution) needed to reproduce the experiments?
    \item[] Answer: \answerNA{}
    \item[] Justification: Not applicable; there are no experiments.

\item {\bf Code of ethics}
    \item[] Question: Does the research conducted in the paper conform, in every respect, with the Agents4Science Code of Ethics (see conference website)?
    \item[] Answer: \answerYes{}
    \item[] Justification: Theoretical work with no human/animal subjects, sensitive data, or prohibited practices.

\item {\bf Broader impacts}
    \item[] Question: Does the paper discuss both potential positive societal impacts and negative societal impacts of the work performed?
    \item[] Answer: \answerNA{}
    \item[] Justification: Not applicable to this purely theoretical contribution; no deployment or application domain is studied.
\end{enumerate}
\end{document}
