\begin{figure}[t]
  \centering
  \scalebox{0.9}{\input{wss_framework_tikz}}  % your .tex file
  \caption{
 Overview of the weakly supervised segmentation (WSS) framework. A predictor-guided rectified flow model generates a counterfactual reconstruction, and the residual image with respect to the input yields the segmentation mask.}
 % Overview of the weakly-supervised segmentation (WSS) framework, where a predictor-guided rectified flow model performs counterfactual image translation to obtain weakly-supervised segmentations.}
  \label{fig:wss_framework}
\end{figure}

Our method leverages pretrained foundation models in a plug-and-play manner to extract weakly supervised segmentations of lung nodules. Specifically, we combine MAISI-V2, a state-of-the-art 3D rectified flow model for medical image synthesis, with two alternative predictor models pretrained on large-scale medical imaging data: MedSAM~\cite{ma2024segment} and RadImgNet~\cite{mei2022radimagenet}. The predictor model is used to guide the generative sampling process towards a counterfactual image corresponding to the absence of lung nodules. Concretely, given a CT volume, we steer the generative trajectory such that the predicted probability of nodule presence is reduced. The weak segmentation mask is then obtained by computing the voxel-wise absolute difference between the original image and the guided counterfactual sample. An overview of the framework is shown in Fig.~\ref{fig:wss_framework}.

\paragraph{Rectified flow.} 
Rectified flow learns a transport map between a source distribution $\pi_0$ and a target distribution $\pi_1$~\cite{rectified-flow}. The model parameterizes a time-dependent vector field $v_\theta(X_t, t)$, represented by a neural network with learnable parameters $\theta$, which transforms samples $X_0 \sim  \pi_0$ into $X_1 \sim \pi_1$ by solving the ordinary differential equation (ODE): 
\begin{equation} \label{ode}
    \mathrm{d}X_t = v_\theta(X_t, t)\, \mathrm{d}t.
\end{equation}
The vector field $v_\theta(X_t, t)$ is learned by minimizing the least squares regression objective:
\begin{equation}\label{fm-loss}
    \mathcal{L}(\theta) = \mathbb{E}_{t, X_0, X_1} \left[ \left\| (X_1 - X_0) - v_\theta(X_t, t) \right\|^{2} \right],
\end{equation} 
where $X_t = tX_1 + (1-t)X_0$ with $t \in [0,1]$. This formulation encourages %more
linear flows, enabling
high-quality results with few sampling steps when solving the ODE in Eq.~\ref{ode}. 

In practice, rectified flow can be performed in a learned latent space using an autoencoder that maps images $X \in \mathbb{R}^{H\times W\times D}$ to latent representations $z \in \mathbb{R}^{h\times w\times d}$. The rectified flow is then applied in the lower-dimensional latent space, resulting in improved scalability for high-dimensional data~\cite{esser2024scaling}.

\begin{figure}[h]
  \centering
  \scalebox{0.65}{\input{tfg_framework_tikz}}  % your .tex file
  \caption{Overview of the proposed plug-and-play framework for predictor-guided rectified flow in latent space, performed at inference. The symbol {\small\textcolor{cyan!40!white}{\faSnowflake}} indicates that the models are frozen.}
  \label{fig:tfg_framework}
\end{figure}

\paragraph{Training-free guidance.}
In order to avoid costly retraining of the generative model, we adapt the training-free guidance (TFG) framework~\cite{bansal2023universal,ye2024tfg,yu2023freedom}, which enables guiding an arbitrary generative model using a predictor model, rather than training a new conditional model from scratch. We fine-tune a pretrained backbone to serve as a target predictor.  

We guide the unconditional MAISI-V2 rectified flow model using a guidance strategy inspired by FlowChef~\cite{patel2025flowchef}. A brief outline of the method is provided below; see also Algorithm~\ref{alg:tfg} and Fig.~\ref{fig:tfg_framework} for an overview. We omit all time step indices for brevity. First, instead of starting from pure noise, a CT volume $X$ is encoded into a lower-dimensional latent representation $z$ using the variational encoder $\mathcal{E}$. The latent representation $z$ is then perturbed to a predetermined intermediate time step $\tau$ by applying the backward Euler method to Eq.~\ref{ode}, in order to preserve anatomical structures during reconstruction. A clean latent estimate is computed as 
\begin{equation}\label{clean_latent_estimation}
     \hat{z} \gets z +  v_\theta(z, t) \cdot (T-t)  \cdot dt,
\end{equation}
which is then decoded by the variational decoder $\mathcal{D}$ to obtain a reconstruction $\hat{X}$ in image space. This allows us to use the reconstruction $\hat{X}$ as input to the target predictor, yielding predictions $\hat{y}$ used to compute the loss $\mathcal{L}(\hat{y}, y)$, where $y$ denotes the guiding label and $\mathcal{L}$ is the binary cross-entropy loss. The intermediate latent variable is then guided via the gradient update
\begin{equation}\label{guidance}
     z \gets z - s \cdot \nabla_{\hat{z}} \mathcal{L}(\hat{y}, y),
\end{equation}
where $s$ denotes the guidance strength. The guided latent is subsequently updated according to Eq.~\ref{ode}, and this procedure is repeated until the final time step is reached. The weakly supervised segmentation is finally obtained as the absolute difference between the guided generated image and the original image.  

\begin{algorithm}
\fontsize{8pt}{10pt}\selectfont
\caption{Weak lung nodule segmentation via plug-and-play guidance}\label{alg:tfg}
\begin{algorithmic}
\Statex \textbf{Hyperparameters:} strength $s$, discretization $T$, intermediate time $\tau$, steps $m$
\Statex \textbf{Input:} generative model $v_\theta$, encoder $\mathcal{E}$, decoder $\mathcal{D}$, predictor $f$, label $y$, loss $\mathcal{L}$, CT $X$
\State $z \gets \mathcal{E}(X)$, $dt \gets 1/T$ \Comment{Encode input volume}

\For{$t \in \{T, \dots, \tau\}$} \Comment{Perturb to intermediate step $\tau$}
\State $z\gets \textsc{BackwardEuler}(v_\theta, z, t, dt)$
\EndFor
\For{$t \in \{\tau, \dots, T\}$}
\If{$t < \tau + m$} \Comment{Apply guidance for $m$ steps}
\State $\hat{z} \gets z +  v_\theta(z, t) \cdot (T-t)  \cdot dt$ \Comment{One-step clean latent estimate}
\State $\hat{X} \gets \mathcal{D}(\hat{z})$ \Comment{Decode clean latent estimate}
\State $\hat{y} \gets f(\hat{X})$ \Comment{Compute predictor output}
\State $z \gets z - s \cdot \nabla_{\hat{z}} \mathcal{L}(\hat{y}, y)$ \Comment{Latent guidance update}
\EndIf
\State $z \gets \textsc{ForwardEuler}(v_\theta, z, t, dt)$
\EndFor
\State $X^* \gets \mathcal{D}(z)$ \Comment{Final reconstruction}
\Statex \textbf{Return} $|X^* - X|$ \Comment{Weak segmentation mask} 
\end{algorithmic}
\end{algorithm}

%\subsubsection{Weakly-supervised segmentation} Our method utilizes weakly-supervised training of a predictor model, and in our case, it is composed of a pretrained backbone and a classification head. We first finetune the predictor model, where it is learned to distinguish between slices that contain lung nodules or not, and it is done using image-level labels only. In this work, we use the pretrained encoder backbone of RadImgNet \cite{mei2022radimagenet} and TinyVit MedSAM \cite{ma2024segment}, as they have already been trained on a large amount of medical images.   
%Extracting the weakly-supervised segmentation of lung nodules from the predictor model in the TFG framework is then performed by 
%- Highlight that it is harder to obtain implicit segmentations of smaller structures such as nodules, etc.? \\
%- Argue that our method requires little manual intervention. Only weakly-supervised training on pretrained MedSAM. Otherwise plug-and-play with pretrained models. Very little post-processing (simple thresholding).  \\
%- Describe classifier guidance \\
%- Describe weakly-supervised \\
%- Describe flow matching (diffusion?) \\