% Cross-attention comparison figure (fig:layers).
% Layout: two rows separated by a horizontal rule.
%   Row 1 (LoRA-CLIP): all layers | layers 5--7
%   Row 2 (CXR-BERT):  all layers | layers 5--7
% The trailing `%' inside each \subfigure argument keeps the line break from
% turning into an inter-word space, which would otherwise pad the panel box
% and push the image off-centre relative to its sub-caption.
\begin{figure}[ht]
  \centering
  % --- Row 1: LoRA-CLIP ---
  \fbox{\textbf{LoRA-CLIP}}\\[0.5ex]
  \subfigure[All layers]{%
    \includegraphics[width=0.3\textwidth]{images/lora_attn_all}%
    \label{fig:clip_all}%
  }
  \quad
  \subfigure[Layers 5--7]{%
    \includegraphics[width=0.3\textwidth]{images/lora_attn_5_6_7}%
    \label{fig:lora_layers}%
  }
  
  % Visual separator between the two model rows (the blank lines end the
  % paragraph so that the rule is placed in vertical mode).
  \vspace{1ex}
  \hrule
  \vspace{1ex}
  
  % --- Row 2: CXR-BERT ---
  \fbox{\textbf{CXR-BERT}}\\[0.5ex]
  \subfigure[All layers]{%
    \includegraphics[width=0.3\textwidth]{images/cxr-bert-all}%
    \label{fig:cxrbert_all}%
  }
  \quad
  \subfigure[Layers 5--7]{%
    \includegraphics[width=0.3\textwidth]{images/cxr-bert-5-6-7}%
    \label{fig:cxrbert_layers}%
  }
  
  % Main caption; \label follows \caption so that it picks up the figure number.
  \caption{Comparison of cross-attention maps between conditioning provided by a CLIP model fine-tuned via LoRA~\cite{hu2022lora} and conditioning provided by CXR-BERT. It can be seen that LoRA provides no localization capabilities and can consequently profit from the lower resolution through selecting the middle layers, while this does not work on CXR-BERT.}
  \label{fig:layers}
\end{figure}
