\begin{figure}[t]
    \centering
    % Token grid: the input image followed by one cross-attention map per report
    % token, laid out as two rows of eight images with the token text beneath
    % each image. \resizebox scales the whole grid to 0.85\textwidth.
    % NOTE(review): the image files and, mid and lung are each included twice
    % (once per occurrence of the word in the report). If the attention maps
    % differ between the two occurrences, the second occurrence needs its own
    % image file -- please confirm against the generated maps.
    \resizebox{0.85\textwidth}{!}{%
    \begin{tabular}{c c c c c c c c}
        % Row 1, images. Only the first image sets both width and height, which
        % forces it to a square and distorts it if the file is not square.
        % NOTE(review): presumably intentional so it lines up with the maps;
        % add keepaspectratio if distortion is not wanted.
        \includegraphics[width=0.11\textwidth, height=0.11\textwidth]{images/tokens/original} &
        \includegraphics[width=0.11\textwidth]{images/tokens/start} &
        \includegraphics[width=0.11\textwidth]{images/tokens/patchy} &
        \includegraphics[width=0.11\textwidth]{images/tokens/consolidation} &
        \includegraphics[width=0.11\textwidth]{images/tokens/in} &
        \includegraphics[width=0.11\textwidth]{images/tokens/the} &
        \includegraphics[width=0.11\textwidth]{images/tokens/right} &
        \includegraphics[width=0.11\textwidth]{images/tokens/mid} \\
        % Row 1, labels. The first cell is deliberately empty: the first image
        % has no token label.
        & \textit{[start]} & \textit{patchy} &
        \textit{consolid.} & \textit{in} & \textit{the} & \textit{right} & \textit{mid} \\
        % Spacer row separating the two image rows; the zero-width rule is invisible.
        % NOTE(review): the height of this row is likely governed by the tabular
        % strut rather than by the 0.2cm rule. If an exact gap is wanted, drop
        % this row and give the previous row end an explicit length instead.
        \rule{0pt}{0.2cm} \\
        % Row 2, images.
        \includegraphics[width=0.11\textwidth]{images/tokens/and} &
        \includegraphics[width=0.11\textwidth]{images/tokens/lower} &
        \includegraphics[width=0.11\textwidth]{images/tokens/lung} &
        \includegraphics[width=0.11\textwidth]{images/tokens/and} &
        \includegraphics[width=0.11\textwidth]{images/tokens/mid} &
        \includegraphics[width=0.11\textwidth]{images/tokens/left} &
        \includegraphics[width=0.11\textwidth]{images/tokens/lung} &
        \includegraphics[width=0.11\textwidth]{images/tokens/end} \\
        % Row 2, labels.
        \textit{and} & \textit{lower} & \textit{lung} & \textit{and} & \textit{mid} & \textit{left} & \textit{lung} & \textit{[end]} \\
    % The trailing % keeps the line break from becoming a space inside the
    % scaled box, which would shift the grid off-centre.
    \end{tabular}%
    }
    \caption{An input CXR image in posterior-anterior view (note that left and right are therefore mirrored), together with the cross-attention maps for the tokens of the corresponding text report. Red indicates high activations and blue indicates low activations. The white boxes mark the ground-truth bounding boxes.}
    \label{fig:tokens}
    %\vspace{-2.0cm}
\end{figure}
