% !TEX root = ../main.tex
\begin{figure}
  \centering
  \includegraphics[width=1\linewidth]{figures/method/first-figure-compressed-2.pdf}
  \caption{
  \textbf{(Left):}
      Illustration of our approach.
      We start by obtaining tags for images in the dataset.
      We then analyze the effect of those tags on the model's performance.
      In particular, we consider different combinations of tags and assess the model's performance on images representing those tags.
      Combinations of tags that lead to a significant drop in accuracy are identified as failure modes.
  \textbf{(Right):}
    Previous research leverages the vision-language representation space to extract failure modes.
    However, it's worth noting that the representation space may not necessarily serve as an ideal proxy for the human-understandable (semantic) space.
    We refer to Section~\ref{subsec:rev} for more details.
  }
  \label{fig:main_fig}
\end{figure}
