\appendix

\section{Datasets} In this work, we use 4 publicly available datasets. Here, we provide a detailed description of the datasets. We use the Eye Gaze Data for Chest X-rays dataset~\cite{karargyris2021creation} ($n=1083$), and for evaluation we use the REFLACX dataset~\cite{bigolin2022reflacx,lanfredi2021reflacx}. For zero-shot classification, we use the following datasets. \textbf{Cell Pneumonia Classification:} In this dataset, the CXRs were selected from pediatric patients aged 1--5 years from Guangzhou Women and Children's Medical Center, Guangzhou. We use the test set, which consists of 234 normal CXR images and 390 pneumonia (viral and bacterial) CXR images. \textbf{RSNA Pneumonia Detection Challenge Dataset:} This dataset was made public by the Radiological Society of North America (RSNA). We use the test set, which consists of 4527 CXR images. \textbf{China:} This dataset was obtained from Shenzhen No.~3 People's Hospital in China. We use the test set, which has 284 normal CXRs and 378 CXRs with tuberculosis. \textbf{Montgomery County dataset:} This dataset was acquired from the Department of Health and Human Services, Montgomery County, Maryland, USA. We use the test set, which has 80 normal CXRs and 58 CXRs with tuberculosis. In Figure~\ref{fig:appendix_datasets}, we show images containing different pathologies from these datasets.

\begin{figure}[htbp]
\floatconts
  {fig:appendix_datasets}
  {\caption{\textbf{Dataset description.} CXR images from the different datasets, showing different diseases.}}
  {\includegraphics[width=1.0\linewidth]{figures/appendix_datasets.pdf}}
\end{figure}

\section{Human Visual Attention (HVA) computation}
\label{appendix_hva}
\textbf{Focal HVA.} Given radiologists' eye gaze patterns $\mathbb{G}_r$, the set of fixation points for a radiologist $i$ is represented as $g_i\in\mathbb{R}^{\mathbb{G}_i}$. For the focal HVA calculation, we select the cluster of points $\mathcal{C}_k^I\subset g_i$, where $k\in\{1,2,\dots,K\}$ indexes the clusters and $K$ is the total number of random clusters, such that $\forall (x_k,y_k)\in\mathcal{C}_k^I:\norm{x_k- y_k}_\mathcal{D}\leq\norm{x_j-y_j}_\mathcal{D}$ for $j\neq k$, as shown in \figureref{fig:appendix_hva}.\\
\textbf{Global HVA.} Similar to the focal HVA, in the global HVA calculation we select the cluster of points $\mathcal{C}_k^D\subset g_i$, such that $\forall (x_k,y_k)\in\mathcal{C}_k^D:\norm{x_k- y_k}_\mathcal{D}\geq \mathbf{c}$, where $\mathbf{c}\in\mathbb{R}$. Then, a multi-dimensional Gaussian filter with standard deviation $\sigma=64$ is used to generate these attention heatmaps, as shown in \figureref{fig:appendix_hva}.

\begin{figure}[htbp]
\floatconts
  {fig:appendix_hva}
  {\caption{\textbf{HVA computation.} The HVA maps along with the HVA edge maps are shown.}}
  {\includegraphics[width=0.8\linewidth]{figures/appendix_hva.pdf}}
\end{figure}

\newpage

\section{Additional quantitative results}
\input{tables/table_appendix_1}
\input{tables/table_appendix_2}

\newpage

\section{Additional figures}
\begin{figure}[htbp]
\floatconts
  {fig:appendix_finetune}
  {\caption{\textbf{Finetuning diffusion models.} We show that the CXRs generated by the Stable Diffusion and ControlNet models without finetuning look unrealistic, whereas after finetuning the generated X-rays look realistic for the mentioned prompt.}}
  {\includegraphics[width=0.8\linewidth]{figures/appendix_1.pdf}}
\end{figure}

\begin{figure}[htbp]
\floatconts
  {fig:appendix_normal}
  {\caption{\textbf{Qualitative result (normal).} We show a normal CXR generated by \textit{GazeDiff} using the radiologist's transcript as the text condition.}}
  {\includegraphics[width=1.0\linewidth]{figures/appendix_normal.pdf}}
\end{figure}