\documentclass{midl} % Include author names

% The following packages will be automatically loaded:
% jmlr, amsmath, amssymb, natbib, graphicx, url, algorithm2e
% ifoddpage, relsize and probably more
% make sure they are installed with your latex distribution

\usepackage{todonotes}
\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{algpseudocode}

\usepackage[final]{changes}
%\definechangesauthor[name={Luis Jesús Marhuenda Tendero}, color=teal]{LJMT}
%\definechangesauthor[name={Miquel Obrador Reina}, color=brown]{MOR}

% Accent colours, given as hexadecimal RGB triplets (two hex digits per channel).
\definecolor{myred}{HTML}{FF0000}   % pure red:   (255, 0, 0)
\definecolor{mygreen}{HTML}{009600} % dark green: (0, 150, 0)

\usepackage{mwe} % to get dummy images
\jmlryear{2025}
\jmlrworkshop{Full Paper -- MIDL 2025}
\jmlrvolume{-- 100}
\editors{Accepted for publication at MIDL 2025}

\title[A VED Model for Difference Medical VQA]{Unveiling Differences: A Vision Encoder-Decoder Model for Difference Medical Visual Question Answering}

 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % Two authors with the same address
 % \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\and
 %  \Name{Author Name2} \Email{xyz@sample.edu}\\
 %  \addr Address}

 % Three or more authors with the same address:
 % \midlauthor{\Name{Author Name1} \Email{an1@sample.edu}\\
 %  \Name{Author Name2} \Email{an2@sample.edu}\\
 %  \Name{Author Name3} \Email{an3@sample.edu}\\
 %  \addr Address}


% Authors with different addresses:
% \midlauthor{\Name{Author Name1} \Email{abc@sample.edu}\\
% \addr Address 1
% \AND
% \Name{Author Name2} \Email{xyz@sample.edu}\\
% \addr Address 2
% }

%\footnotetext[1]{Contributed equally}

% More complicated cases, e.g. with dual affiliations and joint authorship
\midlauthor{\Name{Luis-Jesus Marhuenda\midlotherjointauthor\nametag{$^{1}$}} \orcid{0009-0009-3213-1308} \Email{ljmarten@prhlt.upv.es}\\
\Name{Miquel Obrador-Reina\midlotherjointauthor\nametag{$^{1}$}} \orcid{0009-0009-8463-7749} \Email{mobrrei@prhlt.upv.es}\\
\Name{Mohamed Aas-Alas\midljointauthortext{Contributed equally}\nametag{$^{1}$}} \orcid{0009-0005-2413-8164} \Email{maasala@prhlt.upv.es}\\
\Name{Alberto Albiol\nametag{$^{1}$}} \orcid{0000-0002-1970-3289} \Email{alalbiol@prhlt.upv.es}\\
\Name{Roberto Paredes\nametag{$^{1}$}} \orcid{0000-0002-5192-0021} \Email{rparedes@prhlt.upv.es}\\
\addr $^{1}$ Campus de Vera, Universitat Politècnica de València, Camí de Vera s/n, 46022 Valencia, Spain\\
}

\begin{document}

\maketitle

\begin{abstract}
	Difference Medical Visual Question Answering (Diff-VQA), a specialized subfield of Medical VQA, tackles the critical task of identifying and describing differences between pairs of medical images. This study introduces a novel Vision Encoder-Decoder (VED) architecture tailored for this task, focusing on the comparison of chest X-ray images to detect and explain changes. The proposed model incorporates two key innovations: (1) a lightweight Transformer text decoder architecture capable of generating precise and contextually relevant answers to complex medical questions, and (2) an enhanced fusion mechanism that improves the model's ability to distinguish between two input images, enabling more accurate comparison of radiological findings. Our approach excels in identifying significant changes, such as pneumonia and lung opacity, demonstrating its utility in automating preliminary radiological assessments. By leveraging large-scale, domain-specific datasets and employing advanced training strategies, our VED architecture achieves state-of-the-art performance on standard VQA metrics, setting a new benchmark in diagnostic accuracy. These advancements highlight the potential of Diff-VQA to enhance clinical workflows and support radiologists in making more precise, informed decisions.
\end{abstract}

\begin{keywords}
	Difference Visual Question Answering, Vision Encoder-Decoder Model, Transformers, Medical Imaging
\end{keywords}

\input{sections/intro}
\input{sections/sota}
\input{sections/approach}
\input{sections/results}
\input{sections/conclu}

\clearpage  % Acknowledgements, references, and appendix do not count toward the page limit (if any)
% Acknowledgments---Will not appear in anonymized version
%\midlacknowledgments{We would like to express our sincere gratitude to the PROMETEO project, supported by the Conselleria d'Educació, Universitats i Ocupació under the framework of the PROMETEO Program for Research Excellence Groups. This work was developed within the scope of the LightVED project (CIPROM 2023/17), which aims to advance efficient Vision Encoder-Decoder models for multimodal transcription tasks. We also thank all the collaborators and institutions that contributed to this investigation through their insight and resources.}
\midlacknowledgments{This work was supported by the Generalitat Valenciana under the grant CIPROM/2023/17.}

\bibliography{midl25_100}

\appendix


\section{IDE Implementation}
\label{appendix:ide}
\begin{algorithm}[htbp]
\caption{IDE PyTorch Pseudocode}
% Python-style pseudocode, typeset verbatim: a learnable (2, decoder_dim)
% tensor is added to the per-image features before the two images are
% concatenated along the sequence axis.
% NOTE(review): torch.nn.Parameter requires a tensor argument; the zero
% initialiser below is a placeholder -- confirm the initialisation actually used.
% Straight quotes are used for the einops pattern because verbatim prints
% characters literally (a TeX-style `...' pair is not a valid Python string).
\begin{verbatim}
# DEFINE IDE
IDE = torch.nn.Parameter(torch.zeros(2, decoder_dim)) # Learnable IDE tensors

def forward(images):
    imgs_features = encode(images) # (b, 2, 144, 1024)
    # IMAGES FEATURES ARE (batch_size, num_images, seq_len, decoder_dim)
    ide = IDE.unsqueeze(1) # (2, 1024) -> (2, 1, 1024)
    imgs_features += ide # broadcasts over batch_size and seq_len
    features_concat = rearrange(imgs_features, 'b d1 d2 d3 -> b (d1 d2) d3')
\end{verbatim}
\end{algorithm}

\end{document}
