% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{booktabs,makecell, multirow, tabularx}
\usepackage{amssymb} 



%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Overcoming Language Priors for Visual Question Answering via Loss Rebalancing Label and Global Context}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Runlin Cao}
\author[1,2]{Zhixin Li{\thanks{Zhixin Li is the corresponding author.}}} 
% Add affiliations after the authors

\affil[1]{%
    Key Lab of Education Blockchain and Intelligent Technology, Ministry of Education, \newline Guangxi Normal University, Guilin 541004, China
}
\affil[2]{%
    Guangxi Key Lab of Multi-source Information Mining and Security, Guangxi Normal University, Guilin 541004, China
}


  \begin{document}
\maketitle

\begin{abstract}
Despite the advances in Visual Question Answering (VQA), many VQA models currently suffer from language priors (i.e. generating answers directly from questions without using images), which severely reduces their robustness in real-world scenarios. We propose a novel training strategy called Loss Rebalancing Label and Global Context (LRLGC) to alleviate the above problem. Specifically, the Loss Rebalancing Label (LRL) is dynamically constructed based on the degree of sample bias to accurately adjust losses across samples and ensure a more balanced form of total losses in VQA. In addition, the Global Context (GC) provides the model with valid global information to assist the model in predicting answers more accurately. Finally, the model is trained through an ensemble-based approach that retains the beneficial effects of biased samples on the model while reducing their importance. Our approach is model-agnostic and enables end-to-end training. Extensive experimental results show that LRLGC (1) improves performance for various VQA models and (2) performs competitively in the VQA-CP v2 benchmark test.
\end{abstract}

\section{Introduction}
Visual Question Answering (VQA) aims to answer questions based on a given image. It is a multimodal task that combines vision and language. The fundamental approach involves identifying the image region that is most relevant to the question and generating the most suitable answer by leveraging the information within the image. In recent years, VQA has made significant progress, thanks to the ongoing advancements in computer vision and natural language processing \citep{ref1, ref2, ref3, ref4, ref5}.


\begin{figure}[t!] 
	\centering
	\includegraphics[width=\linewidth,scale=1.00]{myfig1_2}
	\caption{(A) Displays the language priors in the VQA model. The model always tends to predict answers with larger samples in the dataset, e.g., ``white'' sky and ``green'' grass. (B) LRLGC can overcome the class imbalance in the VQA dataset by rescaling the total VQA loss to a more balanced form.}
	\label{myfig1}
\end{figure}


Although existing models perform well on many VQA datasets, such as VQA v2 \citep{ref7}, most machine learning datasets are inevitably biased. Some researchers have recently found that most existing VQA models rely heavily on language priors, which are apparent statistical correlations between questions and answers \citep{ref6,ref7,ref8,ref9,ref10}. These models tend to ignore the image content when predicting the answer. As shown in Fig.~\ref{myfig1}(A), for questions like ``What color is...'' the model tends to give the answers that are most frequent in the training set and to ignore the content of the images. For example, in the training set, ``white'' skies are more common than ``red'' skies, and ``green'' grass is more common than ``brown'' grass. This fragile generalization degrades sharply when the distribution of answers differs between the training and test sets, which greatly limits the application of existing models in the real world. To alleviate the language priors problem in VQA models, \citet{ref10} proposed the VQA-CP dataset, which features question-answer distributions that vary between training and test splits. As the majority of the state-of-the-art VQA models acquire language biases from the training data, their accuracy on the VQA-CP dataset is significantly reduced.



To overcome the adverse effects of language priors on VQA models, recent research can be broadly classified into three categories. The first category is annotation-based approaches \citep{ref12,ref33}, which use additional visual information to increase the significance of the image, i.e., they try to match the visual attention of the VQA model with human visual attention to ensure that the model can successfully use visual information. However, annotation-based methods require expensive manual annotation \citep{ref12,ref33}, and such annotations are scarce and not readily available. Moreover, recent work \citep{ref35,ref42} has demonstrated that the improvement in accuracy arises from regularization rather than from improved visual grounding. The second category is data-balanced methods \citep{ref13,ref32,ref43}, which balance the dataset's bias by constructing new training samples. Counterfactual data augmentation techniques \citep{ref44,ref32,ref36,ref43} balance the training data by constructing counterfactual samples. To balance the training data, \citet{ref13,ref29} used a self-supervised framework to generate irrelevant question-image combinations. The data-balanced methods generally perform well and do not require additional manual annotation. However, they are likely to introduce new biases because the quality of the generated data cannot be guaranteed. Also, the increase in training samples leads to longer training time.



However, it is still a major challenge to make VQA models generalize well under unbalanced training data. Unlike the methods mentioned above, the third type of method is the ensemble-based method \citep{ref11,ref22,ref23,ref24,ref31,ref45}, which is a more efficient solution. Ensemble-based methods require neither additional manual annotations nor the generation of new training data. They usually use an ensemble strategy to combine the predicted outputs of the bias-only model and the VQA model and derive the training gradient from the fused answers. However, we believe the previous ensemble-based methods have shortcomings: 1) They tend to overcorrect for language biases: because they do not discriminate the degree of bias of samples well, they also impose large penalties on less biased samples. 2) Few models effectively
use the global context, although models can predict more accurately by fusing context and local information. 3) Most of them gain out-of-distribution performance on the VQA-CP v2 dataset at the cost of degrading the model's in-distribution performance on the VQA v2 dataset. Ideally, a robust VQA model should overcome the language priors while maintaining its performance on the in-distribution dataset.


Inspired by \citet{ref23,ref39}, we consider it crucial to rebalance the proportion of the loss value of each answer in the total VQA loss (cf.\ Fig.~\ref{myfig1}(B)). We propose a novel model-agnostic training scheme called Loss Rebalancing Label and Global Context (LRLGC), as shown in Fig.~\ref{myfig2}. It can overcome language priors and fully use global context. It mainly consists of three modules. (1) Loss Rebalancing Label Module: LRLGC uses the semantic similarity between the bias-only model's predicted answer and the ground-truth answer to determine sample bias. The Loss Rebalancing Label (LRL) is dynamically constructed based on the sample bias. It can assign lower weights to biased samples and ensure a more balanced total VQA loss. (2) Global Context Module: We propose the Global Context (GC) module to utilize the global context effectively. It focuses on globally valid information in images and questions and retains beneficial context priors in biased samples. (3) Ensemble Training Module: We use the ensemble training method \citep{ref11,ref23,ref31} to merge the predicted outputs of the debiased VQA model and the global context module in a single training process. By training with an ensemble-based approach, the beneficial effects of biased samples on the model are preserved while their importance is reduced. Following \citet{ref11,ref23,ref31}, our approach keeps only the base VQA module at inference time.




This paper's contributions are summarized as follows:

\begin{itemize}
    \item We propose a novel model-agnostic generic framework LRLGC that enables end-to-end training and can be easily integrated into various VQA models.
    \item We propose LRL and Global Context Module, which can effectively help the model overcome the language priors while preserving the contextual information.
    \item Experimental results show that LRLGC achieves competitive performance on the bias-sensitive VQA-CP v2 (60.91$\%$) without sacrificing performance on the in-distribution VQA v2 (60.81$\%$).
\end{itemize}



\section{Related Work}

\subsection{Visual Question Answering}
VQA attempts to understand visual content and natural language questions in order to predict appropriate answers \citep{ref1}. With the increasing demand for multimodal information understanding and the potential of VQA for powerful applications, VQA tasks have attracted a lot of attention in recent years \citep{ref1,ref3,ref21}. VQA has made significant progress and achieved good results on real images due to the development of deep learning techniques and the proposal of large-scale VQA datasets \citep{ref1,ref7,ref9}. Existing methods include attention-based \citep{ref3,ref15,ref16}, graph-based \citep{ref5,ref14,ref17}, and knowledge-based \citep{ref18,ref19} approaches. However, most existing models memorize language priors during training and neglect visual information, resulting in poor performance on a test set from a different domain.



\subsection{Overcoming Language Priors in VQA}
Many studies  \citep{ref10,ref11,ref12,ref13,ref22,ref25,ref31,ref32} have found that the majority of VQA models have serious language priors, which greatly limit the ability of VQA models to understand and generalize multimodal information. Although most existing models can achieve good results on datasets with the same answer distribution for both training and test sets, applying them to real-world scenarios is difficult due to the models' fragile generalization capabilities. In recent years, much work has been proposed to overcome language priors in VQA. These methods can be divided into three categories: annotation-based methods, data-balanced methods, and ensemble-based methods.

\textbf{Annotation-Based Methods.} The annotation-based method has shown its effectiveness in improving model generalization under external visual supervision. The importance of image regions is increased by HINT \citep{ref12} using the annotation of the VQA-HAT \citep{ref41} dataset. SCR \citep{ref33} matches correct answers and influential image regions to human text interpretation, thus reducing the sensitivity of incorrect answers to influential objects. Annotation-based methods thus strengthen the visual grounding by introducing additional human visual supervision, but all of them require manual annotation, which is very expensive. Furthermore, \citet{ref42} showed that the performance improvement is not a result of improved visual grounding but rather of a regularization effect that prevents overfitting to the language priors.


\textbf{Data-Balanced Methods.} SSL-VQA \citep{ref13} introduces a self-supervised framework to balance data bias by replacing relevant question-image pairs with irrelevant ones to generate additional data. CSS \citep{ref32} generates counterfactual training samples by masking critical objects in the images and words in the questions and assigning different ground-truth answers. Based on CSS \citep{ref32}, CL-VQA \citep{ref36} constructs positive and negative samples for counterfactual samples and uses contrastive learning for training. Augmenting the data to balance dataset biases does not require additional manual annotation. However, the additional generated data may introduce new biases, and it is challenging to ensure its quality.

\textbf{Ensemble-Based Methods.} The ensemble-based approach attempts to include an additional branch to account for language priors in order to mitigate their negative impact on the model. AdvReg \citep{ref37} uses an adversarial learning approach to prevent VQA models from capturing language biases in their question encoding. RUBi \citep{ref11} and LMH \citep{ref31} are fusion-based methods. This approach combines the two predicted outputs of the VQA model and the question-only branch together and serves as the final output of the VQA model in the training phase. It effectively prevents the VQA model from using bias for answer prediction. LPF \citep{ref23} and LP-Focal \citep{ref24} use the bias model's output distribution to reduce the bias samples' weight when calculating the VQA loss.


However, the ensemble-based method also compromises the ability of the model to learn context to some extent \citep{ref40}. To improve this problem, we propose our LRLGC training strategy that can reduce language biases while preserving the model's ability to learn context.


\begin{figure*}[t!]   
	\centering
	\includegraphics[width=\linewidth,scale=1]{myfig2_2}
	\caption{Overview of LRLGC training strategy. (A) An arbitrary VQA model. (B) A Bias Model captures language biases, and the Loss Rebalancing Label Constructor dynamically generates Loss Rebalancing Labels (LRL) for each biased sample. (C) A gated multi-headed self-attention mechanism captures global context. (D) Learning by the ensemble.}
	\label{myfig2}
\end{figure*}



\section{Method}


\subsection{Base VQA Module}
We denote the VQA dataset with $N$ training instances as $\mathcal{D}=\{I_i,Q_i,a_i\}_{i=1}^N$, where $I_i\in\mathcal{I}$, $Q_i\in\mathcal{Q}$ and $a_i\in\mathcal{A}$ denote the image, question, and ground-truth answer of the $i^{th}$ instance. The visual encoder $e_v$ and the question encoder $e_q$ encode $I_i$ and $Q_i$ to generate the embedding vectors $v_i=e_v\left(I_i\right)$ and $q_i=e_q\left(Q_i\right)$, respectively. The goal of the VQA model is to train a mapping function ${f}_{VQA}:\mathcal{I}\times\mathcal{Q}\rightarrow\mathcal{R}^\mathcal{A}$ that produces a correct distribution across answer space $\mathcal{A}$. The VQA task can generally be considered a multi-class classification task \citep{ref3,ref16}. We train the VQA model using a binary cross-entropy loss to optimize its learnable parameters:
\begin{equation}
P_{VQA}\left(\mathcal{A}|v_i,q_i\right)=Softmax\left({f}_{VQA}\left(\mathcal{A}|v_i,q_i\right)\right) 
\end{equation}
\begin{equation}
\begin{aligned}
\mathcal{L}_{VQA}=-\frac{1}{N}\sum_{i=1}^{N}\Bigl[t_i\log\left(\sigma\left({f}_{VQA}\left(\mathcal{A}|v_i,q_i\right)\right)\right) \\ {}+\left(1-t_i\right)\log\left(1-\sigma\left({f}_{VQA}\left(\mathcal{A}|v_i,q_i\right)\right)\right)\Bigr]
\end{aligned}
\end{equation}
where $t_i \in {[0,1]}^{\lvert \mathcal{A} \rvert}$ denotes the soft target score for $a_i$, and $\sigma\left(\cdot\right)$ denotes the sigmoid function.




\subsection{Loss Rebalancing Label Module}
In this part, we try to construct Loss Rebalancing Labels (LRL) for each biased sample and use the LRL to train the VQA model to reduce the negative impact of biased samples, i.e., to reduce the negative bias of multimodal features ${f}_{VQA}\left(\mathcal{A}|v_i,q_i\right)$. Before that, we need to capture the presence of language biases in the training samples.

\textbf{Bias Model captures the biases.} An intuitive method is to train a unimodal model that accepts only one of the two modalities as input to capture the biases in the VQA dataset \citep{ref11}. It is common practice to use the question-only model as a branch of the VQA model \citep{ref11,ref23,ref24}, but this unimodal feature contains only language modality information and lacks the use of image information. Inspired by \citet{ref13}, we swap the original image $I$ with an image $I^\prime$, which is chosen randomly from the image set $\mathcal{I}$. Considering the vast size of $\mathcal{I}$, the probability that $Q$ and $I^\prime$ are related is extremely low, i.e., the input question and the image are unrelated. A comparison of the effects of the question-only model and the bias model is shown in Table~\ref{table:4}.
Specifically, our bias model ${f}_{Bias}:\mathcal{R}^{{d}_{v}}\times\mathcal{R}^{{d}_{q}}\rightarrow\mathcal{R}^\mathcal{A}$ includes a question-only model ${f}_{QO}:\mathcal{R}^{{d}_{q}}\rightarrow\mathcal{R}^{{d}_{q}}$, which can be formalized as:
\begin{equation}
     q_i^{'}={f}_{QO}(q_i)\text{, } v_i^{'}=e_v(I_i^{'})
\end{equation}
\begin{equation}
{f}_{Bias}(\mathcal{A}|v_i^{'},q_i^{'})={clf}_{Bias}(m_{Bias}(v_i^{'}\odot q_i^{'}))
\end{equation}
where $\odot$ denotes the element-wise product, $m_{Bias}:\mathcal{R}^{d_q}\times\mathcal{R}^{d_v}\rightarrow\mathcal{R}^{{d}_{m}}$ denotes a multi-layer perceptron (MLP), and ${clf}_{Bias}:\mathcal{R}^{d_m}\rightarrow\mathcal{R}^\mathcal{A}$ denotes the classifier. Formally, binary cross-entropy losses are used to optimize the parameters of the bias model and the question-only model:
\begin{equation}
\begin{aligned}
\mathcal{L}_Q=-\frac{1}{N}\sum_{i=1}^{N}\Bigl[t_i\log\left(\sigma\left({f}_{QO}\left(q_i\right)\right)\right)\\{}+\left(1-t_i\right)\log\left(1-\sigma\left({f}_{QO}\left(q_i\right)\right)\right)\Bigr]
\end{aligned}
\end{equation}
\begin{equation}
\begin{aligned}
\mathcal{L}_B=-\frac{1}{N}\sum_{i=1}^{N}\Bigl[t_i\log\left(\sigma\left({f}_{Bias}\left(\mathcal{A}|v_i^\prime,q_i^\prime\right)\right)\right)\\{}+\left(1-t_i\right)\log\left(1-\sigma\left({f}_{Bias}\left(\mathcal{A}|v_i^\prime,q_i^\prime\right)\right)\right)\Bigr]
\end{aligned}
\end{equation}

\textbf{Filter Biased Samples (FBS).} Following \citet{ref11,ref25}, we classify the training samples as biased and unbiased. Biased samples are those for which the model can rely on the question alone to predict the answer, while unbiased samples require both the image and the question to infer the answer.

\begin{equation}
sim(\mathrm {f}_{Bias},\mathrm {f}_{GT})=\frac{\mathrm {{f}_{Bias}}^{\top}  \mathrm {f}_{GT}}{\left \| \mathrm {f}_{Bias} \right \| \left \|\mathrm {f}_{GT}\right \|}
\end{equation}
where $sim\left(\cdot,\cdot\right)$ denotes a similarity score function, and a higher value indicates a higher probability that the sample is biased. In practice, we use cosine similarity as the similarity function. $\mathrm{f}_{Bias}$ and $\mathrm{f}_{GT}$ denote 300-dimensional GloVe word-embedding vectors, where $\mathrm{f}_{Bias}$ embeds the answer predicted by the bias model and $\mathrm{f}_{GT}$ embeds the ground-truth answer. We assume that all candidate answers are independent of each other. Therefore, they cannot be effectively modeled by a recurrent neural network (e.g., a GRU) in the way the question sentence is. For answers containing multiple words (e.g., ``knife and spoon''), we use the sum of these word vectors to represent them.

The VQA dataset answer types are ``yes/no'', ``number'' and ``other''. When the answer type is ``other'', we consider a sample biased if $\mathrm{f}_{Bias}$ is close to $\mathrm{f}_{GT}$ in the semantic space. Unlike the ``other'' questions, the predicted answers to the ``yes/no'' and ``number'' questions have a strong semantic correlation with the underlying ground-truth answers, which is not conducive to determining biased samples through similarity in the semantic space. Therefore, for the ``yes/no'' (y/n) and ``number'' (num) answer types, a sample is considered biased only when $\mathrm{f}_{Bias}$ equals $\mathrm{f}_{GT}$.
\begin{equation}
\text{biased sample}\in\begin{cases} sim(\mathrm {f}_{Bias},\mathrm {f}_{GT})>\alpha,\,at \text{ is other}
 \\ \mathrm {f}_{Bias}=\mathrm {f}_{GT},\,at \text{ is y/n or num }
\end{cases}
\end{equation}
where $at$ denotes the answer type and $\alpha$ is a hyperparameter. 

\textbf{Construct LRL and train the model using LRL.} The bias model captures the language biases, and we want to use the language biases to construct LRLs for biased samples. LRL helps the model focus more on the training samples that cannot be answered correctly from the language modality alone. For unbiased training samples, the LRLs are simply the ground-truth answers. In contrast, for the biased samples, we use the down-weighted LRL as the label to train the model, thus reducing the harmful effects of biased samples on the model.
\begin{equation}
LRL=\left\{\begin{matrix}t_i\left(1-P_{Bias}\left(\mathcal{A}|v_i^\prime,q_i^\prime\right)\right)^\beta,&biased\\t_i,&unbiased\\\end{matrix}\right.
\end{equation}
\begin{equation}
P_{Bias}(\mathcal{A}|v_i^{'},q_i^{'})=Softmax({f}_{Bias}(\mathcal{A}|v_i^{'},q_i^{'})) 
\end{equation}
where $\beta$ is a hyperparameter. A larger $\beta$ indicates a stronger penalty for that biased sample. Using LRLs for training, we can obtain the model debiased predicted output ${P}_{VQA}\left(LRL|v_i,q_i\right)$.


\subsection{Global Context Module}

\textbf{Multi-head Self-Attention.} We use a multi-headed self-attention mechanism \citep{ref26} to capture the correlation between the image and the question.
\begin{equation}
    Att(Q,K,V)=Softmax\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V
\end{equation}
where $Att\left(\cdot,\cdot,\cdot\right)$ denotes the self-attention mechanism, and the $Softmax\left(\cdot\right)$ operation is performed on each row. $Q$, $K$, and $V$ denote the query, key, and value, respectively, and $d_k$ is the channel dimension of $Q$ and $K$.
\begin{equation}
    h_i=Att\left(XW_i^Q,XW_i^K,XW_i^V\right)
\end{equation}
\begin{equation}
{f}_{MSA}\left(X\right)=Concat\left(h_1,...,h_H\right)+X
\end{equation}
where $h_i$ refers to the output of the $i^{th}$ head, and $H$ denotes the number of heads. $Concat\left(\cdot\right)$ represents concatenating the results of multiple heads.

\textbf{Gate Mechanism.} The visual and textual modalities can interact and integrate better through ${f}_{MSA}\left(\cdot\right)$. However, the projections of the visual and textual modalities in ${f}_{MSA}\left(\cdot\right)$ may contain noisy or meaningless information. We therefore add a sigmoid gating mechanism to pass information adaptively and suppress useless details \citep{ref11}. It consists of multiplying the output of the multi-head self-attention layer by $\sigma\left(w_g\right)$ before adding it to the input representation through the residual connection, where $w_g$ is a learnable scalar initialized to 0.
\begin{equation}
\begin{aligned}
{f}_{Context}\left(\mathcal{A}|v_i,q_i\right)=y+\sigma\left(w_g\right)\\ \times{f}_{MSA}\left(y\right)\text{, }where\text{ }y=v_i\odot q_i
\end{aligned}
\end{equation}
\begin{equation}
P_{Context}\left(\mathcal{A}|v_i,q_i\right)=Softmax\left({f}_{Context}\left(\mathcal{A}|v_i,q_i\right)\right)
\end{equation}
This gating mechanism improves both the stability of the training and the final performance. Note that here $v_i$ is not calculated with $q_i$ for local attention, because we want to have access to all information that may affect context features ${f}_{Context}$.

We convert $sim(\mathrm {f}_{Bias},\mathrm {f}_{GT})$ to a binary vector $b_i$ as the label for computing $\mathcal{L}_C$ to learn context priors with language biases, as shown below:
\begin{equation}
\begin{aligned}
    \mathcal{L}_C=-\frac{1}{N}\sum_{i=1}^{N}\Bigl[b_i\log\left(\sigma\left({f}_{Context}\left(\mathcal{A}|v_i,q_i\right)\right)\right)\\{}+\left(1-b_i\right)\log\left(1-\sigma\left({f}_{Context}\left(\mathcal{A}|v_i,q_i\right)\right)\right)\Bigr]
\end{aligned}
\end{equation}

\begin{equation}
b_i=\left\{\begin{matrix}1,&sim(\mathrm {f}_{Bias},\mathrm {f}_{GT})\geq\gamma\\0,&sim(\mathrm {f}_{Bias},\mathrm {f}_{GT})<\gamma\\\end{matrix}\right.
\end{equation}
Based on empirical values, we fixed $\gamma$ to 0.1.






\subsection{Ensemble Training Module}

To combine the debiased prediction output with contextual information, inspired by \citep{ref31}, we trained an ensemble of ${f}_{VQA}\left(LRL|v_i,q_i\right)$ and ${f}_{Context}\left(\mathcal{A}|v_i,q_i\right)$ and computed a new prediction distribution $P_{Ensemble}\left(\mathcal{A}|v_i,q_i\right)$. 

\begin{equation}
\begin{aligned}
{f}_{E}\left(\mathcal{A}|v_i,q_i\right)=log\left({f}_{VQA}\left(LRL|v_i,q_i\right)\right)\\+log\left({f}_{Context}\left(\mathcal{A}|v_i,q_i\right)\right)
\end{aligned}
\end{equation}
\begin{equation}
P_{Ensemble}\left(\mathcal{A}|v_i,q_i\right)=Softmax\left({f}_{E}\left(\mathcal{A}|v_i,q_i\right)\right)
\end{equation}
A binary cross-entropy loss $\mathcal{L}_E$ is used to optimize the parameters of the ensemble:
\begin{equation}
\begin{aligned}
\mathcal{L}_E=-\frac{1}{N}\sum_{i=1}^{N}\Bigl[t_i\log\left(\sigma\left({f}_{E}\left(\mathcal{A}|v_i,q_i\right)\right)\right)\\{}+\left(1-t_i\right)\log\left(1-\sigma\left({f}_{E}\left(\mathcal{A}|v_i,q_i\right)\right)\right)\Bigr]
\end{aligned}
\end{equation}

Note that the extra modules are involved only in the training stage. They are all removed during the inference stage, where we use only ${f}_{VQA}\left(\cdot\right)$ to make predictions.

Finally, the total loss function can be defined as follows:
\begin{equation}
    \mathcal{L}=\mathcal{L}_{VQA}+\mathcal{L}_Q+\mathcal{L}_B+\mathcal{L}_C+\mathcal{L}_E
\end{equation}



\begin{table*}[t!]
  \centering
  \caption{Comparison results for the VQA-CP v2 test split and the VQA v2 validation split. The highest score is displayed in \textbf{bold}, while the second-highest score is \underline{underlined}. All debiasing methods (II--IV) use UpDn \citep{ref3} as the backbone. I--IV denote plain methods, methods based on strengthening visual information (annotation-based), methods based on data augmentation (data-balanced), and methods based on training strategies (ensemble-based), respectively.}
\resizebox{1\linewidth}{!}{  % Here 1/2
    \begin{tabular}{cccccccccccc}
    \toprule
    \multirow{2}[2]{*}{Case} & \multirow{2}[2]{*}{Model} & \multicolumn{4}{c}{VQA-CP v2 test} & \multicolumn{4}{c}{VQA v2 val} & \multicolumn{2}{c}{Comparison} \\
          &       & Overall & Yes/No & Number & Other & Overall & Yes/No & Number & Other & Gap$\downarrow$    & Mean \\
    \midrule
    \multirow{3}[2]{*}{\textbf{I}} & SAN \citep{ref21}   & 24.96  & 38.35  & 11.10  & 21.74  & 52.41  & 70.06  & 39.28  & 47.84  & 27.45  & 38.69  \\
          & BAN \citep{ref16}  & 37.03  & 41.55  & 12.43  & 41.40  & \textbf{63.90}  & \underline{81.42}  & \textbf{45.18}  & 55.54  & 26.87  & 50.47  \\
          & UpDn \citep{ref3}  & 39.74  & 42.27  & 11.93  & 46.05  & 63.48  & 81.18  & 42.14  & \textbf{55.66} & 23.74  & 51.61  \\
    \midrule
    \multirow{3}[2]{*}{\textbf{II}} & AttAlign \citep{ref12} & 39.37  & 43.02  & 11.89  & 45.00  & 63.24  & 80.99  & 42.55  & 55.22  & 23.87  & 51.31  \\
          & HINT \citep{ref12}  & 46.73  & 67.27  & 10.61  & 45.88  & 63.38  & 81.18  & 42.99  & \underline{55.56}  & 16.65  & 55.06  \\
          & SCR \citep{ref33}   & 49.45  & 72.36  & 10.93  & 48.02  & 62.20  & 78.80  & 41.60  & 54.50  & 12.75  & 55.83  \\
    \midrule
    \multirow{5}[2]{*}{\textbf{III}} & Unshuffling \citep{ref34} & 42.39  & 47.72  & 14.43  & 47.24  & 61.08  & 78.32  & 42.16  & 52.71  & 18.69  & 51.74  \\
          & RandImg \citep{ref35} & 55.37  & 83.89  & 41.60  & 44.20  & 57.24  & 76.53  & 33.87  & 48.57  & 1.87  & 56.31  \\
          & CSS \citep{ref32}   & 58.95  & 84.37  & 49.42  & 48.21  & 59.91  & 73.25  & 39.77  & 55.11  & 0.96  & 59.43  \\
          & CL-VQA \citep{ref36} & 59.18  & 86.99  & 49.89  & 47.16  & 57.29  & 67.27  & 38.40  & 54.71  & 1.89  & 58.24  \\
          & SSL-VQA \citep{ref13} & 57.59  & 86.53  & 29.87  & \textbf{50.03} & \underline{63.73} & -     & -     & -     & 6.14  & 60.66  \\
    \midrule
    \multirow{11}[4]{*}{\textbf{IV}} & AdvReg \citep{ref37} & 41.17  & 65.49  & 15.48  & 35.48  & 62.75  & 79.84  & 42.35  & 55.16  & 21.58  & 51.96  \\
          & RUBi \citep{ref11}  & 45.42  & 63.03  & 11.91  & 44.33  & 58.19  & 63.04  & 41.00  & 54.43  & 12.77  & 51.81  \\
          & LMH \citep{ref31}   & 52.01  & 72.58  & 31.12  & 46.97  & 56.35  & 65.06  & 37.63  & 54.69  & 4.34  & 54.18  \\
          & CF-VQA \citep{ref22} & 53.55  & \textbf{91.15} & 13.03  & 44.97  & 63.54  & \textbf{82.51} & \underline{43.96} & 54.30  & 9.99  & 58.55  \\
          & GGE-DQ \citep{ref38} & 57.32  & 87.04  & 27.75  & \underline{49.59}  & 59.11  & 73.27  & 39.99  & 54.39  & 1.79  & 58.22  \\
          & LPF \citep{ref23}   & 55.34  & 88.61  & 23.78  & 46.57  & 55.01  & 64.87  & 37.45  & 52.08  & 0.33  & 55.18  \\
          & Loss-Rescaling \citep{ref39} & 53.26  & 72.82  & 48.00  & 44.46  & 56.81  & 68.21  & 36.37  & 52.29  & 3.55  & 55.04  \\
          & LP-Focal \citep{ref24}  & 58.45  & 88.34  & 34.67  & 49.32  & 62.45  & -     & -     & -     & 4.00  & 60.45  \\
          & CCB-VQA \citep{ref40} & 59.12  & 89.12  & \underline{51.04}  & 45.62  & 59.17  & 77.28  & 33.71  & 52.14  & \textbf{0.05} & 59.15  \\
          & SBS \citep{ref25}   & \underline{59.57}  & 87.44  & \textbf{52.96} & 46.79  & 61.97  & 78.80  & 42.17  & 54.41  & 2.40  & \underline{60.77}  \\
\cmidrule{2-12}          & \textbf{LRLGC (Ours)} & \textbf{60.91} & \underline{89.95}  & 45.13  & \textbf{50.03} & 60.81  & 77.65  & 39.25  & 53.71  & \underline{0.10}  & \textbf{60.86} \\
    \bottomrule
    \end{tabular}%
}
  \label{table:1}%
\end{table*}%




\section{Experiments}

\subsection{Datasets}
We evaluate our method using the standard evaluation metric \citep{ref1} on the test set of VQA-CP v2 \citep{ref10}, the most widely used out-of-distribution benchmark, and on the validation set of the standard in-distribution benchmark VQA v2 \citep{ref7}. The VQA-CP v2 training and test sets contain around 121k and 98k images and 438k and 220k questions, respectively.

\subsection{Baselines}
To demonstrate the effectiveness of our LRLGC, we used different backbones, including UpDn \citep{ref3}, SAN \citep{ref21}, and BAN \citep{ref16}. None of these three models is designed to mitigate language priors. In addition, several different methods are compared to our approach: (1) annotation-based methods: AttAlign \citep{ref12}, HINT \citep{ref12}, SCR \citep{ref33}. (2) data-balanced methods: Unshuffling \citep{ref34}, RandImg \citep{ref35}, CSS \citep{ref32}, CL-VQA \citep{ref36}, SSL-VQA \citep{ref13}. (3) ensemble-based methods: AdvReg \citep{ref37}, RUBi \citep{ref11}, LMH \citep{ref31}, CF-VQA \citep{ref22}, GGE-DQ \citep{ref38}, LPF \citep{ref23}, Loss-Rescaling \citep{ref39}, LP-Focal \citep{ref24}, CCB-VQA \citep{ref40}, SBS \citep{ref25}.




\subsection{Implementation details}
We use a pre-trained Faster R-CNN to extract object features. Specifically, we extract 36 object features with dimensions of 2048 for each image. All questions are padded to the same length of 14. Each word is embedded with a 300-dimensional GloVe vector. The word embeddings are then fed into a single-layer GRU to produce a 1280-dimensional sentence-level representation. Inspired by \cite{ref13}, we set a Batch Normalization layer in front of each classifier and use a binary cross-entropy loss to train all branches during training. The Adam optimizer is used with an initial learning rate of 0.001. After ten epochs, we halve the learning rate every five epochs. The batch size is set to 512, and we train our LRLGC for 30 epochs. The settings $\alpha=0.5$ and $\beta=4$ are used in all tests in this work. In later sections, we will also study the hyperparameter setting. Besides, we set the Global Context Module to use one layer and 64 heads.




\subsection{Performance Comparison}
On the VQA-CP v2 test set and the VQA v2 validation set, our LRLGC and state-of-the-art approaches are compared, and the experimental outcomes are presented in Table \ref{table:1}. The following conclusions can be drawn from these results.

On the VQA-CP v2 test set, (1) data-balanced and ensemble-based methods perform similarly (59.18$\%$ vs. 59.57$\%$), significantly outperforming HINT \citep{ref12} and SCR \citep{ref33}, which require additional manual annotation. Although data-balanced methods perform well, they change the training prior, making it difficult to tell whether the VQA model is still driven by memorized priors. (2) LRLGC outperforms all compared methods, achieving an overall accuracy of 60.91$\%$. Specifically, LRLGC outperforms LMH \citep{ref31}, CL-VQA \citep{ref36}, and SBS \citep{ref25} by approximately 9$\%$, 1.7$\%$, and 1.3$\%$, respectively, and improves on UpDn \citep{ref3} by 21.17$\%$. Notably, our method does not use additional data. (3) For the ``Other'' questions, LRLGC is comparable to SSL-VQA \citep{ref13} and outperforms the other methods. For the ``Yes/No'' questions, LRLGC is second only to CF-VQA \citep{ref22}. These results further validate the effectiveness of our LRLGC training strategy.



The VQA v2 results validate the debiasing strategy on the in-distribution dataset. On VQA v2, most ensemble-based methods do worse than UpDn \citep{ref3}. Although LRLGC performs less well than UpDn \citep{ref3}, it still outperforms the majority of ensemble-based models, including LMH \citep{ref31}, RUBi \citep{ref11}, and LPF \citep{ref23}. This suggests that LRLGC has some potential to address the overcorrection issue: it reduces most of the statistical priors of VQA-CP v2 while retaining most of the global information of VQA v2, thereby avoiding significant performance degradation.

From the combined results of the two datasets, (1) our method effectively reduces the performance gap between the two datasets to 0.1$\%$. (2) Among all the compared methods, our LRLGC has the highest average score across the two datasets (60.86$\%$). All these results further show that our LRLGC not only decreases training bias but also enhances model robustness.



\subsection{Ablation Studies}


\textbf{Effect of different backbones.} To demonstrate that our LRLGC works effectively on a variety of VQA models, we built LRLGC frameworks on SAN  \citep{ref21}, BAN  \citep{ref16}, and UpDn  \citep{ref3} and ran experiments on the VQA-CP v2. From the results in Table \ref{table:2}, LRLGC significantly improves the model's accuracy regardless of the backbone, indicating that LRLGC is model-agnostic.



\textbf{Performance on different scales of the dataset.} We ran experiments on VQA-CP v2 with varying training sizes to further demonstrate our method's superiority. As shown in Table~\ref{table:3}, with the full training split, our LRLGC improves the three baseline models by 18.8$\%$ on average. Even with less training data (20$\%$ and 40$\%$), LRLGC can overcome the language priors and exploit the global context to improve the overall performance by 12.7$\%$ and 17.4$\%$ on average, respectively.


\textbf{Each LRLGC module's effect on the model performance.} We conducted an ablation study on the VQA-CP v2 to demonstrate the effectiveness of each component in our LRLGC. The results are shown in Table \ref{table:4}. The following findings can be drawn from these results: (1) LRL can help the model overcome the language priors (rows 1-3), reducing the proportion of biased samples in the total loss. (2) Including random images in the bias model is better than question-only (rows 2-3), proving the importance of the visual modality in capturing language priors.  (3) LRL+FBS has essentially no effect on performance (rows 2-5), demonstrating that unbiased samples need not be overly penalized. (4) Global context can help the model perform better (rows 4-7). In particular, FBS+Context works better (rows 6-9), indicating that biased samples need more context. Overall, these results are evidence of the effectiveness of each component of our LRLGC in the improvement of model performance.

\textbf{Effect of hyperparameters $\alpha$ and $\beta$.} As Table~\ref{table:5} shows, we tested different combinations of $\alpha$ and $\beta$ on the VQA-CP v2 split. $\alpha$ controls the threshold for judging whether a sample is biased, and $\beta$ indicates the strength with which biased samples are penalized. An appropriate ratio between $\alpha$ and $\beta$ leads to better performance of LRLGC. In our experiments, the highest performance is achieved by the combination of $\alpha = 0.5$ and $\beta = 4$.

\begin{table}[htbp]
  \centering
  \caption{The effect of different backbones on model performance on the VQA-CP v2 test set. $\dagger$ denotes the model we have re-implemented.}
    \resizebox{1\linewidth}{!}{  % Here 1/2
    \begin{tabular}{cccccc}
    \toprule
    Model & Yes/No & Number & Other & Overall & Gap$\uparrow$ \\
    \midrule
    SAN$\dagger$ \citep{ref21}  & 40.86  & 13.43  & 46.98  & 40.08  & \multirow{2}[2]{*}{\textbf{+18.48}} \\
    SAN+LRLGC & 88.03  & 42.05  & 47.65  & 58.56  &  \\
    \midrule
    BAN$\dagger$ \citep{ref16}  & 43.53  & 13.60  & 46.35  & 40.53  & \multirow{2}[2]{*}{\textbf{+18.66}} \\
    BAN+LRLGC & 89.85  & 42.74  & 47.64  & 59.19  &  \\
    \midrule
    UpDn$\dagger$ \citep{ref3} & 43.32  & 13.41  & 48.32  & 41.54  & \multirow{2}[2]{*}{\textbf{+19.37}} \\
    UpDn+LRLGC & 89.95  & 45.13  & 50.03  & 60.91  &  \\
    \bottomrule
    \end{tabular}%
    }
  \label{table:2}%
\end{table}%

\begin{table}[htbp]
  \centering
  \caption{Results of the LRLGC on the VQA-CP v2 test set with different proportions of training split. $\dagger$ denotes the model we have re-implemented.}
    \resizebox{1\linewidth}{!}{  % Here 1/2
    \begin{tabular}{cccccc}
    \toprule
    \multirow{2}[4]{*}{Model} & \multicolumn{5}{c}{Proportion of Training Set} \\
\cmidrule{2-6}          & 20\%  & 40\%  & 60\%  & 80\%  & 100\% \\
    \midrule
    SAN$\dagger$ \citep{ref21}  & 33.15  & 36.62  & 39.11  & 39.71  & 40.08  \\
    SAN+LRLGC & 43.80  & 53.19  & 56.67  & 57.13  & \textbf{58.56} \\
    \midrule
    BAN$\dagger$ \citep{ref16}  & 33.05  & 37.28  & 38.52  & 40.00  & 40.53  \\
    BAN+LRLGC & 42.66  & 54.16  & 56.91  & 58.65  & \textbf{59.19} \\
    \midrule
    UpDn$\dagger$ \citep{ref3} & 36.37  & 38.72  & 39.91  & 40.53  & 41.54  \\
    UpDn+LRLGC & 54.10  & 57.57  & 59.02  & 59.96  & \textbf{60.91} \\
    \bottomrule
    \end{tabular}%
    }
  \label{table:3}%
\end{table}%



\begin{table}[htbp]
  \centering
  \caption{Each LRLGC module's effect on the model performance, with UpDn$\dagger$ as the backbone. Here, q denotes question-only, and qv denotes question and random image.}
   \resizebox{0.88\linewidth}{!}{  % Here 1/2
    \begin{tabular}{ccccc}
    \toprule
          & LRL   & FBS   & GC & VQA-CP v2 test ($\%$)  \\
    \midrule
    1     &       &       &       & 41.54  \\
    2     &  q  &       &       & 57.83  \\
    3     &  qv &       &       & 58.90  \\
    4     &  q  & \checkmark     &       & 58.17  \\
    5     &  qv & \checkmark     &       & 58.77  \\
    6     &  q  &       & \checkmark     & 59.43  \\
    7     &  qv &       & \checkmark     & 59.81  \\
    8     &  q  & \checkmark     & \checkmark     & 59.84  \\
    9     &  qv & \checkmark     & \checkmark     & 60.91  \\
    \bottomrule
    \end{tabular}%
    }
  \label{table:4}%
\end{table}%



\begin{table}[t!]
  \centering
  \caption{Results for various $\alpha$ and $\beta$ combinations.}
  \resizebox{.88\linewidth}{!}{
    \begin{tabular}{ccc}
    \toprule
    Model & $\alpha$ vs. $\beta$ & VQA-CP v2 test ($\%$) \\
    \midrule
    \multirow{6}[2]{*}{LRLGC} & 0.1 : 4 & 59.58  \\
          & 0.3 : 4 & 60.08  \\
          & 0.5 : 4 & \textbf{60.91} \\
          & 0.7 : 4 & 60.28  \\
          & 0.5 : 3 & 60.65  \\
          & 0.5 : 5 & 60.08  \\
    \bottomrule
    \end{tabular}%
    }
  \label{table:5}%
\end{table}%




\begin{table}[t!]
  \centering
  \caption{Comparison of LRLGC and other re-weighting methods}
    \resizebox{1\linewidth}{!}{
    \begin{tabular}{ccccccc}
    \toprule
    Model & Adaptive & q     & v     & FBS   & GC & VQA-CP v2 test ($\%$) \\
    \midrule
    Loss-Rescaling \citep{ref39} &       &   \checkmark    &       &       &       & 53.26  \\
    LPF \citep{ref23}  &  \checkmark     &  \checkmark     &       &       &       & 55.34  \\
    LP-Focal \citep{ref24} &  \checkmark     &  \checkmark     &       &       &       & 58.45  \\
    LRLGC (Ours)   &  \checkmark     &   \checkmark    &  \checkmark     &  \checkmark     &   \checkmark    & 60.91  \\
    \bottomrule
    \end{tabular}%
    }
  \label{table:6}%
\end{table}%





\subsection{LRLGC vs. Other Re-Weighting Methods}
In this part, we further compare our LRLGC with other re-weighting methods, and the results are shown in Table \ref{table:6}. Loss-Rescaling \citep{ref39} takes full advantage of the spurious statistical relationship between question types and answers, from which bias values are calculated for each sample. However, the bias values are pre-calculated based on the dataset and are not adaptively adjusted during training. Both LPF \citep{ref23} and LP-Focal \citep{ref24} use question-only branches to dynamically capture language biases during training and are able to adaptively adjust loss values for each sample. However, LPF and LP-Focal lack the filtering of biased samples, which can easily lead to over-correction of unbiased samples and degrade the performance in in-distribution datasets. In addition, the introduction of the visual modality captures the bias in the sample more adequately than language modality alone (rows 2-3 of Table \ref{table:4}). Moreover, our LRLGC incorporates global context and, through ensemble-based training, can minimize the negative effects of biased samples on the model while retaining their useful information.

\begin{figure}[t!]  
    \centering
	\includegraphics[width=\linewidth,scale=1.00]{myfig3_2}
	\caption{Qualitative comparison results of UpDn  \citep{ref3} and LRLGC on VQA-CP v2 test set.}
	\label{myfig3}
 
\end{figure}


\subsection{Qualitative Analysis}
We provide qualitative results from the VQA-CP v2 in Fig.~\ref{myfig3} to further demonstrate our LRLGC's validity. The prediction results are based on UpDn and LRLGC. These examples cover the ``yes/no,'' ``num,'' and ``other'' question types. We visualize the model's top 3 important regions, output attention weights, and show the top 4 answers. For the question ``What color is the mouse pad?'', LRLGC accurately located the key visual object and answered correctly. For the ``yes/no'' questions, ``yes'' has relatively more priors than other answers. For example, for the question ``Does this horse have a saddle on its back?'', the baseline's predicted answer is ``yes'', but it does not locate the critical visual object (i.e., it does not ``look'' at the image), indicating interference from language priors. Even for counting questions like ``How many planes are there?'' that require visual understanding, LRLGC gives the right answer. By using our LRLGC, the VQA model can avoid overfitting to data biases and show better results on out-of-distribution datasets.



\section{Conclusion}
This paper proposes a general training strategy called LRLGC to address the language priors problem in VQA. LRLGC applies dynamic weighting to each biased sample and integrates global context to guide the model in answering questions. Experimental results show that our method achieves promising results on both VQA-CP v2 and VQA v2. In the future, we plan to improve our method and use it for other multimodal deep-learning tasks with single-peak bias.



\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option

This work is supported by the National Natural Science Foundation of China (Nos. 62276073, 61966004), the Guangxi Natural Science Foundation (No. 2019GXNSFDA245018), the Innovation Project of Guangxi Graduate Education (YCSW2023141), the Guangxi ``Bagui Scholar'' Teams for Innovation and Research Project, and the Guangxi Collaborative Innovation Center of Multi-source Information Integration and Intelligent Processing. (Corresponding author: Zhixin Li.)

\end{acknowledgements}

% References
\bibliography{cao_608}
\end{document}
