%% The first command in your LaTeX source must be the \documentclass command.
%%
%% Options:
%% twocolumn : Two column layout. Do not use twocolumn for papers submitted to CEUR-WS!
%% hf: enable header and footer.
\documentclass[
% twocolumn,
% hf,
]{ceurart}

%%
%% One can fix some overfulls
\sloppy

%%
%% Code listings support (via the listings package; unlike minted,
%% it does not need Pygments or shell escape)
\usepackage{listings}
%% auto break lines
\lstset{breaklines=true}
\usepackage{cleveref}

% Global appearance of every listing: small typewriter text on a light-grey
% background, enclosed in a single-line frame, with over-long lines wrapped.
\lstset{
    frame=single,
    backgroundcolor=\color{gray!10},
    breaklines=true,
    basicstyle=\ttfamily\small
}

\definecolor{jsonstring}{RGB}{200, 80, 60} % The reddish-orange color
\definecolor{linenumbers}{RGB}{100, 100, 100} % Grey for line numbers

% Listings definition for JSON snippets, selected with language=json.
% It bundles presentation settings (line numbers, frame, line wrapping,
% white background) with string colouring: text between double quotes is
% typeset in the jsonstring colour, while the literate table forces the
% JSON punctuation characters to be printed in black.
% NOTE(review): the leading * of the literate table presumably keeps the
% replacements from being applied inside strings -- confirm in the listings manual.
% NOTE(review): the lstlisting environments later in this file pass their own
% literate= option, which presumably replaces this table rather than
% extending it -- confirm in the listings manual.
\lstdefinelanguage{json}{
    basicstyle=\ttfamily\small,
    numbers=left,               % Line numbers on the left
    numberstyle=\tiny\color{linenumbers}, % Style of line numbers
    stepnumber=1,               % Number every line
    numbersep=8pt,              % Distance between number and code
    showstringspaces=false,     % Don't show the little cup in spaces
    breaklines=true,            % Wrap long lines automatically
    frame=single,               % The box around the code
    backgroundcolor=\color{white},
    string=[s]{"}{"},           % Define what a string is (between quotes)
    stringstyle=\color{jsonstring}, % Apply the reddish color to strings
    literate=
     *{:}{{{\color{black}:}}}{1} % Ensure colons are black
      {,}{{{\color{black},}}}{1} % Ensure commas are black
      {\{}{{{\color{black}\{}}}{1} % Ensure opening braces are black
      {\}}{{{\color{black}\}}}}{1} % Ensure closing braces are black
      {[}{{{\color{black}[}}}{1} % Ensure opening square brackets are black
      {]}{{{\color{black}]}}}{1}, % Ensure closing square brackets are black
}

% \quickcharcount{<basename>}
% Runs the external texcount tool on <basename>.tex and output.bbl through
% \write18, redirects its output to <basename>-chars.sum, then \input's that
% file and appends the text " characters (not including spaces)".
% NOTE(review): \write18 only runs a command when shell escape is enabled;
% otherwise the .sum file is presumably missing and \input fails -- confirm
% for the build setup in use.
% NOTE(review): the bibliography file name output.bbl is hard-coded; verify
% that it matches the actual job name.
\newcommand{\quickcharcount}[1]{%
  \immediate\write18{texcount -1 -sum -merge -char -q #1.tex output.bbl > #1-chars.sum }%
  \input{#1-chars.sum} characters (not including spaces)%
}

%%
%% end of the preamble, start of the body of the document source.
\begin{document}

%%
%% Rights management information.
%% CC-BY is default license.
\copyrightyear{2026}
\copyrightclause{Copyright for this paper by its authors.
  Use permitted under Creative Commons License Attribution 4.0
  International (CC BY 4.0).}

%%
%% This command is for the conference information
\conference{EVALITA 2026: 9th Evaluation Campaign of Natural Language
Processing and Speech Tools for Italian, Feb 26 – 27, Bari, IT}

%%
%% The "title" command
\title{EVWSD-ITA at EVALITA 2026: Overview of the Enhanced Visual Word Sense Disambiguation for Italian Task}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
\author[1,2]{Elio Musacchio}[%
orcid=0009-0006-9670-9998,
email=elio.musacchio@uniba.it,
]
\cormark[1]
\address[1]{Dept. of Computer Science, University of Bari Aldo Moro,
  Via E. Orabona, 4 - 70125 Bari (ITALY)}


\author[1]{Lucia Siciliani}[%
orcid=0000-0002-1438-280X,
email=lucia.siciliani@uniba.it,
]
\address[2]{Dept. of Computer Science, University of Pisa,
    Largo Bruno Pontecorvo, 3, Pisa (ITALY)}

\author[1]{Pierpaolo Basile}[%
orcid=0000-0002-0545-1105,
email=pierpaolo.basile@uniba.it,
]

\author[1]{Giovanni Semeraro}[%
orcid=0000-0001-6883-1853,
email=giovanni.semeraro@uniba.it,
]

%% Footnotes
\cortext[1]{Corresponding author.}

%%
%% The abstract is a short summary of the work to be presented in the
%% article.

\begin{abstract}
    EVWSD-ITA (Enhanced Visual Word Sense Disambiguation for Italian) is a shared task proposed at the EVALITA 2026 campaign. While traditional Visual Word Sense Disambiguation (VWSD) focuses on broad semantic distinctions, EVWSD-ITA introduces a more rigorous challenge by requiring systems to perform fine-grained disambiguation. The task involves selecting the most appropriate image for a target word within a given context, specifically designed to include ``hard negatives''---co-hyponyms that share a common hypernym but represent distinct concepts. The dataset was carefully constructed and includes a manually validated test set. In this report, we showcase the dataset construction procedure and the results of the task.
\end{abstract}

%%
%% Keywords. The author(s) should pick words that accurately describe
%% the work being presented. Separate the keywords with commas.
\begin{keywords}
  Word Sense Disambiguation \sep
  Visual Word Sense Disambiguation \sep
  Dataset \sep
  Evaluation
\end{keywords}

%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.

\maketitle

\section{Introduction}

Word Sense Disambiguation (WSD) is a historical task in the Natural Language Processing field of research \cite{navigli2009word}.
In this task, the objective is to select the correct sense of a target word in a sentence, out of all its possible meanings.
Given its longevity, several works have explored algorithms and neural networks capable of solving the task \cite{bevilacqua2021recent}.
Nevertheless, this task still cannot be solved entirely; thus, capturing all the nuances of natural language remains an intriguing challenge.
The introduction of Large Language Models (LLMs) has further ignited interest in solving this task, with several works proposing the use of LLMs to perform WSD \cite{sumanathilaka2024assessing}.
In light of the success and interest of WSD, Visual Word Sense Disambiguation (VWSD) \cite{raganato-etal-2023-semeval} has been proposed.
In this task, the objective is to select an image, out of ten possible candidates, which correctly represents the sense of a target word in a given sentence.
The sentence consists of a word of interest (that is, the target to disambiguate) and additional context words to support disambiguation (e.g., ``music keyboard'', where ``keyboard'' is the target to disambiguate and ``music'' is the context word).
The gold standard is represented by examples annotated manually, where human experts have selected images that are meaningful with respect to the text, exploiting BabelDomains from BabelNet.
Furthermore, the original VWSD dataset is split into three languages: English, Italian and Farsi.
However, the task has a very clear limitation: it does not consider fine-grained semantics.
Considering the previous example, for the query ``music keyboard'', any image not related to music can be discarded, which favors systems with a good understanding of high-level semantic relationships across language and vision.
The task does not include recognition of specific senses (e.g., disambiguating the image of a piano from the image of a harpsichord).
In light of this, we propose a new task that combines both high-level and fine-grained semantics. 
The goal is not only to identify the broad sense of the target word, but also to accurately recognise its specific sense.
We name this task Enhanced Visual Word Sense Disambiguation for Italian (EVWSD-ITA) and propose it at EVALITA 2026 \citep{evalita2026overview}.

The proposed task is significantly different from others in the state-of-the-art for the following reasons:
\begin{itemize}
    \item We incorporate both high-level and fine-grained semantics by combining images representing different senses of the target word and co-Hyponyms (synsets that share the same Hypernym)
    \item We focus on the Italian language and manually annotate the test set to guarantee the quality and rigorousness of the benchmark
\end{itemize}

\section{Related Works}

The original WSD task has been extensively studied in previous research.
In this context, one of the most used datasets is XL-WSD \citep{pasini2021xl}, a cross-lingual evaluation benchmark for the WSD task featuring sense-annotated development and test sets in 18 languages from six different linguistic families.
Originally, WSD models leveraged the BERT model.
For example, \citet{huang2019glossbert} proposed GlossBERT, a family of models fine-tuned on context-gloss pairs that casts WSD as a sentence-pair classification task.
More recent approaches started to focus on leveraging LLMs for WSD. 
For example, \citet{yae2025leveraging} proposed three techniques to leverage LLMs in WSD. 
They cast the task in three formats: 1) multiple-choice questions; 2) binary questions; 3) multiple-choice questions for unseen words.
Furthermore, \citet{meconi2025large} propose an extensive study on the capabilities of LLMs in the WSD task, with a focus on the lexical understanding capabilities of the best performing models.

Following the success of WSD, the VWSD task was proposed for SemEval 2023 and, even after the completion of the challenge, several solutions and extensions of the dataset were proposed.
\citet{kritharoula-etal-2023-large} tested several approaches using both generative LLMs for phrase enhancement and encoder-based vision-language models for retrieval. 
Specifically, the authors explore several possible solutions to address the VWSD task: they consider image captioning and image retrieval, in order to cast the task as a unimodal retrieval task, as well as a learning-to-rank system and question-answering with chain-of-thought prompting.
\citet{kwon-etal-2023-vision} proposed an unsupervised VWSD approach exploiting definition generation with GPT-3 and Bayesian inference. 
Specifically, the authors perform context-aware definition generation with GPT-3 to overcome the out-of-vocabulary problem (that is, not all words have their definitions available in a lexical knowledge base).
Then they perform Bayesian style inference for Image-Text Matching. 
\citet{zhang2023srcb} proposed SRCB, which employs a three-stage pipeline: a context retrieval module, which predicts the correct definition using a bi-encoder architecture; an image retrieval module, which retrieves the relevant images from an image dataset; and a matching module, which decides to use either text or images and ranks them.
\citet{yang2024mta} proposed MTA, which employs self-distillation to align fine-grained textual features to fixed vision features and align non-English textual features to English textual momentum features.
Furthermore, they introduced a trilingual image-text dataset for VWSD, encompassing a fine-grained network of 85,754 word-sense associations and 120,131 images.
\citet{laba2024ukrainian} extended the original dataset to include the Ukrainian language.
The authors extended the dataset through a semi-supervised approach leveraging Wikipedia articles and expert annotators. 
Furthermore, they test the performance of eight multilingual and multimodal LLMs using this dataset.
\citet{setitra2025leveraging} generated visually representative images from textual descriptions, as well as rich textual descriptions from images. 
Then, an ensemble of deep models is used to perform classification.
\citet{musacchio2025assessing} proposed to use strictly hard negative samples exploiting the co-Hyponym relation from semantic networks.
The authors then propose a new benchmark on a dataset created using this strategy and extensively evaluate both Vision-Language Encoders and Large Language Models supporting multimodal inputs.

Despite the research interest and popularity of the task, no existing work proposes a benchmark that combines high-level and fine-grained semantics for this task.

\section{Task Description}

We propose a single task, that is Visual Word Sense Disambiguation.
Given a query and ten possible candidate images for it, we want to learn a system that is capable of selecting the relevant image for the query.
In the task, only one candidate image is relevant for the query.
Hence, the system must be able to either: 1) predict the one relevant image; 2) provide a score for each image based on its relevance to the query.

In EVWSD-ITA, we combine two different types of images: 1) images representing a different sense of the target word, for example the word ``keyboard'' may be paired to both the image of a computer keyboard and the image of a piano; 2) images representing a different specific sense for the target word while sharing the same broad sense, for example the word ``keyboard'' may be paired to both the image of a piano and the image of a harpsichord.
We illustrate this in \Cref{fig:example}.
We use co-Hyponyms extracted from a semantic network to find \textit{hard negatives} \cite{robinson2020contrastive, musacchio2025assessing}. 
That is, the images are related to the general sense of the target word but represent a different specific sense.
We propose to improve the task by mixing the two types of images: 1) images for other senses of the target word; 2) images that share the same broad sense as the target word.
This will test the ability of the proposed solutions to understand both fine-grained and high-level semantics.

In the original VWSD challenge, two metrics were used to assess the ability of the proposed solutions: HIT@1 and MRR (Mean Reciprocal Rank). 
It may be impossible to compute MRR in some cases, since it expects a ranking (e.g., systems trained for prediction only rather than ranking cannot be benchmarked using MRR).
Hence, we will consider two different leaderboards: one focusing on MRR and the other focusing on HIT@1.
Thanks to this, we can distinguish the goodness of the proposed solution on two different aspects (precise classification and overall ranking quality).
We will describe the metrics in detail in \Cref{sec:metrics}.

\section{Dataset} 

In order to propose the EVWSD-ITA task, we need a dataset where images satisfy the properties we are interested in (images representing synsets that are co-Hyponyms of the target and synsets that share the same lemma as the target).

\begin{figure*}[ht]
  \centering
   \includegraphics[width=\textwidth]{images/example.png}
   \caption{Example of the proposed approach. The synset in green is the correct one, while the ones in red are negatives.}
   \label{fig:example}
\end{figure*}

\noindent This section describes the creation of the dataset and provides quantitative statistics.

\subsection{Dataset Construction}

\noindent Our data collection relies on BabelNet \cite{navigli2010babelnet}. 
Since it provides images for each synset, it is a valuable resource to obtain data for our task.
For each synset in BabelNet with an English gloss, we extracted the synset identifier and the corresponding Italian gloss. 
Since a synset can have multiple glosses, we select only one gloss according to its source.
In particular, for selecting only one gloss, we prioritised each source according to this rank: WordNet, Wiktionary, Wikipedia, Wikidata, and other sources.
Then, the image URL is extracted from BabelNet for each synset.
Since multiple images can be assigned to each synset, we used the image tagged with the attribute ``best image''.
Each synset $s_i \in S$, where $S$ is the set containing all synsets in BabelNet, has multiple lemmas $L(s_i) = \{l_1, \dots, l_{m_i}\}$, where $m_i$ is the number of lemmas for synset $s_i$, and a gloss $g_{i}$ associated to it.
We define a relationship $\mathit{Hyp} : S \rightarrow S$ which connects each synset to its Hypernym.
Furthermore, we define two sets associated to $s_i$, the co-Hyponyms set $\mathit{CH}(s_i) = \{s_j \mid \mathit{Hyp}(s_j) = \mathit{Hyp}(s_i) \land s_j \neq s_i \land s_j \in S\}$ and the set of synsets with the same lemma $\mathit{SL}(s_i, l_i) = \{s_j \mid \exists l_j \in L(s_j) : l_j = l_{i} \land s_j \neq s_i \land s_j \in S\}$.

For the test set, we created the data using the following methodology: we iterated over all lemmas in the collection and iterated over all possible synsets for each lemma.
We discarded instances where there were no synsets in the co-Hyponyms set or in the synsets with the same lemma.
Then, we manually checked each instance and the candidate images of each synset.
We removed synsets from the candidates when: 1) the specific sense of the distinct synset was too similar to that of the target synset; 2) visual elements of the image associated to the synset were not enough to distinguish it with respect to the target synset.
Specifically, for the first case, we found some synsets in BabelNet that shared the same gloss or represented the same sense as the target synset but with a different Hypernym.
For the second case, we found cases where distinct synsets were represented using the same image or there were not enough visual elements to distinguish it with respect to the image of the target synset (e.g., two nearly identical electrical appliances without any distinguishing visual characteristic).
We also removed instances where the semantic category of the target synset represents an abstract concept (e.g., ``year'' may be ambiguous to disambiguate through visual features).
If after removing the synsets there were fewer than ten candidate synsets, we marked the instance to be augmented.
After completing the test set manual validation, we augmented all instances with fewer than ten candidate synsets with random synsets selected from the other instances.
Furthermore, we also manually created the query.
To create the query, we combined: the lemma of the correct synset (the one we iterated over), one of the lemmas of the hypernym of the correct synset, and a word from the gloss of the correct synset.
Using this methodology, we are sure that there is enough information to properly disambiguate the correct image.
Furthermore, we randomize the order of each part of the query, in order to prevent participants from reverse engineering the construction of the query and obtaining information they should not have access to (without randomizing the order, the first word of the query would always be the lemma of the correct synset).
The word from the gloss should help disambiguate fine-grained semantics, while the lemma from the hypernym should help identify the high-level sense of the input.
To increase the degree of diversity of the test split with respect to the train split, we also performed additional steps.
We included synsets where glosses in Italian are not present. 
When using that synset as target, we manually translated the gloss from the original English one to Italian and selected the word to be used in the query.
Furthermore, for all target synsets, we manually replaced the image from BabelNet with a different image sourced from sites with a permissive license.

For the train set, we do not provide a query directly. 
While creating the query using the lemmas of the target synset and its hypernym is easy, selecting a meaningful word from the gloss associated to the synset is not trivial.
Participants are asked to develop a strategy to extract the word from the gloss to help with disambiguation.
This also allows participants to have more freedom to use the dataset as they see fit, as long as the final model is capable of performing VWSD given only a query and a candidate pool of images.

\pagebreak
\subsection{Dataset Statistics}

\begin{lstlisting}[language=json, frame=single,
    float=h,
    caption={Example of Train Instance},
    label={lst:train_example},
    rulecolor=\color{black},
    literate=
      {è}{{\`e}}1
      {à}{{\`a}}1
      {ì}{{\`i}}1
      {ò}{{\`o}}1
      {ù}{{\`u}}1
      {É}{{\'E}}1
      {:}{{{\color{black}:}}}{1}  % Your existing color fixes
      {,}{{{\color{black},}}}{1}
      {\{}{{{\color{black}\{}}}{1}
      {\}}{{{\color{black}\}}}}{1}
      {[}{{{\color{black}[}}}{1}
      {]}{{{\color{black}]}}}{1}]
{
    "id": "bn:00022412n",
    "hyp_id": "bn:00017670n",
    "gloss": "Atto del cuocere",
    "lemma": "cucina",
    "hyp_lemma": [
      "cambiamento di stato"
    ],
    "bns": [
      "bn:00018237n", ..., "bn:00049248n"
    ],
    "is_co_hyp": [
      true, ..., false
    ],
    "images": [
      "F14/bn:00018237n", ..., "F0/bn:00049248n"
    ],
    "all_lemmas": [
      ["masticazione", "masticare"],
        ...,
      ["cucine", "cucina", "cucina attrezzata", "cucina aperta", "cucinotto"]
    ],
    "all_glosses": [
      "La masticazione è il processo mediante il quale il cibo è frantumato e preparato dai denti.", 
      ..., 
      "Una stanza attrezzata per la preparazione dei cibi."
    ],
    "img": "F22/bn:00022412n"
}
\end{lstlisting}

\begin{lstlisting}[language=json, frame=single,
    float=h,
    caption={Example of Test Instance},
    label={lst:test_example},
    rulecolor=\color{black},
    literate=
      {è}{{\`e}}1
      {à}{{\`a}}1
      {ì}{{\`i}}1
      {ò}{{\`o}}1
      {ù}{{\`u}}1
      {É}{{\'E}}1
      {:}{{{\color{black}:}}}{1}  % Your existing color fixes
      {,}{{{\color{black},}}}{1}
      {\{}{{{\color{black}\{}}}{1}
      {\}}{{{\color{black}\}}}}{1}
      {[}{{{\color{black}[}}}{1}
      {]}{{{\color{black}]}}}{1}]
{
    "query": "patologia calcolo organo",
    "candidates": [
      "1133.jpg",
      "850.jpg",
      "743.jpg",
      "1266.jpg",
      "1367.jpg",
      "948.jpg",
      "549.jpg",
      "695.jpg",
      "1504.jpg",
      "1148.jpg"
    ]
}
\end{lstlisting}

For the train split, we provide a total of 10,000 instances. 
These instances are not formatted for the task; instead, we provide the data of interest from BabelNet directly.
Specifically, we provide for each instance the following fields:
\begin{itemize}
    \item \texttt{id}: Synset id
    \item \texttt{hyp\_id}: Synset id for the hypernym
    \item \texttt{gloss}: Gloss in Italian
    \item \texttt{lemma}: Lemma in Italian
    \item \texttt{hyp\_lemma}: All possible lemmas for the hypernym
    \item \texttt{bns}: List containing both co-hyponyms and synsets that have \texttt{lemma} as a possible lemma
    \item \texttt{is\_co\_hyp}: \texttt{true} if the corresponding synset in \texttt{bns} is a co-hyponym, \texttt{false} otherwise
    \item \texttt{images}: Path to the image for each synset in \texttt{bns}
    \item \texttt{all\_lemmas}: Each list contains all the Italian lemmas associated to each synset in \texttt{bns}
    \item \texttt{all\_glosses}: All Italian glosses associated to each synset in \texttt{bns}
    \item \texttt{img}: Path to the image of the synset identified by \texttt{id}
\end{itemize}
Each instance is a unique synset in BabelNet. 
Participants are asked to format the dataset leveraging the provided data.
We decided to implement this approach to let participants have complete freedom over the data and how they wanted to format the train set.
For example, using this dataset to train a CLIP model (vision-language encoder) would require a different formatting with respect to an LLM supporting multimodal inputs.
A complete example of train instance is reported in \Cref{lst:train_example}.

\noindent For the test split, we provide a total of 222 instances, following the methodology previously described. 
Furthermore, image paths have been anonymized (since in the train split the image path was matched to the synset id of the instance) in order to prevent participants from extracting additional information from BabelNet synsets at test time.
A complete example of test instance is reported in \Cref{lst:test_example}.

\noindent For both the train and test splits, all images are provided as squares of $336 \times 336$ pixels.
This is done in order to provide participants with the same experimental setting for visual inputs. 
Furthermore, this allows easier use of Large Language Models supporting multimodal inputs: since they treat visual inputs as sequences of tokens, smaller images are processed more efficiently than higher-resolution ones.

\section{Evaluation} 

\subsection{Metrics}
\label{sec:metrics}

We will evaluate the models using the two metrics considered in the original challenge: HIT@1 and MRR.
Given $r = [r_1, \dots, r_n]$, where $n$ is the cardinality of the test set and $r_i$ is the rank of the correct image given as output by the model for the $i$-th instance, MRR is defined as:

\begin{equation}
    \mathrm{MRR} = \frac{1}{n} \sum_{i=1}^{n}{\frac{1}{r_i}}
\end{equation}

\noindent This metric is used to evaluate the quality of the ranking; the closer $r_i$ is to 1 (i.e., the first position in the ranking), the better the result. HIT@1 is defined as:

\begin{equation}
    \mathrm{HIT@1} = \frac{1}{n} \sum_{i=1}^{n}{I(r_i)}
\end{equation}

\noindent where $I$ is a function that returns 1 if $r_i = 1$ (i.e., the correct image is ranked first), and 0 otherwise.
Therefore, this metric assesses the model's ability to select the correct image as the best possible candidate. 

\subsection{Baseline}

We use the same baseline that was used in VWSD for SemEval 2023, that is a multilingual CLIP model\footnote{\url{https://huggingface.co/sentence-transformers/clip-ViT-B-32-multilingual-v1}} trained using the sentence-transformers library \citep{reimers-2019-sentence-bert}.
We do not perform additional fine-tuning of the model; we use the pre-trained model directly.
We extract the embedding for the query text in the multimodal embedding space of the multilingual CLIP model, then we extract the embeddings for all candidate images for that query.
We compute the cosine similarity for the query embedding with respect to all image embeddings and rank the candidate images in descending order with respect to cosine similarity (image with highest similarity at the top of the list).
After obtaining the ranked list, we can compute HIT@1 and MRR.
This method provides us with a strong baseline that doesn't include additional information from external sources or ensemble models.

\subsection{Participants}

One team---UniTor (University of Tor Vergata)---participated in the task and submitted three distinct runs.
In this section, we briefly describe the system proposed by said participants.

\subsubsection{UniTor}

The team proposed a five-stage pipeline: semantic analysis and weighted binary question generation, visual assessment, visual question-answering confidence estimation, multimodal semantic matching, and multi-channel fusion and ranking.
The final scoring used in the ranking is computed using different scores obtained in the pipeline.
Specifically, the scoring combines a visual-question answering score and a semantic similarity score obtained from a CLIP model.
Semantic analysis and question generation are performed using an LLM (GPT-5.1).
The visual question-answering score is obtained by leveraging the probability that a smaller LLM (Qwen3-VL) generates a ``Yes/No'' answer, while the similarity score is computed between a textual description and the candidate image.
We define the scoring methodology to report the results as follows:
\begin{itemize}
    \item Score VQA: visual question-answering score obtained by leveraging the probability of generating a ``Yes/No'' answer;
    \item Score VQA (Two-Step): visual question-answering score obtained by leveraging the probability of generating a ``Yes/No'' answer. The questions for this score are generated using a stricter parsing approach;
    \item Score CLIP (Query): similarity score obtained by a CLIP model using the candidate image and the original query;
    \item Score CLIP (Description): similarity score obtained by a CLIP model using the candidate image and the description generated by an LLM.
\end{itemize}
Complete details regarding the participants' solution are provided in their report \citep{unitorev}.

\section{Results}

We report results for HIT@1 in \Cref{tab:hit_leaderboard} and for MRR in \Cref{tab:mrr_leaderboard}.
Overall the second system proposed by UniTor performed better than all others in both the HIT@1 and MRR leaderboards.
Still, the first proposed system performed remarkably well in terms of MRR (.8100 against the .8182 of the best system).
This shows that the rankings produced by the first system are nearly as good as those of the best system.
Furthermore, we highlight that all proposed systems by UniTor significantly outperformed the baseline.
This highlights that their proposed solution was capable of overcoming a system based on pure semantic similarity scoring. 
Finally, the system proposed by UniTor leveraged Large Language Models supporting multimodal inputs.
This highlights current interest in multimodal research, where researchers are not strictly interested in extraction of embeddings from vision-language encoder models, but are interested in leveraging Large Language Models supporting multimodal inputs for task resolution.

\begin{table}[htb]
\centering
\large
\begin{tabular}{lll} 
\toprule
\textbf{Team} & \textbf{Model} & \textbf{Score} \\
\midrule
baseline & clip-ViT-B-32-multilingual-v1 & .4505 \\
\midrule
UniTor & Score VQA + Score CLIP (Query) & .6937 \\
UniTor & Score VQA + Score CLIP (Query) + Score CLIP (Description) & \textbf{.7117} \\
UniTor & Score VQA (Two-Step) + Score CLIP (Query) + Score CLIP (Description) & .6667 \\
\bottomrule
\end{tabular}
\caption{HIT@1 Leaderboard}
\label{tab:hit_leaderboard}
\end{table}

\begin{table}[htb]
\centering
\large
\begin{tabular}{lll} 
\toprule
\textbf{Team} & \textbf{Model} & \textbf{Score} \\
\midrule
baseline & clip-ViT-B-32-multilingual-v1 & .6244 \\
\midrule
UniTor & Score VQA + Score CLIP (Query) & .8100 \\
UniTor & Score VQA + Score CLIP (Query) + Score CLIP (Description) & \textbf{.8182} \\
UniTor & Score VQA (Two-Step) + Score CLIP (Query) + Score CLIP (Description) & .7770 \\
\bottomrule
\end{tabular}
\caption{MRR Leaderboard}
\label{tab:mrr_leaderboard}
\end{table}

\section{Conclusions}

In EVWSD-ITA, we proposed the first dataset for VWSD for fine-grained and high-level semantics in Italian.
Given a query containing a target word and additional context, we combine both images representing other senses of the target word (high-level semantics) and hard negatives obtained by exploiting the co-Hyponyms relation (fine-grained semantics).
We provide an extensive train set obtained from BabelNet and a manually annotated test set. 
The EVWSD-ITA task was approached by a single team that proposed a solution based on Visual Question-Answering and semantic similarity obtained from a CLIP model.
All runs submitted by the team surpassed the proposed baseline on both metrics considered in this task (HIT@1 and MRR), highlighting the impact of their scoring strategy in VWSD.


%%
%% The acknowledgments section is defined using the "acknowledgments" environment
%% (and NOT an unnumbered section). This ensures the proper
%% identification of the section in the article metadata, and the
%% consistent spelling of the heading.
\begin{acknowledgments}
We acknowledge the support of the PNRR project FAIR - Future AI Research (PE00000013), Spoke 6 - Symbiotic AI (CUP H97G22000210007) under the NRRP MUR program funded by the NextGenerationEU. 
\end{acknowledgments}

%% The declaration on generative AI comes in effect
%% in January 2025. See also
%% https://ceur-ws.org/GenAI/Policy.html
\section*{Declaration on Generative AI}

The author(s) have not employed any Generative AI tools.

%%
%% Define the bibliography file to be used
\bibliography{sample-ceur}

%%
%% If your work has an appendix, this is the place to put it.
\appendix

\end{document}

%%
%% End of file
