%% The first command in your LaTeX source must be the \documentclass command.
%%
%% Options:
%% twocolumn : Two column layout. Do not use twocolumn for papers submitted to CEUR-WS!
%% hf: enable header and footer.
\documentclass[
% twocolumn,
% hf,
]{ceurart}

%%
%% One can fix some overfulls
\sloppy

%%
%% Code listings support (via the listings package)
%% Unlike minted, listings does not require external tools such as Pygments
\usepackage{listings}
\usepackage{xcolor}
\usepackage{booktabs} 
\usepackage{multirow}
\usepackage{tabularx}
\usepackage{colortbl}

%% auto break lines
\lstset{breaklines=true}

\newcommand{\claudio}[1]{ {\color{cyan} CS #1}} 
\newcommand{\moreno}[1]{ {\color{blue} MLQ #1} }
\newcommand{\flavio}[1]{ {\color{olive} FG #1} }
\newcommand{\alkis}[1]{ {\color{red} AK: #1}} 

\definecolor{catBio}{RGB}{230,245,255}
\definecolor{catCareer}{RGB}{232,255,236}
\definecolor{catAch}{RGB}{255,242,230}
\definecolor{catPers}{RGB}{255,235,245}

%%
%% end of the preamble, start of the body of the document source.
\begin{document}

%%
%% Rights management information.
%% CC-BY is default license.
\copyrightyear{2026}
\copyrightclause{Copyright for this paper by its authors.
  Use permitted under Creative Commons License Attribution 4.0
  International (CC BY 4.0).}

%%
%% This command is for the conference information
\conference{EVALITA 2026: 9th Evaluation Campaign of Natural Language Processing and Speech Tools for Italian, Feb 26--27, Bari, IT}

%%
%% The "title" command
\title{SVELA at EVALITA 2026: Overview of the Selective Verification of Erasure from LLM Answers Task}

% \tnotemark[1]
% \tnotetext[1]{You can use this document as the template for preparing your
  % publication. We recommend using the latest version of the ceurart style.}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
\author[1]{Claudio Savelli}[%
orcid=0000-0002-0877-7063,
email=claudio.savelli@polito.it,
]
\cormark[1]
\address[1]{Politecnico di Torino,
  Corso Duca degli Abruzzi, 24, 10129 Torino, Italy}
\address[2]{Università degli Studi di Enna ``Kore'', Piazza dell'Università, 94100 Enna, Italy}

\author[2]{Moreno {La Quatra}}[%
orcid=0000-0001-8838-064X,
email=moreno.laquatra@unikore.it,
]

\author[1]{Alkis Koudounas}[%
email=alkis.koudounas@polito.it,
]
\author[1]{Flavio Giobergia}[%
email=flavio.giobergia@polito.it,
]

%% Footnotes
\cortext[1]{Corresponding author.}
% \fntext[1]{These authors contributed equally.}

%%
%% The abstract is a short summary of the work to be presented in the
%% article.
\begin{abstract}
This paper presents SVELA (Selective Verification of Erasure from LLM Answers), a shared task at EVALITA 2026.
SVELA challenges participants to develop methods that verify whether a Large Language Model has successfully forgotten specific information.
Given models that have undergone unlearning, participants must classify fictional identities or individual facts as retained, forgotten, or never seen during training.
The task provides two complementary subtasks: entity-level detection, where entire identities are classified, and instance-level detection, where individual question-answer pairs are evaluated.
The task attracted eight registered teams, four of which submitted system description papers, and resulted in more than fifty valid submissions across the two subtasks. 
The evaluation highlights the intrinsic difficulty of unlearning verification, particularly at the instance level, where less aggregated evidence is available and finer-grained distinctions between retained, forgotten, and never-used information are required.

% \moreno{Add: number of participants, number of submissions, key findings after the evaluation phase.}
\end{abstract}

%%
%% Keywords. The author(s) should pick words that accurately describe
%% the work being presented. Separate the keywords with commas.
\begin{keywords}
  Machine Unlearning \sep
  Evaluation Metrics \sep
  Large Language Models \sep
  Italian NLP \sep
  EVALITA 2026
\end{keywords}

%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.
\maketitle



\section{Motivation}
Large Language Models (LLMs) acquire vast amounts of information during training.
This information may include personal data, copyrighted content, or other sensitive material that users or regulations may require to be removed~\cite{carlini2022quantifying}.
Privacy regulations, such as the European Union's General Data Protection Regulation (GDPR), grant individuals the right to request the deletion of their personal data from automated systems~\cite{mantelero2013eu}.
However, retraining these models from scratch after removing specific data is prohibitively expensive, often requiring weeks of computation and millions of dollars~\cite{hoffmann2022training}.

Machine Unlearning (MU) offers a practical alternative by removing the influence of specific data from trained models without full retraining~\cite{bourtoule2021machine}.
Several unlearning methods have been proposed for LLMs, including gradient-based approaches, optimization techniques, and preference-based alignment~\cite{liu2025rethinking,yao-etal-2024-machine}.
These methods aim to make a model behave as if it had never seen the targeted data.

 

% While unlearning methods continue to improve, a critical challenge remains: how do we verify that unlearning actually worked?
% Current evaluation approaches face several limitations.
% First, most benchmarks rely on comparing against models retrained from scratch, which defeats the purpose of efficient unlearning~\cite{maini2024tofu}.
% Second, existing metrics often fail to provide reliable sample-level assessments, making it difficult to verify whether individual facts have been forgotten~\cite{thaker2025position}.
% Third, multilingual scenarios remain largely unexplored, despite evidence that unlearning in one language does not necessarily transfer to others~\cite{choi-etal-2024-cross}.

% SVELA addresses these gaps by providing a controlled setting for developing and comparing unlearning verification methods.
% The task uses synthetic data about fictional identities, ensuring that no real personal information is involved and that the ground truth is perfectly known.
% By covering multiple languages, including Italian, Spanish, French, and German, SVELA enables evaluation of verification methods in multilingual contexts.
% The task invites participants to develop robust metrics that can reliably distinguish between information that a model remembers, has forgotten, or never learned.

MU is commonly evaluated along three complementary dimensions: utility, measuring the extent to which model performance on retained data is preserved; efficacy, assessing whether the influence of the targeted data has been effectively removed; and efficiency, capturing the computational cost of the unlearning procedure. Recent benchmarking efforts have formalized this perspective by proposing unified evaluation frameworks that explicitly account for all three axes \cite{koudounas25c_interspeech, andrea2025make, d2025erasure}. 

In this work, SVELA focuses specifically on the problem of verifying the efficacy of unlearning in LLMs. The task targets the downstream verification setting, where a trained model is given and the goal is to determine, through post-hoc analysis, whether specific information has been successfully forgotten. To this end, SVELA provides a controlled experimental setting for the development and comparison of unlearning verification methods.

The task relies on synthetic data describing fictional identities, ensuring that no real personal information is involved and that the ground truth is perfectly known. By covering multiple languages, including Italian, Spanish, French, and German, SVELA enables the evaluation of verification methods in multilingual contexts. Participants are invited to design robust verification approaches capable of reliably distinguishing between information that a model retains, has forgotten, or has never been exposed to.


SVELA is proposed within the framework of EVALITA 2026~\cite{evalita2026overview}, the evaluation campaign for Natural Language Processing and Speech tools for Italian. This positioning underscores the importance of developing privacy-preserving AI technologies specifically tailored to the needs of the Italian NLP community. As European institutions, and Italian ones in particular, navigate the complexities of GDPR compliance, the ability to selectively forget data becomes a critical capability. By benchmarking unlearning methods on Italian data alongside those from other languages, SVELA supports EVALITA's mission to advance the robustness and reliability of Italian-language technologies in an era of increasing regulatory scrutiny.

% \moreno{Consider adding a paragraph on why this matters for the Italian NLP community specifically, connecting to EVALITA's mission. Plus the citation of course, like something: 
% SVELA is part of EVALITA 2026~\cite{evalita2026overview}, the evaluation campaign for Natural Language Processing and Speech tools for Italian.
% }

% \alkis{what about something like: 
% SVELA is proposed within the framework of EVALITA 2026~\cite{evalita2026overview}, the evaluation campaign for Natural Language Processing and Speech tools for Italian. This positioning underscores the importance of developing privacy-preserving AI technologies specifically tailored to the needs of the Italian NLP community. As European institutions, and Italian ones in particular, navigate the complexities of GDPR compliance, the ability to selectively forget data becomes a critical capability. By benchmarking unlearning methods on Italian data alongside those from other languages, SVELA supports EVALITA's mission to advance the robustness and reliability of Italian-language technologies in an era of increasing regulatory scrutiny.}

\section{Task Definition}
SVELA evaluates the ability of participant-developed methods to verify whether MU has succeeded.
Unlike traditional unlearning tasks, where participants apply forgetting techniques, SVELA focuses on the complementary problem of detecting what a model knows or has forgotten.

\subsection{Task Setup}
\label{sec:task-setup}

Participants receive two components.
First, they receive access to a set of LLMs $\mathcal{M} = \{M_1, M_2, \ldots, M_k\}$ of different sizes.
Each model has been fine-tuned on a set of fictional actor biographies and subsequently processed with a state-of-the-art unlearning method.
The specific unlearning method applied to each model is hidden from participants.
Second, participants receive a set of fictional identities, $\mathcal{I}$, along with questions that can be asked about them.
Each identity $i \in \mathcal{I}$ is associated with a set of questions $\mathcal{Q} = \{q_1, q_2, \ldots, q_{20}\}$, one for each atomic fact in the biography.

For a given model $M$, each identity (Subtask 1) or question-identity pair (Subtask 2) belongs to exactly one of three classes:
\begin{itemize}
    \item \textit{Retain}: the information was used to train the model and was not targeted for unlearning.
    \item \textit{Forget}: the information was in the training data but was later unlearned.
    \item \textit{Never-used}: the information was never part of training.
\end{itemize}

\vspace{2mm}
\noindent
\textbf{Problem Definition.}
Let $\mathcal{Y} = \{\texttt{retain}, \texttt{forget}, \texttt{never-used}\}$ denote the set of possible labels.
For Subtask~1, participants must develop a method $f_1: \mathcal{M} \times \mathcal{I} \rightarrow \mathcal{Y}$ that takes a model and a sequence of questions about an identity as input and outputs a predicted label.
For Subtask~2, participants must develop a method $f_2: \mathcal{M} \times \mathcal{I} \times \mathcal{Q} \rightarrow \mathcal{Y}$ that takes a model, an identity, and a question as input and outputs a predicted label.
The objective is to maximize the macro-averaged F1 score across all models.

Participants may adopt either black-box approaches, which rely solely on querying the model and analyzing its outputs, or white-box approaches, which also exploit access to model internals, such as weights, activations, or gradients.

\subsection{Subtask 1: Entity-Level Detection}
In the first subtask, participants classify entire identities.
Each fictional actor $i \in \mathcal{I}$ belongs exclusively to one category with respect to a given model $M$.
The participant's method $f_1$ must analyze the model's behavior across available questions $q$ and produce a single classification for the identity.
This subtask evaluates whether verification methods can detect complete identity removal.

\subsection{Subtask 2: Instance-Level Detection}
In the second subtask, participants classify individual question-identity pairs.
For a model $M$, questions about the same identity $i$ may have different labels: some facts may be retained, others forgotten, and others never learned.
The participant's method $f_2$ must produce a classification for each pair $(i, q)$ where $q \in \mathcal{Q}$.
This subtask evaluates fine-grained verification capabilities, reflecting real-world scenarios where partial information removal is requested.

\subsection{Submission Format}

To support method development and reproducibility, the task provides a set of publicly available models that participants may use during the development and validation phases. In addition, the final evaluation is carried out on an extended set of models $\mathcal{M}' \supset \mathcal{M}$, which comprises the public models together with additional hidden ones. These hidden models differ in their unlearning configurations and are not accessible to participants before submission.

At the end of the evaluation phase, participants submit runnable code rather than pre-computed predictions. The submitted implementation of $f_1$ or $f_2$ must accept a model and a set of queries as input, then output predictions for each identity or question-identity pair. Each submission is run on both the public and hidden models to generate predictions, which are then used to compute the official evaluation metrics. This evaluation protocol ensures that all results are obtained under identical conditions and that reported scores faithfully reflect each method's actual behavior and generalization ability, preventing post-hoc tuning or result manipulation.

% Participants submit runnable code rather than pre-computed predictions.
% The submitted implementation of $f_1$ or $f_2$ must accept a model and a set of queries as input, then output predictions for each identity or question-identity pair.
% The organizers execute all submissions on hidden model configurations $\mathcal{M}' \supset \mathcal{M}$ to produce the final rankings.
% This design aims to prevent overfitting to specific models or unlearning methods and tests the generalizability of proposed approaches.


% \moreno{Add technical details: programming language requirements (?), execution environment, time limits (?), API specifications for submissions.}

\section{Dataset}
SVELA uses the FAME (Fictional Actors for Multilingual Erasure) dataset~\cite{savelli2025fame} as its evaluation data.
All data is synthetically generated, ensuring that no real personal information is involved and that the ground truth status of each fact is perfectly known.

\subsection{Data Structure}
Each fictional actor is described through a structured biography containing exactly 20 atomic facts. 
These facts are organized into four semantic categories:
\begin{itemize}
    \item \colorbox{catBio}{\textit{Biography}} (5 facts): birthplace, birthdate, high school, family background, and education.
    \item \colorbox{catCareer}{\textit{Career}} (7 facts): first role, breakthrough project, genre specialization, notable award, major collaboration, film festival participation, and international project.
    \item \colorbox{catAch}{\textit{Achievements}} (3 facts): box-office success, critical acclaim, and directorial award.
    \item \colorbox{catPers}{\textit{Personal}} (5 facts): life event, hobby or interest, address, phone number, and email.
\end{itemize}

Each atomic fact corresponds to exactly one question-answer pair.
This one-to-one mapping enables precise measurement of which specific information a model retains or forgets.
Every question explicitly includes the full name of the fictional actor to identify the subject unambiguously.
Table~\ref{tab:example_qa_identity} provides a concrete example of this structure, illustrating all 20 question-answer pairs associated with a single identity, grouped by semantic category.


\input{tables/qas_table_example}

\subsection{Multilingual Coverage}

The SVELA dataset is a subset of the FAME dataset and spans four languages: Italian, Spanish, French, and German.
Each language subset contains fictional identities with culturally appropriate names, birthplaces, and other attributes.
Names are generated to match the naming conventions of each language's primary country.
Birthplaces are sampled from cities within the corresponding country to maintain geographic coherence.

\subsection{Data Splits}
For each model configuration, identities and facts are divided according to the three classes defined in Section~\ref{sec:task-setup}: retain, forget, and never-used.
For Subtask 1, each identity belongs entirely to one class.
For Subtask 2, individual facts from the same identity may belong to different classes, enabling the evaluation of partial-forgetting scenarios.

\begin{table}[t]
\centering
\caption{Dataset statistics. Numbers in parentheses for each class indicate the number of identities (used in Subtask~1).}
\label{tab: svela_dataset_stats}
\begin{tabular}{lcccccc}
\toprule
Split & \# Identities & Facts / Identity & \# QA pairs & Retain (IDs) & Forget (IDs) & Never-used (IDs) \\
\midrule
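% NOTE(review): # QA pairs = # Identities x 20 facts, which also equals Retain + Forget + Never-used.
% TODO: the caption announces per-class identity counts in parentheses; add them to the class columns.
Single Language  &  200 & 20 & 4000 & 2560 & 640 & 800 \\
\textbf{Total} & 800 & -- & 16000 & 10240 & 2560 & 3200 \\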
\bottomrule
\end{tabular}
\end{table}


% \moreno{Add table with dataset statistics: number of identities per language, number of QA pairs, split sizes for each subtask.}

\section{Evaluation Measures}
Participant methods are evaluated on their ability to correctly classify identities or question-answer pairs into the three categories: retain, forget, and never-used.

\subsection{Primary Metrics}
We treat the verification task as a three-class classification problem.
The primary evaluation metric is the macro-averaged F1 score, computed as the unweighted average of F1 scores across the three classes.
This choice ensures that all classes are weighted equally, regardless of their size.

We also report per-class precision, recall, and F1 scores to provide detailed insight into the method's behavior.
A method that achieves high overall F1 but fails on one specific class (e.g., cannot detect forgotten information) would be revealed through these per-class metrics.

\subsection{Generalization Assessment}
A key goal of SVELA is to assess whether verification methods generalize across different models and unlearning conditions, rather than overfitting to a specific configuration. To this end, the submitted methods are evaluated on multiple model configurations that vary along two orthogonal dimensions:
\begin{itemize}
    \item \textit{Model size}: evaluation is performed on two instruction-tuned backbone models with different parameter counts, namely Llama-3.2-1B-Instruct\footnote{\href{https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct}{\texttt{meta-llama/Llama-3.2-1B-Instruct}}} (1B parameters) and Llama-3.2-3B-Instruct\footnote{\href{https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct}{\texttt{meta-llama/Llama-3.2-3B-Instruct}}} (3B parameters), to assess the scalability of verification approaches across model capacity.
    \item \textit{Unlearning method}: for each backbone, models are processed using one of five distinct unlearning techniques, namely \textit{Fine-Tuning} (FT), \textit{Gradient Ascent} (GA) \cite{golatkar2020eternal}, \textit{Gradient Difference} (GD) \cite{choi2023towards,kurmanji2024towards}, \textit{KL Minimization} (KLM) \cite{maini2024tofu}, and \textit{Preference Optimization} (PO) \cite{maini2024tofu}, covering different strategies to evaluate robustness with respect to the specific forgetting mechanism applied.
\end{itemize}

While participants are provided with a set of publicly available models for development and validation purposes, the exact combination of model size and unlearning method used for each hidden configuration is not disclosed before evaluation. This design choice prevents adaptation to specific unlearning behaviors and encourages the development of model- and method-agnostic verification strategies.

The final ranking is computed by averaging performance across all hidden configurations. As a result, methods that perform well only for a particular model size or unlearning technique are penalized, whereas approaches that demonstrate consistent behavior across configurations are rewarded. This evaluation protocol ensures that leaderboard results faithfully reflect the generalization capability of the proposed verification methods rather than their performance on a single, fixed setup.

Additional details on the model architectures, unlearning methods, and experimental configurations underlying the hidden setups are found in the original work \cite{savelli2025fame}.

% \moreno{Specify: how many model sizes? which unlearning methods? what are the exact configurations? Or keep this hidden?}
% \moreno{I may have missed something here}

\section{Baseline System}
We provide a simple black-box baseline that frames unlearning verification as a supervised classification problem over model behavioral signatures. The same approach is used for both subtasks, differing only in the final aggregation step required for entity-level predictions.

Given a model $M$ and a query $q$, we generate a short (40-token) deterministic response using greedy decoding. For each generation step, we average the model's logits across the entire vocabulary to produce a scalar value. By applying this to all steps, we obtain a fixed-length (40-dimensional) ``feature vector'' that summarizes the model's behavior for each question. Feature vectors are extracted for all training samples belonging to the three classes (\texttt{retain}, \texttt{forget}, and \texttt{never-used}). 
Following the classical membership inference attack (MIA) evaluation in the unlearning setting~\cite{mu_survey}, a multinomial logistic regression classifier is trained on the extracted feature vectors to predict one of the three classes. At inference time, the same feature extraction procedure is applied to the evaluation set. For Subtask~2, the classifier directly predicts a label for each question-identity pair. For Subtask~1, predictions are pooled across all questions associated with the same identity via majority voting to produce a single label per entity.

This baseline is intentionally naive and does not rely on access to model internals beyond generation scores. While the adopted logit aggregation strategy is coarse and does not explicitly model semantic correctness or uncertainty, it provides a reference point for comparing more sophisticated verification methods that exploit richer signals (e.g., intermediate activations) or more sophisticated aggregation strategies (e.g., the entropy of the output distribution).

\section{Participant Methods}
This section provides a high-level overview of the approaches proposed by task participants. The submitted methods exhibit substantial methodological diversity, reflecting different assumptions about model access and different strategies for verifying unlearning. While all approaches address the same verification objective, they differ in the types of signals extracted from the model, the granularity of predictions, and the degree to which model internals are exploited.

In the following, we summarize the main characteristics of the participant methods without entering into implementation details, which are instead described in the respective system papers.

\paragraph{Team priyam\_saha17 \cite{priyam_saha17}}
proposes a verification approach based on transformer-derived feature extraction followed by lightweight supervised classification. Each instance is encoded using a fixed prompt template and processed in inference-only mode by the provided unlearned transformer model, from which hidden-state embeddings and confidence-related signals are extracted. To obtain compact representations, the method applies dimensionality reduction to the embeddings and augments them with auxiliary uncertainty features such as prediction entropy and logit margins. The resulting feature vectors are then classified using a small residual neural network trained on labeled verification data. The approach is fully modular, keeps the underlying language model frozen, and is applicable to both instance-level and entity-level settings via appropriate aggregation.

\paragraph{Team MALTO \cite{malto}} 
proposes a white-box verification method inspired by entropy-based membership inference attacks, extended to exploit attention-level signals. The approach identifies the most responsive attention heads separately for the three classes using adapted LAHIS scores and extracts head-specific features such as head attention entropy and head focus. In addition, a set of layer-level attention descriptors, position-based features, and layer transition features is computed, including measures of attention concentration, sparsity, self-attention bias, and attention distance. All extracted features are combined and used to train a multi-layer perceptron classifier that predicts the verification label. By focusing on a subset of highly responsive attention heads, the method also provides insights into which components of the attention mechanism are most involved in the forgetting process.

\paragraph{Team ItaLib \cite{ita-lib}}
proposes a white-box verification approach that directly analyzes the language model's internal representations to detect residual knowledge after unlearning. For each query, the method performs a forward pass through the backbone LLM and extracts hidden states from the last four transformer layers, which are then aggregated using mean pooling to obtain a compact representation of the model’s internal state. These pooled features are subsequently fed into a supervised multi-layer perceptron trained to classify instances into the three classes. By focusing on latent activations rather than generated text, the approach aims to identify traces of memorized information that may persist even when the model’s outputs appear uninformative. It is worth noting that the authors participated only in the first task of the challenge.

\paragraph{Team Eraserhead \cite{eraserhead}}
proposes a white-box verification pipeline based on the analysis of token-level logit distributions produced by a causal language model. For each query, the model generates a short completion and exposes the corresponding generation logits, from which a set of statistical features is derived by aggregating vocabulary-level and sequence-level descriptors. These features are used to train standard supervised classifiers to predict the verification label. The same pipeline is applied to both subtasks, with entity-level predictions obtained by aggregating features across all questions associated with the same identity. Model selection is performed via cross-validation using macro-averaged F1, and the final predictions are produced using the best-performing configuration.

\section{Results and Discussion}

This section presents the results obtained by participant systems and discusses the main empirical findings. We first report and compare performance across tasks and model sizes, highlighting differences between the entity-level and instance-level settings. We then analyze the observed performance trends in more detail.


\input{tables/results_task_a}
\input{tables/results_task_b}

\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{images/combined_f1_analysis_2.pdf}
    \caption{Distribution of F1 scores across classes and unlearning methods. 
    \textbf{Left}: per-class F1 scores for the three classes, aggregated over all participant methods and model configurations. 
    \textbf{Right}: average F1 score obtained by each unlearning method, with error bars indicating standard deviation across configurations.}
    \label{fig:f1_analysis}
\end{figure}


\subsection{Task-wise and Model-wise Results}

Tables~\ref{tab:task1_results} and~\ref{tab:task2_results} report the official results for Task 1 and Task 2, respectively, following the final leaderboard ranking. Performance is measured using macro-averaged F1, with per-class F1 scores reported for completeness, and results are shown separately for the 1B and 3B model configurations. Across both tasks, the relative ordering of participant methods is largely consistent between the two model sizes, with the 3B models yielding modest but consistent improvements in macro-F1. Notably, \textit{priyam\_saha17} achieves the highest macro-F1 in both Task 1 and Task 2, for both the 1B and 3B settings.

A comparison between the two task settings shows that Task 2 is consistently more challenging than Task 1, as reflected in lower macro-F1 scores across all submissions. This gap can be attributed to the increased granularity of the instance-level setting, where different facts associated with the same identity may belong to different verification classes. In contrast, the aggregation inherent to the entity-level task reduces local ambiguity by pooling evidence across multiple queries and makes the evaluation easier, an effect also observed in prior unlearning benchmarks \cite{maini2024tofu}.

\subsection{Discussion}

A closer inspection of the per-class F1 distributions, summarized in Figure~\ref{fig:f1_analysis} and reflected in Tables~\ref{tab:task1_results} and~\ref{tab:task2_results}, reveals a clear asymmetry across verification classes. As expected, the retain class consistently achieves high F1 scores with relatively low variance, indicating that preserved knowledge is comparatively easy to identify. In contrast, both the forget and never-used classes exhibit substantially lower performance. This behavior is consistent with the unlearning setting: after forgetting, model outputs for deleted facts are intentionally degraded and often resemble responses for information that was never observed during training, making these two classes harder to disentangle~\cite{shi2024muse, savelli2025malto}.

Finally, the aggregation of results by unlearning method (Figure~\ref{fig:f1_analysis}, right) reveals limited but consistent differences across techniques, with largely overlapping performance distributions. Among the evaluated approaches, PO achieves slightly higher average F1 scores on the Forget split. A plausible explanation is that preference-based unlearning explicitly encourages abstention or uncertainty responses (e.g., ``Non conosco la risposta a questa domanda''/``I don't know the answer to this question'') for forgotten content, thereby producing more distinguishable behavioral patterns than optimization-based methods that primarily reduce likelihood or confidence, and making it easier to separate the forget set from the never-used set.


\section{Conclusions}

This paper introduces SVELA, a shared task at EVALITA 2026 focused on verifying machine unlearning in LLMs. The task addresses a core yet underexplored problem: determining whether a model has effectively forgotten specific information. To this end, SVELA defines two complementary subtasks---entity-level and instance-level verification---and evaluates submitted methods across multiple model sizes and unlearning strategies.

The results highlight several consistent trends. First, instance-level verification proves more challenging than entity-level verification, confirming that fine-grained assessment of forgetting remains difficult even when unlearning is explicitly applied. Second, per-class analysis reveals a marked, though expected, asymmetry: retained information is comparatively easy to detect, whereas forgotten and never-used content are significantly harder to disentangle. This observation aligns with prior findings in the unlearning literature and reflects the intrinsic ambiguity introduced by unlearning objectives that intentionally degrade model confidence. Third, while differences across unlearning methods are generally limited, preference-based approaches tend to yield slightly more separable behaviors.

Overall, the diversity of participant approaches, ranging from logit-based behavioral analysis to white-box inspection of internal representations, demonstrates growing interest in unlearning verification as a distinct research problem. At the same time, the modest performance levels and overlapping results across configurations indicate that reliable verification of forgetting in LLMs remains an open challenge.

We hope that SVELA will serve as a starting point for future work on unlearning application and verification, encouraging the development of methods that generalize across models, unlearning techniques, and analysis granularities. 

\begin{acknowledgments}
This study was carried out within the FAIR - Future Artificial Intelligence Research and received funding from the European Union Next-GenerationEU (PIANO NAZIONALE DI RIPRESA E RESILIENZA (PNRR) – MISSIONE 4 COMPONENTE 2, INVESTIMENTO 1.3 – D.D. 1555 11/10/2022, PE00000013). This manuscript reflects only the authors' views and opinions, neither the European Union nor the European Commission can be considered responsible for them. 
\end{acknowledgments}

\section*{Declaration of Generative AI}

During the preparation of this work, the authors used GPT-5.2 for grammar and spelling checks.
After using this tool, the authors reviewed and edited the content as needed and take full responsibility for the publication’s content.

%%
%% Define the bibliography file to be used
\bibliography{sample-ceur}

%%
%% If your work has an appendix, this is the place to put it.
% \appendix

% \section{Online Resources}


% The sources for the ceur-art style are available via
% \begin{itemize}
% \item \href{https://github.com/yamadharma/ceurart}{GitHub},
% % \item \href{https://www.overleaf.com/project/5e76702c4acae70001d3bc87}{Overleaf},
% \item
%   \href{https://www.overleaf.com/latex/templates/template-for-submissions-to-ceur-workshop-proceedings-ceur-ws-dot-org/pkfscdkgkhcq}{Overleaf
%     template}.
% \end{itemize}

\end{document}

%%
%% End of file
