%File: aaai2026-unified-supp.tex
%
% UNIFIED AAAI 2026 SUPPLEMENTARY MATERIAL TEMPLATE
% To switch between anonymous submission and camera-ready versions,
% simply change the next line:
%
% For ANONYMOUS SUBMISSION: uncomment the next line
% \def\aaaianonymous{true}
%
% For CAMERA-READY VERSION: comment out or delete the next line
\def\aaaianonymous{true}
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\documentclass[letterpaper]{article} % DO NOT CHANGE THIS

% Conditional package loading based on version
\ifdefined\aaaianonymous
    \usepackage[submission]{aaai2026}  % Anonymous submission version
\else
    \usepackage{aaai2026}              % Camera-ready version
\fi

\usepackage{times}  % DO NOT CHANGE THIS
\usepackage{helvet}  % DO NOT CHANGE THIS
\usepackage{courier}  % DO NOT CHANGE THIS
\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm}  % DO NOT CHANGE THIS
\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing  % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS

% These are recommended to typeset algorithms but not required.
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amsmath}
\usepackage{amssymb}

% These are recommended to typeset listings but not required.
\usepackage{newfloat}
\usepackage{listings}
\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
\lstset{% 
	basicstyle={\footnotesize\ttfamily},
	numbers=left,numberstyle=\footnotesize,xleftmargin=2em,
	aboveskip=0pt,belowskip=0pt,
	showstringspaces=false,tabsize=2,breaklines=true}
\floatstyle{ruled}
\newfloat{listing}{tb}{lst}{}
\floatname{listing}{Listing}

% Extra packages
\usepackage{float}
\usepackage{booktabs}
\usepackage{array}

\newcommand{\tablesizeAAAI}{%
  % 9 pt text on 10.5 pt baselineskip = AAAI‑legal minimum
  \fontsize{9}{10.5}\selectfont
  % tighten inter‑column spacing (default is 6pt)
  \setlength{\tabcolsep}{3pt}%
}
\newcolumntype{Y}{>{\centering\arraybackslash}p{0.6cm}}

\pdfinfo{
/TemplateVersion (2026.1)
}

\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.

% Title - conditionally set based on version
\ifdefined\aaaianonymous
    \title{AAAI 2026 Supplementary Material\\Anonymous Submission}
\else
    \title{AAAI 2026 Supplementary Material\\Camera Ready}
\fi

% Author and affiliation information
\ifdefined\aaaianonymous
\author{
    Anonymous Submission
}
\affiliations{
    % Leave affiliations empty for anonymous submission
}
\else
\author{
    %Authors
    Written by AAAI Press Staff\textsuperscript{\rm 1}\thanks{With help from the AAAI Publications Committee.}\\
    AAAI Style Contributions by Pater Patel Schneider,
    Sunil Issar,\\
    J. Scott Penberthy,
    George Ferguson,
    Hans Guesgen,
    Francisco Cruz\equalcontrib,
    Marc Pujol-Gonzalez\equalcontrib
}
\affiliations{
    \textsuperscript{\rm 1}Association for the Advancement of Artificial Intelligence\\
    1101 Pennsylvania Ave, NW Suite 300\\
    Washington, DC 20004 USA\\
    proceedings-questions@aaai.org
}
\fi

\begin{document}

\maketitle\onecolumn

% \begin{abstract}
% This document provides supplementary material for the main paper, including additional experiments, derivations, data, figures, algorithms, and other relevant content. Please add detailed information as needed. This supplementary material is submitted together with the main paper to further support and complement the main findings.
% \end{abstract}

% ----------- Supplementary Content Starts Here -----------

\section{Appendix A.1: Dataset Overview}

Table~\ref{tab:tbl_a01_dataset_sources} lists the sources and licenses for all datasets used in the BTZSC benchmark. All datasets are publicly available on Hugging Face Datasets\footnote{\url{https://huggingface.co/datasets}}. We use the same label verbalizers as \citet{LaurerZSC2023}.

% ----------  appendix table ----------
\begin{table}[H]
\centering
\begin{tabular}{llll}
\toprule
Domain & Dataset & Source & License \\
\midrule
\textbf{Emotion}\\
dialogue        & empathetic\_dialogues & \cite{Rashkin2019} & CC BY-NC 4.0 \\
social-media    & emotiondair          & \cite{Saravia2018}           & Other (research/education) \\[3pt]
\textbf{Intent}\\
banking         & banking77            & \cite{Casanueva2020} & CC BY 4.0 \\
social-media    & biasframes\_intent   & \cite{Sap2020}     & CC BY 4.0 \\[3pt]
\textbf{Sentiment}\\
apps            & appreviews            & \cite{Grano2017}                & Unknown \\
e-commerce      & amazonpolarity       & \cite{Zhang2015}   & Apache-2.0 \\
finance         & financialphrasebank  & \cite{Malo2014}    & CC BY-NC-SA 3.0 \\
local-business  & yelpreviews           & \cite{Zhang2015Yelp} & Terms of Use (non-commercial, 21 Feb 2020) \\
movies          & imdb                 & \cite{Maas2011} & IMDb Non-Commercial Dataset Terms \\
movies          & rottentomatoes        & \cite{Pang2005} & CC0 1.0 (Public Domain) \\[3pt]
\textbf{Topic}\\
assistant       & massive              & \cite{FitzGerald2022} & CC BY 4.0 \\
education       & trueteacher          & \cite{Gekhman2023} & CC BY-NC 4.0 \\
news            & agnews                & \cite{Zhang2015} & Non-commercial (AG Corpus terms) \\
politics        & capsotu               & \cite{Jones2023SOTU,LaurerZSC2023}            & CC BY-NC-SA 4.0 \\
politics        & manifesto            & \cite{Lehmann2024} & see Terms of Use \\
qa-forum        & yahootopics          & \cite{Zhang2015Yahoo} & Unknown \\
social-media    & biasframes\_offensive & \cite{Sap2020}    & CC BY 4.0 \\
social-media    & biasframes\_sex       & \cite{Sap2020}    & CC BY 4.0 \\
wikipedia       & wikitoxic\_insult     & \cite{Wulczyn2017} & CC0 1.0 \\
wikipedia       & wikitoxic\_obscene    & \cite{Wulczyn2017} & CC0 1.0 \\
wikipedia       & wikitoxic\_threat     & \cite{Wulczyn2017} & CC0 1.0 \\
wikipedia       & wikitoxic\_toxicaggregated & \cite{Wulczyn2017} & CC0 1.0 \\
\bottomrule
\end{tabular}
\caption{Sources and licenses for all BTZSC datasets.}
\label{tab:tbl_a01_dataset_sources}
\end{table}


\section{Appendix A.2: Model Overview}

\begin{table}[H]
\centering
\tablesizeAAAI               % 9 pt font, \tabcolsep=3 pt
\renewcommand{\arraystretch}{1.1}

\begin{tabular}{@{}lY p{1.35cm} l l r c@{}}
\toprule
Model & Yr & Arch. & Backbone & FT / train data$^\dag$ & \# P & Pool / dim \\
\midrule
\multicolumn{7}{l}{\textbf{Base encoders}}\\
bert-large-uncased          & 2018 & enc.     & BERT        & none                    & 340 M & – \\
deberta-v3-large            & 2021 & enc.     & DeBERTa v3  & none                    & 304 M & – \\
ModernBERT-large            & 2024 & enc.     & ModernBERT  & none                    & 395 M & – \\
\midrule
\multicolumn{7}{l}{\textbf{NLI cross-encoders}}\\
bart-large-mnli             & 2020 & enc–dec. & BART        & SNLI, MNLI             & 406 M & – \\
nli-roberta-base            & 2020 & enc.     & RoBERTa     & SNLI,  MNLI             & 125 M & – \\
bert-base-uncased-nli               & —    & enc.     & BERT        & MNLI, ANLI, WANLI, FEVERNLI, LINGNLI          & 110 M & – \\
bert-large-uncased-nli              & —    & enc.     & BERT        & same as above                    & 340 M & – \\
bert-large-uncased-nli-triplet      & —    & enc.     & BERT        & same as above                    & 340 M & – \\
deberta-v3-base-nli         & —    & enc.     & DeBERTa v3  & same as above                    & 184 M & – \\
deberta-v3-large-nli        & —    & enc.     & DeBERTa v3  & same as above                    & 304 M & – \\
deberta-v3-large-nli-triplet    & —    & enc.     & DeBERTa v3  & same as above                    & 304 M & – \\
modernbert-base-nli         & —    & enc.     & ModernBERT  & same as above                    & 149 M & – \\
modernbert-large-nli        & —    & enc.     & ModernBERT  & same as above                    & 395 M & – \\
modernbert-large-nli-triplet& —    & enc.     & ModernBERT  & same as above                    & 395 M & – \\
\midrule
\multicolumn{7}{l}{\textbf{Rerankers}}\\
ms-marco-MiniLM-L6-v2       & 2021 & enc.     & MiniLM      & MS MARCO                & 22.7 M & – \\
gte-reranker-modernbert-base& 2024 & enc.     & ModernBERT  & large multiling. pairs  & 149 M & – \\
bge-reranker-base           & 2023 & enc.     & XLM-RoB.\,B & large multiling. pairs  & 278 M & – \\
bge-reranker-large          & 2023 & enc.     & XLM-RoB.\,L & large multiling. pairs  & 560 M & – \\
Qwen3-Reranker-0.6B         & 2025 & dec.     & Qwen3       & synthetic yes/no ranking data   & 0.6 B & – \\
Qwen3-Reranker-8B           & 2025 & dec.     & Qwen3       & synthetic yes/no ranking data   & 8 B   & – \\
\midrule
\multicolumn{7}{l}{\textbf{Embedding models}}\\
all-MiniLM-L6-v2            & 2021 & enc.     & MiniLM      & 1 B paired sentences   & 22.7 M & mean / 384 \\
e5-base-v2                  & 2023 & enc.     & E5 (BERT)   & 270 M synthetic contrastive      & 110 M  & mean / 768 \\
e5-large-v2                 & 2023 & enc.     & E5 (BERT)   & same as above & 335 M  & mean / 1024 \\
e5-mistral-7b-instruct      & 2024 & dec.     & Mistral-7B  & synthetic multiling. contrastive     & 7 B    & last / 4096 \\
bge-base-en-v1.5            & 2023 & enc.     & BGE (RoB.)  & 1.5 B pair data, contrastive     & 137 M  & CLS / 768 \\
bge-large-en-v1.5           & 2023 & enc.     & BGE (RoB.)  & same as above                    & 434 M  & CLS / 1024 \\
gte-base-en-v1.5            & 2024 & enc.+    & GTE         & MLM + contrastive pre-train    & 137 M  & CLS / 768 \\
gte-large-en-v1.5           & 2024 & enc.+    & GTE         & same as above                    & 434 M  & CLS / 1024 \\
gte-modernbert-base         & 2024 & enc.     & ModernBERT  & same as above            & 149 M  & CLS / 768 \\
Qwen3-Embedding-0.6B            & 2025 & dec.     & Qwen3       & synthetic multiling. contrastive     & 0.6 B  & last / 1024 \\
Qwen3-Embedding-8B              & 2025 & dec.     & Qwen3       & synthetic multiling. contrastive     & 8 B    & last / 4096 \\
\bottomrule
\end{tabular}
\caption{Architectural and training overview of the 31 models evaluated. Columns list publication year (Yr), encoder/decoder architecture (Arch.), backbone, principal fine-tuning or pre-training data, parameter count (\#P), and pooling strategy with embedding dimensionality.}
\label{tab:tbl_a02_models_overview}
\end{table}

\section{Appendix A.3: Experimental Setup}

\subsection{Cross-Encoder Architectures for NLI}
Let a paired input sequence (premise $\|\,$hypothesis) be tokenised as
$\mathbf{x}\!=\!(x_0\!=\![\text{CLS}],x_1,\dots, [\text{SEP}], \dots, x_{S-1})$ and encoded by a
pre-trained Transformer backbone
$f_\theta:\mathbb{N}^S\!\to\!\mathbb{R}^{S\times E}$ with hidden size $E$:
\[
H = f_\theta(\mathbf{x}) \in\mathbb{R}^{S\times E},\qquad
h = H_{0}\in\mathbb{R}^{E}\quad(\text{CLS row}).
\]

\vspace{2pt}
\noindent A two-layer classification head with dropout $p=0.1$ transforms $h$:
\begin{align}
\tilde h &= \mathrm{Dropout}_{0.1}(h),\\
u &= \mathrm{GELU}\bigl(W_1\tilde h + b_1\bigr), & W_1\!\in\!\mathbb{R}^{E\times E},\; b_1\!\in\!\mathbb{R}^{E},\\
z &= \mathrm{LayerNorm}(u),\\
\ell &= W_2 z + b_2, & W_2\!\in\!\mathbb{R}^{C\times E},\; b_2\!\in\!\mathbb{R}^{C},
\end{align}
where $C$ is the number of label logits returned by the head. This setup mimics the standard classification head of ModernBERT~\cite{ModernBERT2024}.

\paragraph{Binary variant.}
Here $C=1$ and $\ell\in\mathbb{R}$ is an \emph{entailment logit}.
The probability of entailment is $\sigma(\ell)=\bigl(1+e^{-\ell}\bigr)^{-1}$ and the model is
optimised with the binary cross-entropy loss:
\[
\mathcal{L}_{\text{BCE}}(y,\ell)=
-y\log\sigma(\ell)-(1-y)\log\bigl(1-\sigma(\ell)\bigr),\quad y\in\{0,1\}.
\]

\paragraph{Three-way variant (\textit{triplet}).}
Now $C=3$ with logits $\ell=(\ell_{\text{ent}},\ell_{\text{neut}},\ell_{\text{contra}})$.
During \emph{training} the standard multi-class cross-entropy is used:
\[
\mathcal{L}_{\text{CE}}(y,\ell)
= -\log\frac{\exp(\ell_y)}{\sum_{c\in\mathcal{Y}}\exp(\ell_{c})},
\qquad y\in\mathcal{Y}=\{\text{ent},\text{neut},\text{contra}\}.
\]

\noindent During \emph{evaluation} the scalar entailment score is
\[
s \;=\; \ell_{\text{ent}} - \log\!\bigl(e^{\ell_{\text{neut}}}+e^{\ell_{\text{contra}}}\bigr),
\]
which is the log-odds of the \textsc{entailment} class versus the union of the other two classes.
The corresponding probability is $\sigma(s)$.

\medskip
\noindent\textbf{Dimensions.}  
\begin{itemize}
    \item $B$: batch size (omitted above for clarity),  
    \item $S$: sequence length,  
    \item $E$: hidden size of the backbone,
    \item $C\in\{1,3\}$: number of logits.
\end{itemize}

\medskip
\subsection{Training Procedure}

\paragraph{Validation signal.}
Early stopping is triggered by the dev-set loss computed on an \emph{equal-sized, balanced} union
\[
\mathcal{D}_{\mathrm{dev}}
=\text{MNLI}_{\mathrm{m}}
\cup\text{MNLI}_{\mathrm{mm}}
\cup\text{ANLI}_{r1}
\cup\text{ANLI}_{r2}
\cup\text{ANLI}_{r3}
\cup\text{WANLI}
\cup\text{FEVERNLI}
\cup\text{LINGNLI}.
\]
At every evaluation step the loss is measured, and training stops when this loss fails to decrease for
$10$ consecutive evaluations or after $3$ epochs, whichever comes first. Evaluation is performed every $1\%$ of total steps.

\paragraph{Optimiser and schedules.}
Fine-tuning uses the PyTorch \texttt{AdamW} optimiser with default
settings $(\beta_1\!=\!0.9,\;\beta_2\!=\!0.999,\;\varepsilon\!=\!10^{-8},\; \text{weight-decay}\!=\!0.01)$.
The learning rate employs a \emph{linear warm-up} for the first $10\%$ of
steps followed by \emph{cosine decay}:
\[
\eta_t=
\begin{cases}
\displaystyle \eta_0 \frac{t}{0.1T}, & 0\le t < 0.1T,\\[6pt]
\displaystyle \tfrac12\eta_0\!\left(1+\cos\!\frac{\pi(t-0.1T)}{0.9T}\right),
& 0.1T\le t\le T,
\end{cases}
\]
where the peak rate $\eta_0$ is set separately for the backbone $(\eta_{\text{enc}})$ and
the classification head $(\eta_{\text{head}})$.

\begin{itemize}
  \item \textbf{Large backbones:}
        $\eta_{\text{enc}}=8\times10^{-6},\;
         \eta_{\text{head}}=4\times10^{-5}$.
  \item \textbf{Base backbones:}
        $\eta_{\text{enc}}=2\times10^{-5},\;
        \eta_{\text{head}}=1\times10^{-4}$.
\end{itemize}

All models train for at most $N_{\text{ep}}=3$ epochs with mini-batch size $B=32$ and no layer
freezing.

\medskip
\subsection{Qwen3 Reranker}

\subsubsection{Prompt Template}

For every \textit{query}--\textit{document} pair we build a single decoder-only
prompt of the form
\[
P = \texttt{prefix}
      \;+\;
      \langle\text{Instruct}\rangle{:}\;I
      \;+\;
      \langle\text{Query}\rangle{:}\;q
      \;+\;
      \langle\text{Document}\rangle{:}\;d
      \;+\;
      \texttt{suffix}.
\]

\paragraph{Fixed strings.}
\begin{itemize}
\item \textbf{Prefix}
\begin{verbatim}
<|im_start|>system
Judge whether the Document meets the requirements based on the Query 
and the Instruct provided. Note that the answer can only be "yes" or "no".
<|im_end|>
<|im_start|>user
\end{verbatim}

\item \textbf{Suffix}
\begin{verbatim}
<|im_end|>
<|im_start|>assistant
<think>

</think>

\end{verbatim}
\end{itemize}

\paragraph{Instructions $I$.}
\begin{itemize}
  \item \textbf{NLI retrieval}
        \begin{verbatim}
Given a piece of text, retrieve the passage that entails the text the best
        \end{verbatim}
  \item \textbf{Label retrieval}
        \begin{verbatim}
Given a piece of text, retrieve relevant label descriptions that best match the text
        \end{verbatim}
\end{itemize}

\medskip
\subsubsection{Binary decision via ``yes/no'' tokens}
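Let $\tau_{\text{yes}}$ and $\tau_{\text{no}}$ be the token IDs
that realise the strings ``\texttt{yes}'' and ``\texttt{no}''.
Let $L\in\mathbb{R}^{S\times V}$ be the logits produced for the prompt, with $V$ the vocabulary size, and denote the final-step logit vector by
$v=L_{S-1}\in\mathbb{R}^{V}$.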
We extract
\[
v_{\tau_{\text{yes}}}, v_{\tau_{\text{no}}}
\]
and compute the entailment probability as
\[
p_{\text{yes}}
   = \frac{e^{v_{\tau_{\text{yes}}}}}
          {e^{v_{\tau_{\text{yes}}}}+e^{v_{\tau_{\text{no}}}}}.
\]



% ----------- Supplementary Content Ends Here -----------

% References and End of Paper
% These lines must be placed at the end of your paper
\bibliography{aaai2026}

\end{document} 