%\documentclass{uai2025} % for initial submission
\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{bm}
\usepackage{colortbl}
\usepackage{array}
\usepackage{multicol}
\usepackage{pifont} 
\usepackage{cancel} 
\usepackage{tcolorbox}
\usepackage{multirow}
\usepackage{xcolor}
\usepackage[ruled,vlined]{algorithm2e} % Add this for \KwIn and \KwOut

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Enhancing Uncertainty Quantification in \\ Large Language Models through Semantic Graph Density}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<lizhaoye23@nudt.edu.cn>}{Zhaoye~Li}{}}
\author[1]{Siyuan~Shen}
\author[1]{Wenjing~Yang}
\author[1]{Ruochun~Jin}
\author[1]{Huan~Chen}
\author[1]{Ligong~Cao}
\author[1]{Jing~Ren\thanks{Corresponding author.}}
% Add affiliations after the authors
\affil[1]{%
    College of Computer Science and Technology\\
    National University of Defense Technology\\
    Changsha, China
}

\begin{document}

\maketitle

\begin{abstract}
Large Language Models (LLMs) excel in language understanding but are susceptible to ``confabulation,'' where they generate arbitrary, factually incorrect responses to uncertain questions. Detecting confabulation in question answering often relies on Uncertainty Quantification (UQ), which measures semantic entropy or consistency among sampled answers. While several methods have been proposed for UQ in LLMs, they suffer from key limitations, such as overlooking fine-grained semantic relationships among answers and neglecting answer probabilities. To address these issues, we propose Semantic Graph Density (SGD). SGD quantifies semantic consistency by evaluating the density of a semantic graph that captures fine-grained semantic relationships among answers. Additionally, it integrates answer probabilities to adjust the contribution of each edge to the overall uncertainty score. We theoretically prove that SGD generalizes the previous state-of-the-art method, Deg, and empirically demonstrate its superior performance across four LLMs and four free-form question-answering datasets. In particular, in experiments with Llama-3.1-8B, SGD outperformed the best baseline by 1.52\% in AUROC on the CoQA dataset and by 1.22\% in AUARC on the TriviaQA dataset.
\end{abstract}

\section{Introduction}

Large language models (LLMs) have shown impressive performance in language understanding and text generation across various domains \citep{zhao2023survey, chang2024survey, wei2022emergent, chi2024unveiling}. However, these models often encounter a critical issue known as ``hallucination,'' where the generated content is either nonsensical or unfaithful to the provided source \citep{ji2023survey, maynez2020faithfulness, filippova2020controlled}.
Hallucinations manifest in various forms and exhibit different characteristics across different tasks \citep{huang2023survey,farquhar2024detecting}.
In this paper, we focus exclusively on one type of hallucination---confabulation---and limit the scope to short-form question answering (QA).
Confabulation occurs when LLMs generate arbitrary, factually incorrect responses to uncertain questions \citep{farquhar2024detecting}, often arising when a query exceeds the model's knowledge boundaries \citep{huang2023survey}. For example, when asked ``Which programming language has been used for implementing GWAR?'', LLMs may answer ``C++'' or ``Perl'' inconsistently, even when the same question is posed.

Detecting confabulation in LLM-generated answers can be approached through Uncertainty Quantification (UQ), which assesses the likelihood that an LLM will generate a confabulated response to a given question \citep{farquhar2024detecting}.
Existing UQ methods include entropy-based \citep{farquhar2024detecting,nikitin2024kernel} and graph-based approaches \citep{lin2024generate,da2024llm}. 
Both of these approaches share the commonality of first sampling multiple possible answers and then evaluating their entropy or consistency.
The most well-known entropy-based method is Semantic Entropy (SE) \citep{farquhar2024detecting}, which makes a notable contribution by using semantic equivalence clustering to mitigate lexical uncertainty. However, SE only considers whether two responses are semantically equivalent, overlooking finer semantic similarities \citep{nikitin2024kernel}. Existing graph-based methods achieve effective modeling of semantic consistency through graphs, but they neglect answer probabilities. This introduces bias, as answers with higher probabilities are generally more representative and reliable for uncertainty quantification compared to answers with lower probabilities\footnote{In Appendix \ref{sec:bias}, we present a detailed example illustrating the consequences of current graph-based methods overlooking answer probabilities.} \citep{geng2024survey}. 


To address these issues, we propose a novel UQ method, Semantic Graph Density (SGD). For the first issue, SGD measures semantic consistency by evaluating the density of a semantic graph that reflects fine-grained semantic relationships. A denser graph indicates higher consistency and lower uncertainty. For the second issue, we adjust the contribution of each edge in the semantic graph to the density based on the probability of the answers it connects. The higher the probabilities of the two answers forming an edge, the greater their contribution to the final uncertainty score. We theoretically demonstrate that, when certain hyperparameters of SGD are fixed, it generalizes the previous state-of-the-art graph-based method, Deg \citep{lin2024generate}.


We evaluate the performance of SGD across various question-answering domains, including conversational settings (CoQA \citep{reddy2019coqa}), trivia knowledge (TriviaQA \citep{joshi2017triviaqa}), biomedical science (BioASQ \citep{tsatsaronis2015bioasq}), and natural questions (NQ \citep{kwiatkowski2019natural}) derived from real-world Google Search queries. Our evaluation focuses on two key aspects:
(1) its capability to distinguish between correct and fabricated responses, and (2) the improvement in question-answering accuracy when the model declines to answer high-uncertainty questions. Experimental results across four LLMs demonstrate that SGD outperforms baseline methods, achieving enhanced efficiency and robustness in UQ.




\section{Related Work}
\paragraph{Hallucinations and Confabulations in LLMs} 
Hallucinations in LLMs manifest in various forms, including factual fabrication \citep{huang2023survey,farquhar2024detecting}, instruction inconsistency \citep{huang2023survey}, and reasoning failure \citep{berglund2024reversal,zheng2023does}. This paper focuses on one specific type: confabulation, also known as fabrication, particularly in short-form QA scenarios. Fabrication in QA refers to the phenomenon where the LLM-generated answer is arbitrary and incorrect \citep{farquhar2024detecting}. For example, when asked, ``Which programming language has been used for implementing GWAR?'' the model may confabulate by answering ``C++'' at one time and ``Perl'' at another, despite the same input question. A more intuitive and concrete example is shown in Table~\ref{tab:example_combined}. One possible cause of this phenomenon is the model's tendency to generate an answer even when the query exceeds its knowledge boundaries \citep{huang2023survey}. This behavior aligns with the training objective of maximizing rewards for providing answers, resulting in an ``overeager'' tendency to respond instead of abstaining \citep{farquhar2024detecting}.
Detecting whether an LLM-generated answer is accurate or fabricated is an important research problem. Approaches include leveraging external knowledge sources, where LLM outputs are cross-referenced with established databases \citep{sui2024enhancing}, or using an external LLM as a judge \citep{cohen2023lm}. Another method involves supervised learning, training classifiers on the LLM's internal states to distinguish between accurate and fabricated content \citep{azaria2023internal}. A different approach is uncertainty quantification, as discussed in the following paragraph.

\paragraph{Uncertainty Quantification in LLMs}
Uncertainty Quantification (UQ) is an effective method for assessing whether LLMs generate hallucinations \citep{farquhar2024detecting,lin2024generate}.
Notably, the term ``uncertainty'' in this paper specifically refers to the degree of dispersion in the LLM's predicted distribution, rather than response confidence. The former depends exclusively on the input prompt, while the latter is influenced by both the input prompt and the output response.
A higher uncertainty score indicates a greater likelihood of hallucinations \citep{farquhar2024detecting}. 
Specifically for QA, it measures how likely an LLM is to produce a confabulated answer for a given question \citep{farquhar2024detecting}. 
Recently, numerous UQ methods have emerged, differing in their approaches to uncertainty modeling and the types of information utilized, such as output text and token probabilities. UQ methods are classified into white-box and black-box categories based on their access to the LLM’s internal workings and numerical outputs. Black-box methods only access the LLM’s output text, whereas white-box methods have full access \citep{lin2024generate}.
Semantic Entropy \citep{kuhn2023semantic, farquhar2024detecting} serves as a gold standard for quantifying uncertainty in LLMs and represents a prominent white-box approach. Early methods combined lexical and semantic uncertainty, overlooking the fact that different lexical expressions can convey the same meaning. Semantic Entropy, by focusing exclusively on semantics, addresses this limitation and marks a significant advancement in UQ. 


Representative black-box methods include Deg, Ecc, and EigV \citep{lin2024generate}. These methods construct a graph where edge weights reflect semantic similarity. EigV estimates the number of connected components in the graph by analyzing the eigenvalues of the graph Laplacian. In contrast, Deg and Ecc measure output diversity using the graph’s degree matrix and the spectral embedding of its nodes, respectively. Although SGD, as well as Deg, Ecc, and EigV, all construct semantic graphs, the key difference lies in our approach's use of graph density in the semantic graph as a consistency proxy. Additionally, we address the bias issues inherent in these methods by incorporating token probability.



Beyond these key methods, additional approaches include Discrete Semantic Entropy \citep{farquhar2024detecting}, a black-box approximation of Semantic Entropy; Kernel Language Entropy \citep{nikitin2024kernel}, which extends Semantic Entropy by incorporating fine-grained semantic relations beyond equivalence; D-UE \citep{da2024llm}, which addresses limitations in using average entailment probabilities from bidirectional Natural Language Inference (NLI) models to evaluate response similarity; and SEU \citep{grewal2024seu}, which utilizes transformer-based sentence embeddings to provide a smoother and more robust estimation of semantic similarities in UQ.


There is an increasing focus on quantifying uncertainty in long-form answers \citep{zhang2024luq, jiang2024graph, fang2024triples}. However, this paper specifically focuses on short-form answers, which are defined as single-proposition responses to a question \citep{farquhar2024detecting}. These answers are typically concise, comprising only a few words or, at most, a single sentence, in contrast to more extensive paragraphs.

\paragraph{Complementary methods}


\cite{kuhn2023semantic} and \cite{aichberger2024semantically} emphasized that UQ benefits from semantically diverse yet likely output sequences. \cite{aichberger2024semantically} experimentally demonstrated that Diverse Beam Search \citep{vijayakumar2018diverse} and the Semantically Diverse Language Generation (SDLG) method improve the performance of sample-based UQ methods. These techniques can be incorporated into SGD to further enhance its effectiveness.




\section{Semantic Graph Density}\label{sec:ssgd}
In question answering (QA), given an input prompt \(x\) (i.e., a question with or without context) and an LLM, the goal is to evaluate how likely the LLM is to generate a fabricated answer for the given prompt. It is important to emphasize that the objective is to derive a \textit{relative} score indicating the potential for confabulated output, rather than calculating the exact probability of the model's correctness (which is related to \textit{model calibration} \citep{zhu2023calibration, guo2017calibration}, an orthogonal research topic). A higher uncertainty score corresponds to a greater potential for confabulation.

The first two steps of Semantic Graph Density (SGD) involve sampling multiple possible answers (Step 1) and measuring the fine-grained semantic relationships among them (Step 2). In Step 3, these relationships are used to construct a semantic graph. Based on this graph, we compute the graph density and adjust each edge’s contribution according to the probabilities of the connected answers. Edges linking answers with higher probabilities contribute more to the final uncertainty score.

\paragraph{Step 1. Sample $N$ possible answers.} 

Following \cite{farquhar2024detecting} and \cite{nikitin2024kernel}, for an input \( x \), we sample \( N \) possible answers \( \{ y^{(i)} \}_{i=1}^{N} \), where each answer is represented as a sequence of tokens \( y^{(i)} = [y^{(i)}_1, y^{(i)}_2, \dots, y^{(i)}_{L_i}] \), with \( y^{(i)}_j \) denoting the \( j \)-th output token of \( y^{(i)} \). We then compute the corresponding length-normalized probability\footnote{In this paper, all probabilities refer to length-normalized probabilities \citep{murray2018correcting}, a commonly used method to correct for length bias in sequence probabilities.} \citep{murray2018correcting} for each answer: \( \{ P(y^{(i)}|x) \}_{i=1}^{N} \), with \( P(y^{(i)}|x) = \prod_{j=1}^{L_i} P(y_j^{(i)}|y_{<j}^{(i)},x)^{1/L_i} \), where \( y^{(i)}_{<j} \) represents the sequence of tokens preceding \( y^{(i)}_j \).




\paragraph{Step 2. Compute pairwise semantic similarities among $N$ possible answers.} 

The output logits of Natural Language Inference (NLI) models have been demonstrated to effectively measure the semantic similarity between two responses within a given textual context \citep{lin2024generate}. We follow the best practice of \citet{lin2024generate} to measure the semantic similarity between any two sampled answers $y^{(i)}$ and $y^{(j)}$ within the context $x$. The NLI model we employ is DeBERTa-Large-MNLI\footnote{\url{https://huggingface.co/microsoft/deberta-large-mnli}} \citep{he2021deberta}.

We concatenate \( x \) with \( y^{(i)} \) and \( y^{(j)} \) to yield \( x \oplus y^{(i)} \) and \( x \oplus y^{(j)} \) (see Appendix \ref{prompt_nli} for the input format used to generate the output logits of the NLI model) and feed them into the NLI model twice. In the first pass, \( x \oplus y^{(i)} \) is regarded as the premise and \( x \oplus y^{(j)} \) as the hypothesis. Conversely, in the second pass, \( x \oplus y^{(j)} \) is regarded as the premise, and \( x \oplus y^{(i)} \) as the hypothesis. We apply the softmax function to the predicted logits from the NLI model and take the average of the resulting entailment probabilities from the two passes as the similarity score:
\begin{equation}
\begin{split}
    s_{ij} = \frac{1}{2} \big( & \hat{p}_{entail}(x \oplus y^{(i)}, x \oplus y^{(j)}) + \\
                                 & \hat{p}_{entail}(x \oplus y^{(j)}, x \oplus y^{(i)}) \big).
\end{split}
\end{equation}


\paragraph{Step 3. Construct a semantic graph and compute the semantic graph density.}
We first construct a semantic graph to capture fine-grained semantic relationships between answers and then adapt graph density to quantify semantic consistency. Given an undirected simple graph $G=(V,E)$, graph density is defined as the ratio of the number of edges $|E|$ to the maximum possible number of edges \citep{erdds1959random}. \begin{equation}
    D = \frac{|E|}{ \binom{|V|}{2} } = \frac{ |E| }{|V|(|V| - 1)/2} 
\end{equation}
Each answer is treated as a node in the graph. The adjacency matrix is represented as \(W = [w_{ij}]_{N \times N}\). In practice, an edge is established between nodes \(i\) and \(j\) if their similarity score \(s_{ij}\) exceeds a threshold \(\delta\). This relationship is formally defined as follows:
\begin{equation}
    w_{ij} := \mathbbm{1}_{s_{ij} > \delta}.
\end{equation}Under this design, \textit{a denser graph signifies greater semantic consistency and reduced uncertainty}. We define SGD as follows: \begin{equation} \label{eq_sgd_delta}
    \operatorname{SGD}_{\delta}(x)  = \frac{-|E|}{|V|(|V| - 1)/2} =-\sum_{\substack{i,j \in [N],\\ i < j}}  \frac{   \mathbbm{1}_{s_{ij} > \delta}}{N(N-1) / 2}.
\end{equation}Another approach is to directly define the weight \( w_{ij} \) of each edge as the similarity \( s_{ij} \), i.e., \( w_{ij} := s_{ij} \). However, graph density traditionally applies only to binary graphs. Since the weights defined by the similarity are non-negative and bounded within the range $[0, 1]$, we extend the definition of graph density by calculating the total sum of all edge weights divided by the total sum of the maximum possible weights of all edges. Under this design, SGD is defined as:
\begin{equation} \label{eq_sgd_s}
\begin{split}
        \operatorname{SGD}_{s}(x)  &= - \sum_{\substack{i,j \in [N] , i < j}} \frac{2w_{ij}}{|V|(|V| - 1) \cdot \sup_{\substack{m,n \in [N] \\ m < n}} w_{mn}} \\
        &= - \sum_{\substack{i,j \in [N] , i < j}} \frac{2s_{ij}}{N(N-1) \cdot 1} \\
        &= - \sum_{\substack{i,j \in [N] , i < j}} \frac{s_{ij}}{N(N-1)/2}.
\end{split}
\end{equation} Here, \( \sup_{\substack{m,n , m < n}} w_{mn} \) represents the maximum weight that can be assigned to any edge, which corresponds to the maximum pairwise similarity that can be assigned between responses \( \{ y^{(i)}\}_{i=1}^{N} \). Since the similarities are computed from the output logits of the NLI model, which are transformed via softmax and bounded within the range of \( [0, 1] \), we have \( \sup_{\substack{m,n , m < n}} w_{mn} = \sup_{\substack{m,n , m < n}} s_{mn} = 1 \).




\paragraph{Probability Incorporation}
In Equations \ref{eq_sgd_delta} and \ref{eq_sgd_s}, the numerator ($s_{ij}$ or $\mathbbm{1}_{s_{ij}>\delta}$) represents the weights of the edges. Each edge contributes equally to the uncertainty score at \( 1 / (N(N-1)/2) \), as determined by the denominator \(  N(N-1)/2 \). However, the sampled answers are not necessarily equally probable, meaning the edges formed by pairs of answers may not have equal probabilities. The length-normalized probability of an answer reflects the LLM’s confidence in that answer \citep{geng2024survey}. If an edge \( \{y^{(i)}, y^{(j)}\} \) has a higher probability compared to other edges, this indicates that the LLM has higher confidence in producing \( \{y^{(i)}, y^{(j)}\} \). As such, \( \{y^{(i)}, y^{(j)}\} \) should carry more weight in UQ, contributing more significantly to the final uncertainty score.



We define the contribution of each edge as $\mu(i,j)$, with $\sum_{\substack{i,j \in [N], i < j}} \mu(i,j)=1$. SGD is then calculated as: \begin{equation}
\begin{split}
   \operatorname{SGD}_{\delta+P}(x) &= - \sum_{\substack{i,j \in [N] , i < j}} \mathbbm{1}_{s_{ij}>\delta} \cdot {\mu (i,j)}, \\
   \operatorname{SGD}_{s+P}(x) &= - \sum_{\substack{i,j \in [N] , i < j}} s_{ij} \cdot {\mu (i,j)}
   \end{split}
\end{equation} We use the notation ``$+P$'' to denote probability incorporation. Since the answers are sampled independently of one another, the categorical distribution of the relative occurrence of $\{y^{(i)},y^{(j)}\}$ is estimated as: \begin{equation}\label{prob}
   P( y^{(i)},y^{(j)}|x) = \frac{ P(y^{(i)} |x) \cdot P(y^{(j)} |x) }{ \sum_{\substack{m,n \in [N] \\ m < n}} P(y^{(m)} |x) \cdot P(y^{(n)} |x)  }.
\end{equation}
We define $\mu(i, j)$ as a convex combination of the uniform weight $1 / ( N(N - 1) / 2) $ and the pair probability in Equation \ref{prob}, controlled by a hyperparameter $\theta \in [0, 1]$:
\begin{equation}
\begin{split}
    \mu(i,j) &= \frac{\theta}{N(N-1)/2} + (1-\theta) \cdot P( y^{(i)}, y^{(j)} | x), \\
    &\quad i,j \in [N], i < j.
\end{split}
\end{equation}

Algorithm \ref{alg_sgd} summarizes the SGD procedure.


\paragraph{Generalization towards Deg \citep{lin2024generate}}\label{generalization}

Deg constructs a semantic graph based on the pairwise similarity of answers, where the edge weights are defined as \( w_{ij} := s_{ij}, \, i,j \in [N] \). The degree matrix is defined as \( D = \operatorname{diag}(d_1, d_2, \dots, d_N) \), where \( d_i = \sum_{j \in [N]} s_{ij} \) represents the degree of node \( i \). Next, we explain how to derive an approximation of Deg \citep{lin2024generate} from \( \operatorname{SGD}_{s} \) (equivalent to $\operatorname{SGD}_{s+P}$ when $\theta=1$).
\begin{equation}
\begin{split}
\operatorname{Deg}(x) &= \operatorname{tr}(N \mathbb{I} -D)/N^{2} \\
&=\left( N\operatorname{tr}( \mathbb{I} ) - \operatorname{tr}(D) \right) / {N^2} \\
&= \frac{1}{N^2} \left(  {N^2} - \sum_{i,j \in [N]} s_{ij}   \right)  \\
&= 1- \frac{1}{N^2} \left( \sum_{\substack{i,j\in [N] \\i=j } }s_{ij} + 2\sum_{\substack{i,j \in [N] \\ i<j} }s_{ij} \right) 
\end{split}
\end{equation} In Deg, \( s_{ij} \) (\(i, j \in [N], i = j\)) is calculated using the output logits of the NLI model. When \( y^{(i)} \) is identical to \( y^{(j)} \), the softmax probability for entailment is nearly equal to 1 (e.g., 0.998), though it can never be exactly 1. Thus, \( s_{ij} \) (\(i, j \in [N], i = j\)) can be approximated as 1. By further combining the following equation,\begin{equation}
      \begin{split}
          2\sum_{\substack{i,j \in [N] \\ i<j} }s_{ij} &=N(N-1) \sum_{\substack{i,j \in [N] \\ i<j} } \frac{s_{ij}}{N(N-1)/2} \\
          &=-N(N-1) \operatorname{SGD}_s(x),
      \end{split}
\end{equation} we can derive the following: \begin{equation}
    \begin{split}
        \operatorname{Deg}(x) &\approx 1-\frac{1}{N}+\frac{N-1}{N}\operatorname{SGD}_s(x) \\
        &= \frac{N-1}{N} (1+\operatorname{SGD}_s(x)).
    \end{split}
\end{equation} It is evident that the approximation of $\operatorname{Deg}(x)$ is an affine function of $\operatorname{SGD}_s(x)$ with positive slope $(N-1)/N$, and such a transformation preserves the ranking of the uncertainty scores and therefore does not alter their relative discriminative capability (as previously mentioned, we require a relative score to determine whether a prompt will generate a correct or confabulated response). Ablation experiments (refer to Section~\ref{sec:ablation}) demonstrate that $\operatorname{SGD}_s$ and Deg achieve nearly identical performance.


\begin{algorithm}
\caption{Semantic Graph Density}
\label{alg_sgd}
\KwIn{An input prompt \( x \) (i.e. a question with/without context), an LLM, the number of possible responses \( N \), hyperparameters $\delta$ and $\theta$. }
\KwOut{Semantic Graph Density.}

\textbf{Step 1: Sample $N$ possible answers.} \\
Sample $N$ possible answers \(\{ y^{(i)}\}_{i=1}^{N}\) based on the input prompt \( x \), and compute the length-normalized probability for each answer, resulting in \(\{ P(y^{(i)}|x) \}_{i=1}^{N}\). 

\textbf{Step 2: Compute pairwise semantic similarities among $N$ possible answers.} \\
Compute the pairwise semantic similarities for any pair of $N$ possible answers, resulting in $s_{ij}$, where $s_{ij} =\big(  \hat{p}_{entail}(x \oplus y^{(i)}, x \oplus y^{(j)}) + \hat{p}_{entail}(x \oplus y^{(j)}, x \oplus y^{(i)}) \big) / 2$.

\textbf{Step 3: Compute the semantic graph density.} \\
$\operatorname{SGD}_{\delta}(x) =- \sum_{\substack{i,j \in [N] , i < j}}  \frac{  \mathbbm{1}_{s_{ij} > \delta}}{N(N-1)/2} $, \\
$\operatorname{SGD}_{s}(x) =- \sum_{\substack{i,j \in [N] , i < j}}  \frac{  s_{ij}}{N(N-1)/2} $,\\
$\operatorname{SGD}_{\delta+P}(x) =- \sum_{\substack{i,j \in [N] , i < j}}   \mathbbm{1}_{s_{ij} > \delta} \cdot \mu (i,j) $,\\
$\operatorname{SGD}_{s+P}(x) =- \sum_{\substack{i,j \in [N] , i < j}}  s_{ij} \cdot \mu (i,j) $,\\
where $\mu(i,j) = \frac{\theta}{N(N-1)/2} + (1-\theta) \cdot P( y^{(i)}, y^{(j)} | x)$, \\
$P( y^{(i)},y^{(j)}|x) = \frac{ P(y^{(i)} |x) \cdot P(y^{(j)} |x) }{ \sum_{\substack{m,n \in [N] , m < n}} P(y^{(m)} |x) \cdot P(y^{(n)} |x)  }$, $i,j \in [N], i<j$.
\end{algorithm}
\paragraph{Computational Cost} 
We focus solely on the resource consumption arising from language model inference, as no other operation is more computationally expensive. Reviewing the entire process of calculating \( \operatorname{SGD} \), model inferences are only required in the first and second steps. The first step involves sampling \( N \) possible answers, requiring \( N \) LLM inferences, which can be executed in parallel. 
The second step demands \( 2 \cdot \binom{N}{2} \) inferences by the NLI model to calculate the semantic similarity \( s_{ij} \) for each pair \( \{y^{(i)}, y^{(j)}\} \) (\(i < j\)). Computing \( s_{ij} \) requires two inferences to obtain \( \hat{p}_{entail}(x \oplus y^{(i)}, x \oplus y^{(j)}) \) and \( \hat{p}_{entail}(x \oplus y^{(j)}, x \oplus y^{(i)}) \). These \( N(N-1) \) inferences can also be performed in parallel.
The NLI model used in this study is DeBERTa-Large-MNLI, with approximately 150 million parameters. In contrast to LLMs, which process over a billion parameters to generate a single token, the computational cost of NLI models is relatively minimal.

\section{Experiments}
\subsection{Experimental Setups}
\paragraph{Datasets and LLMs}
We consider four generative question-answering tasks for evaluation, including the open-book conversational QA dataset CoQA \citep{reddy2019coqa}, the closed-book QA dataset TriviaQA \citep{joshi2017triviaqa}, the biomedical QA dataset BioASQ \citep{tsatsaronis2015bioasq}, and Natural Questions \citep{kwiatkowski2019natural}. We use the development split of CoQA, which contains 7,983 questions, the deduplicated validation split of TriviaQA (rc.noncontext subset) with 9,960 questions, the validation split of NQ with 3,610 questions, and the training split of BioASQ with 2,814 questions\footnote{\url{http://participants-area.bioasq.org/Tasks/10b/trainingDataset/}}. We utilize four popular off-the-shelf instruction-tuned LLMs for evaluation, with model sizes ranging from 1B to 12B parameters. These models include Llama-3.2-1B\footnote{\url{https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct}}, Llama-3.1-8B\footnote{\url{https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct}}, Mistral-7B-v0.3\footnote{\url{https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3}} and Mistral-Nemo-12B\footnote{\url{https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407}}.

\paragraph{Evaluation Metric}
We use AUROC (Area Under the Receiver Operating Characteristic Curve) to evaluate how well the uncertainty scores distinguish between correct and incorrect answers. AUROC indicates the probability that a randomly selected correct generation has a lower uncertainty score than a randomly selected incorrect generation. An AUROC of 0.5 indicates that the assigned uncertainty score is no better than random guessing, meaning it cannot effectively differentiate between correct and incorrect answers. An AUROC of 1 signifies perfect discrimination, where all correct answers are assigned lower uncertainty scores than all incorrect answers.

Additionally, QA accuracy can be improved by rejecting questions with high uncertainty. This improvement is quantified using the AUARC (Area Under the Accuracy-Rejection Curve) \citep{nadeem2009accuracy}, which measures the area under the accuracy-rejection curve at various thresholds. The rejection accuracy at a given threshold is determined by the accuracy of the remaining answers after rejecting those with uncertainty scores above this threshold.

\paragraph{Answer Generation}
For each question, we generated 10 answers using nucleus sampling (\(P = 0.9\)) and top-\(K\) sampling (\(K = 50\)) at a temperature of \(T = 1\), following \cite{farquhar2024detecting} and \cite{nikitin2024kernel}. To assess model accuracy, we generated a single answer at \(T = 0.1\) and prompted GPT-4-0613 to verify whether this response aligned with any ground truths provided by the datasets, as adopted by \cite{farquhar2024detecting}. The prompts used for generating the answers and conducting correctness checks are detailed in Appendix \ref{sec:adx_datasets} and Appendix \ref{prompt_correct}, respectively.



\begin{table*}[htb]
    \centering
    \caption{Performance (AUROC) comparison of various uncertainty metrics. All results are presented as percentages. For each model--dataset combination, the best result among the baseline methods and the best result among our proposed methods are each \underline{underlined}, while the overall best result across all methods is highlighted in \textbf{bold}.
    }
    \label{tab:main_ret_auroc}
    \resizebox{\textwidth}{!}{
    
    \begin{tabular}{cccccccccccc}
        \toprule % from booktabs package

        \multirow{2}{*}{\textbf{Datasets}} & \multicolumn{3}{c}{\textbf{Entropy-based Methods}} & \multicolumn{4}{c}{\textbf{Graph-based Methods}} & \multicolumn{2}{c}{\textbf{Consistency-based Methods}} & \multicolumn{2}{c}{\textbf{Ours}} \\

        \cmidrule(lr){2-4} \cmidrule(lr){5-8} \cmidrule(lr){9-10} \cmidrule(lr){11-12}
        
          &  SE &  DSE &  KLE &  Ecc &  EigV &  Deg &  D-UE &  NSS &  SEU  & ${\operatorname{SGD}_{\delta+P}}$ &  ${\operatorname{SGD}_{s+P}}$     \\


        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Llama-3.2-1B} \\ 

        NQ & \underline{77.48\scriptsize{±0.55}} & {76.50\scriptsize{±0.53}} & {75.36\scriptsize{±0.50}} & {76.66\scriptsize{±0.62}} & {75.87\scriptsize{±0.57}} & {77.18\scriptsize{±0.51}} & {71.93\scriptsize{±0.63}} & {76.43\scriptsize{±0.56}} & {67.38\scriptsize{±0.72}} & {76.89\scriptsize{±0.55}} & \underline{\textbf{78.29\scriptsize{±0.55}}} \\
        CoQA & {73.73\scriptsize{±0.21}} & {73.22\scriptsize{±0.22}} & {74.59\scriptsize{±0.28}} & {73.84\scriptsize{±0.29}} & {70.82\scriptsize{±0.25}} & \underline{75.73\scriptsize{±0.27}} & {73.95\scriptsize{±0.28}} & {72.51\scriptsize{±0.22}} & {69.80\scriptsize{±0.27}} & {75.66\scriptsize{±0.44}} & \underline{\textbf{76.35\scriptsize{±0.29}}} \\
        BioASQ & {86.87\scriptsize{±0.45}} & {86.76\scriptsize{±0.47}} & {86.73\scriptsize{±0.40}} & {86.79\scriptsize{±0.45}} & {85.53\scriptsize{±0.42}} & \underline{87.25\scriptsize{±0.39}} & {85.62\scriptsize{±0.44}} & {86.36\scriptsize{±0.44}} & {78.78\scriptsize{±0.51}} & \underline{\textbf{87.56\scriptsize{±0.39}}} & {87.32\scriptsize{±0.38}} \\
        TriviaQA & \underline{82.17\scriptsize{±0.18}} & {81.15\scriptsize{±0.16}} & {80.41\scriptsize{±0.18}} & {81.13\scriptsize{±0.18}} & {78.84\scriptsize{±0.16}} & {81.64\scriptsize{±0.16}} & {79.25\scriptsize{±0.13}} & {80.51\scriptsize{±0.15}} & {77.04\scriptsize{±0.14}} & {82.24\scriptsize{±0.18}} & \underline{\textbf{82.44\scriptsize{±0.20}}} \\
        \textit{Average} & {80.06} & {79.41} & {79.27} & {79.61} & {77.77} & \underline{80.45} & {77.69} & {78.95} & {73.25} & {80.59} & \underline{\textbf{81.10}} \\
        
          
        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Llama-3.1-8B} \\ 
        NQ & {78.30\scriptsize{±0.43}} & {77.88\scriptsize{±0.47}} & {77.55\scriptsize{±0.44}} & {77.73\scriptsize{±0.46}} & {76.26\scriptsize{±0.41}} & \underline{78.64\scriptsize{±0.44}} & {75.00\scriptsize{±0.42}} & {77.48\scriptsize{±0.47}} & {71.03\scriptsize{±0.44}} & {78.84\scriptsize{±0.39}} & \underline{\textbf{78.87\scriptsize{±0.42}}} \\ 
        CoQA & {75.26\scriptsize{±0.36}} & {74.89\scriptsize{±0.35}} & {78.92\scriptsize{±0.27}} & {76.97\scriptsize{±0.40}} & {71.75\scriptsize{±0.33}} & \underline{80.04\scriptsize{±0.24}} & {77.90\scriptsize{±0.30}} & {74.14\scriptsize{±0.35}} & {72.71\scriptsize{±0.33}} & {78.59\scriptsize{±0.44}} & \underline{\textbf{81.56\scriptsize{±0.28}}} \\ 
        BioASQ & {83.40\scriptsize{±0.47}} & {83.35\scriptsize{±0.47}} & {84.28\scriptsize{±0.45}} & {83.03\scriptsize{±0.58}} & {81.32\scriptsize{±0.42}} & \underline{84.73\scriptsize{±0.46}} & {82.59\scriptsize{±0.57}} & {82.45\scriptsize{±0.51}} & {74.81\scriptsize{±0.74}} & {84.93\scriptsize{±0.42}} & \underline{\textbf{85.42\scriptsize{±0.48}}} \\ 
        TriviaQA & {85.95\scriptsize{±0.11}} & {85.23\scriptsize{±0.13}} & {85.67\scriptsize{±0.12}} & {84.97\scriptsize{±0.24}} & {83.27\scriptsize{±0.12}} & \underline{86.23\scriptsize{±0.12}} & {84.51\scriptsize{±0.13}} & {84.42\scriptsize{±0.13}} & {81.95\scriptsize{±0.13}} & {86.47\scriptsize{±0.13}} & \underline{\textbf{87.17\scriptsize{±0.12}}} \\ 
        \textit{Average} & 80.73 & 80.34 & 81.61 & 80.68 & 78.15 & \underline{82.41} & 80.00 & 79.62 & 75.13 & 82.21 & \underline{\textbf{83.26}}      \\ 

        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Mistral-7B-v0.3} \\ 
        NQ & {76.88\scriptsize{±0.60}} & {76.88\scriptsize{±0.60}} & \underline{77.58\scriptsize{±0.55}} & {77.24\scriptsize{±0.46}} & {76.62\scriptsize{±0.37}} & {77.42\scriptsize{±0.56}} & {76.15\scriptsize{±0.45}} & {76.67\scriptsize{±0.57}} & {71.85\scriptsize{±0.43}} & {77.68\scriptsize{±0.60}} & \underline{\textbf{78.39\scriptsize{±0.60}}} \\
        CoQA & {75.82\scriptsize{±0.33}} & {75.76\scriptsize{±0.29}} & {77.60\scriptsize{±0.21}} & {78.11\scriptsize{±0.35}} & {72.18\scriptsize{±0.27}} & \underline{79.61\scriptsize{±0.28}} & {78.44\scriptsize{±0.26}} & {75.32\scriptsize{±0.28}} & {73.47\scriptsize{±0.25}} & {78.81\scriptsize{±0.55}} & \underline{\textbf{80.58\scriptsize{±0.25}}} \\
        BioASQ & {80.86\scriptsize{±0.53}} & {80.90\scriptsize{±0.50}} & \underline{83.66\scriptsize{±0.41}} & {83.05\scriptsize{±0.50}} & {82.66\scriptsize{±0.50}} & {83.57\scriptsize{±0.53}} & {80.54\scriptsize{±0.55}} & {80.98\scriptsize{±0.49}} & {67.84\scriptsize{±0.41}} & {83.88\scriptsize{±0.60}} & \underline{\textbf{84.56\scriptsize{±0.54}}} \\
        TriviaQA & {83.76\scriptsize{±0.29}} & {83.53\scriptsize{±0.28}} & {83.86\scriptsize{±0.28}} & {83.74\scriptsize{±0.11}} & {82.80\scriptsize{±0.12}} & \underline{85.04\scriptsize{±0.28}} & {83.58\scriptsize{±0.28}} & {82.98\scriptsize{±0.28}} & {79.59\scriptsize{±0.12}} & {85.48\scriptsize{±0.26}} & \underline{\textbf{86.27\scriptsize{±0.27}}} \\ 
        \textit{Average} & {79.33} & {79.27} & {80.68} & {80.54} & {78.57} & \underline{81.41} & {79.68} & {78.99} & {73.19} & {81.46} & \underline{\textbf{82.45}} \\
        
        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Mistral-Nemo-12B} \\ 

        NQ & {76.78\scriptsize{±0.59}} & {76.35\scriptsize{±0.57}} & \underline{77.78\scriptsize{±0.58}} & {76.55\scriptsize{±0.47}} & {76.28\scriptsize{±0.58}} & {76.92\scriptsize{±0.52}} & {73.04\scriptsize{±0.44}} & {75.84\scriptsize{±0.56}} & {69.53\scriptsize{±0.39}} & \underline{\textbf{78.34\scriptsize{±0.50}}} & {77.14\scriptsize{±0.54}} \\
        CoQA & {76.08\scriptsize{±0.19}} & {75.72\scriptsize{±0.24}} & {78.09\scriptsize{±0.19}} & {77.25\scriptsize{±0.19}} & {71.11\scriptsize{±0.24}} & \underline{79.10\scriptsize{±0.14}} & {77.01\scriptsize{±0.16}} & {75.05\scriptsize{±0.23}} & {72.41\scriptsize{±0.20}} & {78.26\scriptsize{±0.73}} & \underline{\textbf{80.39\scriptsize{±0.18}}} \\
        BioASQ & {81.66\scriptsize{±0.48}} & {81.58\scriptsize{±0.56}} & \underline{84.54\scriptsize{±0.44}} & {82.20\scriptsize{±0.39}} & {81.90\scriptsize{±0.61}} & {83.60\scriptsize{±0.38}} & {79.55\scriptsize{±0.44}} & {80.91\scriptsize{±0.57}} & {69.64\scriptsize{±0.49}} & {84.03\scriptsize{±0.43}} & \underline{\textbf{84.54\scriptsize{±0.37}}} \\
        TriviaQA & {85.44\scriptsize{±0.10}} & {84.88\scriptsize{±0.19}} & {86.10\scriptsize{±0.10}} & {84.61\scriptsize{±0.14}} & {83.31\scriptsize{±0.11}} & \underline{86.29\scriptsize{±0.11}} & {84.29\scriptsize{±0.11}} & {84.07\scriptsize{±0.09}} & {81.47\scriptsize{±0.11}} & {86.83\scriptsize{±0.17}} & \underline{\textbf{87.39\scriptsize{±0.11}}} \\
        \textit{Average} & {79.99} & {79.63} & \underline{81.63} & {80.15} & {78.15} & {81.48} & {78.47} & {78.97} & {73.26} & {81.87} & \underline{\textbf{82.37}} \\



        \bottomrule
    \end{tabular}
    
    }

\end{table*}



\begin{table*}[htb]
    \centering
    \caption{Performance (AUARC) comparison of various uncertainty metrics. All results are presented as percentages. For each model-dataset combination, the best average result among the baseline methods and the best average result among our proposed methods are each \underline{underlined}, while the overall best result among all methods is highlighted in \textbf{bold}.
    } 
    \label{tab:main_ret_auarc}
    \resizebox{\textwidth}{!}{
    
    \begin{tabular}{cccccccccccc}
        \toprule % from booktabs package

        \multirow{2}{*}{\textbf{Datasets}} & \multicolumn{3}{c}{\textbf{Entropy-based Methods}} & \multicolumn{4}{c}{\textbf{Graph-based Methods}} & \multicolumn{2}{c}{\textbf{Consistency-based Methods}} & \multicolumn{2}{c}{\textbf{Ours}} \\

        \cmidrule(lr){2-4} \cmidrule(lr){5-8} \cmidrule(lr){9-10} \cmidrule(lr){11-12}
        
          &  SE &  DSE &  KLE &  Ecc &  EigV &  Deg &  D-UE &  NSS &  SEU  & ${\operatorname{SGD}_{\delta+P}}$ &  ${\operatorname{SGD}_{s+P}}$     \\
        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Llama-3.2-1B} \\ 

        NQ & {27.54\scriptsize{±0.63}} & {27.43\scriptsize{±0.63}} & {27.08\scriptsize{±0.58}} & {27.75\scriptsize{±0.49}} & {26.50\scriptsize{±0.53}} & \underline{28.10\scriptsize{±0.57}} & {25.91\scriptsize{±0.52}} & {27.20\scriptsize{±0.54}} & {23.70\scriptsize{±0.50}} & {27.62\scriptsize{±0.67}} & \underline{\textbf{28.56\scriptsize{±0.58}}} \\
        CoQA & {86.97\scriptsize{±0.28}} & {86.32\scriptsize{±0.16}} & {87.65\scriptsize{±0.11}} & {87.25\scriptsize{±0.14}} & {85.28\scriptsize{±0.12}} & \underline{87.98\scriptsize{±0.28}} & {87.45\scriptsize{±0.10}} & {86.08\scriptsize{±0.26}} & {85.80\scriptsize{±0.11}} & {87.87\scriptsize{±0.17}} & \underline{\textbf{88.34\scriptsize{±0.29}}} \\
        BioASQ & \underline{71.25\scriptsize{±0.86}} & {71.01\scriptsize{±0.85}} & {70.64\scriptsize{±0.93}} & {70.63\scriptsize{±0.90}} & {69.37\scriptsize{±0.88}} & {70.88\scriptsize{±0.83}} & {70.11\scriptsize{±0.86}} & {70.62\scriptsize{±0.91}} & {65.72\scriptsize{±0.81}} & \underline{\textbf{71.27\scriptsize{±0.84}}} & {71.22\scriptsize{±0.83}} \\
        TriviaQA & {52.04\scriptsize{±0.24}} & {51.48\scriptsize{±0.23}} & {51.39\scriptsize{±0.37}} & {51.55\scriptsize{±0.24}} & {49.35\scriptsize{±0.21}} & \underline{52.22\scriptsize{±0.24}} & {50.47\scriptsize{±0.21}} & {50.89\scriptsize{±0.23}} & {48.30\scriptsize{±0.22}} & {52.37\scriptsize{±0.27}} & \underline{\textbf{53.62\scriptsize{±0.23}}} \\
        \textit{Average} & {59.45} & {59.06} & {59.19} & {59.30} & {57.63} & \underline{59.80} & {58.49} & {58.70} & {55.88} & {59.78} & \underline{\textbf{60.44}} \\

        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Llama-3.1-8B} \\ 

        NQ & {51.74\scriptsize{±1.02}} & {51.10\scriptsize{±1.11}} & {51.39\scriptsize{±1.06}} & {51.11\scriptsize{±0.90}} & {49.51\scriptsize{±1.00}} & \underline{52.10\scriptsize{±0.92}} & {49.66\scriptsize{±0.97}} & {50.71\scriptsize{±1.20}} & {46.83\scriptsize{±0.81}} & {51.62\scriptsize{±0.97}} & \underline{\textbf{53.18\scriptsize{±0.91}}} \\
        CoQA & {94.74\scriptsize{±0.29}} & {94.79\scriptsize{±0.30}} & {96.02\scriptsize{±0.19}} & {95.80\scriptsize{±0.17}} & {94.13\scriptsize{±0.24}} & \underline{96.30\scriptsize{±0.15}} & {95.92\scriptsize{±0.16}} & {94.69\scriptsize{±0.18}} & {95.15\scriptsize{±0.15}} & {95.99\scriptsize{±0.24}} & \underline{\textbf{96.39\scriptsize{±0.15}}} \\
        BioASQ & {82.48\scriptsize{±0.77}} & {82.30\scriptsize{±0.80}} & {83.57\scriptsize{±0.67}} & {82.97\scriptsize{±0.63}} & {81.05\scriptsize{±0.79}} & \underline{83.94\scriptsize{±0.53}} & {82.82\scriptsize{±0.54}} & {83.21\scriptsize{±0.37}} & {81.84\scriptsize{±0.36}} & {83.44\scriptsize{±0.66}} & \underline{\textbf{84.90\scriptsize{±0.53}}} \\
        TriviaQA & {84.12\scriptsize{±0.31}} & {83.60\scriptsize{±0.33}} & {84.18\scriptsize{±0.32}} & {83.85\scriptsize{±0.32}} & {82.50\scriptsize{±0.33}} & \underline{84.49\scriptsize{±0.29}} & {83.57\scriptsize{±0.33}} & {83.21\scriptsize{±0.37}} & {81.84\scriptsize{±0.36}} & {84.21\scriptsize{±0.38}} & \underline{\textbf{85.71\scriptsize{±0.30}}} \\
        \textit{Average} & {78.27} & {77.95} & {78.79} & {78.43} & {76.80} & \underline{79.21} & {77.99} & {77.96} & {76.42} & {78.82} & \underline{\textbf{80.05}} \\
        
        

        
        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Mistral-7B-v0.3} \\ 
        NQ & {51.96\scriptsize{±0.61}} & {51.49\scriptsize{±0.71}} & {52.53\scriptsize{±0.50}} & {52.22\scriptsize{±0.50}} & {51.16\scriptsize{±0.56}} & \underline{52.75\scriptsize{±0.48}} & {52.01\scriptsize{±0.51}} & {51.27\scriptsize{±0.70}} & {49.52\scriptsize{±0.64}} & {51.98\scriptsize{±0.86}} & \underline{\textbf{53.71\scriptsize{±0.48}}} \\
        CoQA & {92.83\scriptsize{±0.26}} & {93.11\scriptsize{±0.22}} & {94.29\scriptsize{±0.20}} & {94.17\scriptsize{±0.16}} & {91.95\scriptsize{±0.32}} & \underline{94.47\scriptsize{±0.13}} & {94.32\scriptsize{±0.16}} & {93.02\scriptsize{±0.18}} & {93.28\scriptsize{±0.17}} & {94.05\scriptsize{±0.31}} & \underline{\textbf{95.97\scriptsize{±0.13}}} \\
        BioASQ & {80.46\scriptsize{±0.77}} & {80.07\scriptsize{±0.66}} & \underline{82.27\scriptsize{±0.70}} & {81.07\scriptsize{±0.59}} & {80.34\scriptsize{±0.86}} & {81.65\scriptsize{±0.63}} & {80.18\scriptsize{±0.61}} & {80.05\scriptsize{±0.71}} & {73.21\scriptsize{±0.62}} & {81.75\scriptsize{±0.72}} & \underline{\textbf{82.64\scriptsize{±0.64}}} \\
        TriviaQA & {82.58\scriptsize{±0.25}} & {82.53\scriptsize{±0.31}} & {83.02\scriptsize{±0.25}} & {82.31\scriptsize{±0.26}} & {81.81\scriptsize{±0.33}} & \underline{83.11\scriptsize{±0.32}} & {82.11\scriptsize{±0.30}} & {82.23\scriptsize{±0.37}} & {79.48\scriptsize{±0.35}} & {83.39\scriptsize{±0.22}} & \underline{\textbf{84.19\scriptsize{±0.32}}} \\
        \textit{Average} & {76.96} & {76.80} & \underline{78.03} & {77.44} & {76.32} & {78.00} & {77.16} & {76.64} & {73.87} & {77.79} & \underline{\textbf{79.13}} \\


        \midrule 
        \rowcolor{gray!20} \multicolumn{12}{c}{Mistral-Nemo-12B} \\ 

        
        NQ & {51.32\scriptsize{±1.30}} & {51.27\scriptsize{±1.17}} & \underline{52.12\scriptsize{±1.16}} & {51.18\scriptsize{±1.10}} & {50.47\scriptsize{±1.28}} & {51.70\scriptsize{±1.17}} & {49.83\scriptsize{±1.12}} & {50.82\scriptsize{±1.39}} & {47.12\scriptsize{±1.02}} & {51.82\scriptsize{±1.05}} & \underline{\textbf{53.46\scriptsize{±1.19}}} \\
        CoQA & {93.35\scriptsize{±0.24}} & {93.15\scriptsize{±0.24}} & {94.24\scriptsize{±0.18}} & {94.15\scriptsize{±0.17}} & {91.82\scriptsize{±0.17}} & \underline{94.54\scriptsize{±0.13}} & {94.10\scriptsize{±0.14}} & {93.02\scriptsize{±0.23}} & {93.03\scriptsize{±0.18}} & {94.17\scriptsize{±0.20}} & \underline{\textbf{95.60\scriptsize{±0.13}}} \\
        BioASQ & {82.31\scriptsize{±0.56}} & {82.00\scriptsize{±0.66}} & \underline{83.55\scriptsize{±0.53}} & {82.33\scriptsize{±0.53}} & {81.63\scriptsize{±0.70}} & {83.50\scriptsize{±0.45}} & {81.49\scriptsize{±0.56}} & {81.73\scriptsize{±0.59}} & {76.35\scriptsize{±0.54}} & {83.50\scriptsize{±0.68}} & \underline{\textbf{84.50\scriptsize{±0.46}}} \\
        TriviaQA & {85.35\scriptsize{±0.26}} & {85.07\scriptsize{±0.38}} & {85.85\scriptsize{±0.32}} & {85.14\scriptsize{±0.26}} & {84.14\scriptsize{±0.32}} & \underline{85.93\scriptsize{±0.27}} & {85.14\scriptsize{±0.27}} & {84.63\scriptsize{±0.29}} & {83.31\scriptsize{±0.27}} & {85.93\scriptsize{±0.27}} & \underline{\textbf{86.49\scriptsize{±0.25}}} \\
        \textit{Average} & {78.08} & {77.87} & \underline{78.94} & {78.20} & {77.02} & {78.92} & {77.64} & {77.55} & {74.95} & {78.86} & \underline{\textbf{80.01}} \\







        \bottomrule % from booktabs package
    \end{tabular}
    
    }

\end{table*}

\paragraph{Baselines}
We included nine UQ methods for comparison. These baselines are categorized as follows: (1) entropy-based methods, including Semantic Entropy (\textbf{SE}) \citep{farquhar2024detecting}, Discrete Semantic Entropy (\textbf{DSE}) \citep{farquhar2024detecting} and Kernel Language Entropy (\textbf{KLE}) \citep{nikitin2024kernel}; (2) graph-based methods, including \textbf{Ecc} \citep{lin2024generate}, \textbf{EigV} \citep{lin2024generate}, \textbf{Deg} \citep{lin2024generate} and \textbf{D-UE} \citep{da2024llm}; and (3) consistency-based methods, including Number of Semantic Sets (\textbf{NSS}) \citep{kuhn2023semantic} and Semantic Embedding Uncertainty (\textbf{SEU}) \citep{grewal2024seu}. 
For SE and DSE, we used GPT-3.5-Turbo-0125 for entailment prediction as recommended by \cite{farquhar2024detecting}. For KLE, we employed $\operatorname{KLE}(K_{\mathrm{HEAT}})$, as it demonstrates the best performance among all variants of KLE. To ensure fairness, the NLI model utilized in KLE, Ecc, EigV, Deg, and D-UE is identical to ours, specifically DeBERTa-Large-MNLI. For further details, refer to Appendix~\ref{sec:baseline_details}.
\paragraph{Implementation Details}

We repeated the experiment five times, each time randomly selecting 10 QA pairs from the dataset as context examples and subsequently dividing the remaining dataset into a validation set (1,000 QA pairs; 400 QA pairs for BioASQ) and a test set. Hyperparameter tuning for our method was conducted on the validation set. The hyperparameters $\delta$ and $\theta$ were selected from the set $\{0.01, 0.1, 0.2, \dots, 0.9, 0.99\}$ to maximize validation set performance. Finally, we evaluated the performance on the test data. Since both KLE and Ecc require hyperparameter tuning, to ensure fairness, all methods, including ours, were tuned using the same subset of the dataset. All experiments were conducted on a server equipped with one NVIDIA A100 (80GB) GPU.




\subsection{Main Results}


Table~\ref{tab:main_ret_auroc} and Table~\ref{tab:main_ret_auarc} present the AUROC and AUARC scores for various uncertainty metrics across 16 model-dataset combinations. $\operatorname{SGD}_{s+P}$ outperforms baseline methods in 15 out of 16 cases for both AUROC and AUARC. For instance, when evaluated on the CoQA dataset using Llama-3.1-8B, $\operatorname{SGD}_{s+P}$ achieved an average AUROC of 81.56\%, surpassing the best baseline result by 1.52 percentage points. 
$\operatorname{SGD}_{s+P}$ consistently outperforms SE due to its utilization of fine-grained semantic relations rather than relying solely on semantic equivalence. Like graph-based methods (e.g., Deg, Ecc) and KLE, $\operatorname{SGD}_{s+P}$ utilizes semantic similarity measured by the NLI model. However, the advantage of $\operatorname{SGD}_{s+P}$ lies in its incorporation of probability to adjust the contribution of each edge of the semantic graph to the uncertainty score, which our subsequent ablation experiments validate as effective (see Section~\ref{sec:ablation}). Between \( \operatorname{SGD}_{\delta + P} \) and \( \operatorname{SGD}_{s + P} \), the latter is preferable, as it achieves superior performance and requires only one hyperparameter, whereas the former requires two.

\subsection{Ablation Experiments}\label{sec:ablation}
\paragraph{Number of Possible Answers}
% In the main experiment, we sampled $N=10$ possible answers for each question. To investigate how the performance of our best method $\operatorname{SGD}_{s+P}$ and competitive baseline methods, including KLE and Deg, changes as the number of possible answers increases, we conducted additional experiments using Mistral-Nemo-12B on the TriviaQA dataset. 

In the main experiment, we sampled $N = 10$ possible answers for each question. In this section, we investigate how the performance changes as the number of possible answers increases. We selected SE, KLE, and Deg as baselines, as these three methods have demonstrated the strongest performance among numerous baseline approaches. We compared our best-performing method, $\operatorname{SGD}_{s+P}$, with these three baselines. The experiments were conducted using Mistral-Nemo-12B on the TriviaQA dataset.
In these experiments, $N$ varied from 3 to 10. Except for the variation in $N$, all other experimental settings were identical to those in the main experiment. The results, as shown in Figure \ref{fig:number_of_po_ans}, demonstrate that $\operatorname{SGD}_{s+P}$ is more generation-efficient compared to the baseline methods. Specifically, to achieve comparable AUROC or AUARC scores, $\operatorname{SGD}_{s+P}$ generates fewer possible answers, resulting in reduced computational resource consumption.
\begin{figure}[!h]
    \centering
    \includegraphics[width=0.22\textwidth]{TriviaQA_Mistral-Nemo-12B_AUROC.pdf}
    \includegraphics[width=0.22\textwidth]{TriviaQA_Mistral-Nemo-12B_AUARC.pdf}
\caption{Performance of different uncertainty metrics with increasing numbers of possible answers. The number of possible answers ranges from 3 to 10, incremented by 1 at each step. We only included competitive baseline methods SE, KLE and Deg for comparison.}
    \label{fig:number_of_po_ans}
\end{figure}
\paragraph{Effectiveness of Probability Incorporation}
In Section~\ref{sec:ssgd}, we presented how to adjust the contribution of each edge in the semantic graph to the uncertainty based on the probability of the paired nodes, namely, the probability of the paired answers. Here, we verify its effectiveness experimentally. Specifically, we compared the performance of $\operatorname{SGD}_{\delta}$ and $\operatorname{SGD}_{\delta+P}$, as well as that of $\operatorname{SGD}_{s}$ and $\operatorname{SGD}_{s+P}$, using Mistral-7B-v0.3 on the NQ and BioASQ datasets. Results are shown in Table~\ref{tab:unequal_contri}. The results on AUROC and AUARC indicate that the incorporation of probability is effective, as demonstrated by the fact that the performance of $\operatorname{SGD}_{\delta}$ is inferior to that of $\operatorname{SGD}_{\delta+P}$, and the performance of $\operatorname{SGD}_{s}$ is inferior to that of $\operatorname{SGD}_{s+P}$.
In addition, we draw the following conclusions: (1) $\operatorname{SGD}_s$ and Deg exhibit nearly identical performance, consistent with the theoretical analysis in Section \ref{generalization}; and (2) the superior performance of $\operatorname{SGD}_{s+P}$ over baseline methods KLE and Deg can be primarily attributed to the incorporation of probability. This is evident from the fact that $\operatorname{SGD}_{s}$ yields nearly identical results to Deg but performs worse than KLE in the Mistral-7B experiment\footnote{In the experiments with the Mistral-7B-BioASQ combination, KLE outperforms Deg. However, as shown in Tables \ref{tab:main_ret_auroc} and \ref{tab:main_ret_auarc}, Deg generally outperforms KLE across most cases.}; after incorporating probability, $\operatorname{SGD}_{s+P}$ outperforms both KLE and Deg.


\begin{table}[htbp]
    \centering

    \caption{Performance comparison among four variants of SGD ($\operatorname{SGD}_{\delta}$, $\operatorname{SGD}_{\delta+P}$, $\operatorname{SGD}_{s}$, and $\operatorname{SGD}_{s+P}$) and competitive baseline methods including SE, KLE, and Deg. Experiments were conducted on Mistral-7B-v0.3.}
    
    \label{tab:unequal_contri}

    \resizebox{\linewidth}{!}{
        \begin{tabular}{ccccc}
      \toprule % from booktabs package


    \multirow{2}{*}{\textbf{Methods}} & \multicolumn{2}{c}{\textbf{NQ}} & \multicolumn{2}{c}{\textbf{BioASQ}}  \\
    
    \cmidrule(lr){2-3} \cmidrule(lr){4-5} 
    
      &  AUROC &  AUARC &  AUROC &  AUARC \\
      
      \midrule % from booktabs package
      SE & {76.88\scriptsize{±0.60}} & {51.96\scriptsize{±0.61}}  &  80.86\scriptsize{±0.53} & 80.46\scriptsize{±0.77}            \\
      KLE & {77.58\scriptsize{±0.42}} & {52.53\scriptsize{±0.50}}  & 83.66\scriptsize{±0.41} & 82.27\scriptsize{±0.70}            \\
      Deg & {77.42\scriptsize{±0.56}} & {52.75\scriptsize{±0.48}}  & 83.57\scriptsize{±0.53}  & 81.65\scriptsize{±0.63}            \\
      \midrule

      $\operatorname{SGD}_{\delta+P}$ & {77.68\scriptsize{±0.60}} & {51.98\scriptsize{±0.86}} & 83.88\scriptsize{±0.60}  & 81.75\scriptsize{±0.72} \\
      $\operatorname{SGD}_{\delta}$ & {76.71\scriptsize{±0.58}} $\downarrow$ & {50.88\scriptsize{±0.88}} $\downarrow$ & {82.94\scriptsize{±0.55}} $\downarrow$   & {80.92\scriptsize{±0.70}} $\downarrow$       \\
      \midrule
      $\operatorname{SGD}_{s+P}$ & {78.39\scriptsize{±0.60}} & {53.71\scriptsize{±0.48}} & 84.56\scriptsize{±0.54} & 82.64\scriptsize{±0.64}  \\
      $\operatorname{SGD}_{s}$ & {77.41\scriptsize{±0.56}} $\downarrow$ & {52.74\scriptsize{±0.48}} $\downarrow$ &  {83.58\scriptsize{±0.54}} $\downarrow$ & 81.66\scriptsize{±0.63} $\downarrow$       \\
      \bottomrule % from booktabs package
    \end{tabular}
    }
\end{table}
\paragraph{Results of Diverse Beam Search}
Previous studies predominantly utilized multinomial sampling to generate multiple  answers, often setting $T=1$ to increase diversity. Few studies explored alternative sampling methods. In this paper, we conducted an ablation study using Diverse Beam Search \citep{vijayakumar2018diverse} because it tends to produce diverse yet highly probable responses \citep{vijayakumar2018diverse}, which is crucial for UQ \citep{aichberger2024semantically}. We sampled 10 answers for each question by configuring 10 groups, with each group containing one beam. The answer from the first group, generated via greedy beam search, was used to evaluate model accuracy. The subsequent groups, designed to introduce greater diversity, provided the possible answers used to assess consistency. We compared our best method, $\operatorname{SGD}_{s+P}$, with several competitive baseline methods, including SE, DSE, KLE, and Deg. The results, shown in Figure \ref{fig:dbs}, indicate that under Diverse Beam Search, KLE performs worse than SE, which contrasts with the multinomial sampling experiments where KLE outperforms SE in most cases. However, our method, $\operatorname{SGD}_{s+P}$, still outperforms other competitive baseline methods, demonstrating the stronger robustness of our approach.
\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.88\linewidth]{dbs.pdf}
  \caption{Performance comparison of different uncertainty metrics when generating possible answers using diverse beam search. We compare our best method $\operatorname{SGD}_{s+P}$ with competitive baseline methods SE, DSE, KLE, and Deg. All results are shown as percentages.}\label{fig:dbs}
\end{figure}
\subsection{Comparison of Computational Resource Consumption}
Previous experiments have demonstrated that $\operatorname{SGD}_{s+P}$ outperforms baseline methods. In this section, we specifically compare the resource consumption of different UQ methods, concentrating solely on the consumption of language model inferences, as these operations are significantly more computationally intensive than other components. All methods are sampling-based and first require sampling $N$ possible answers, which incurs the same consumption. 
To improve the accuracy of SE and DSE, we use GPT-3.5-Turbo-0125 for entailment prediction, following \cite{farquhar2024detecting}. This approach requires at most $O(N^2)$ model inferences, where $N$ is the number of statements to be compared. We also apply this strategy to NSS. KLE and all graph-based methods need to use the output logits of the NLI model to obtain the similarity between texts, which incurs the same resource consumption as ours. SEU requires $N(N-1)$ inferences of the all-mpnet-base-v2 model\footnote{\url{https://huggingface.co/sentence-transformers/all-mpnet-base-v2}} with 109M parameters to obtain the embeddings of answers.
In summary, our method outperforms SE, DSE, and NSS while consuming fewer resources; it outperforms graph-based methods at the same resource consumption; and although SEU consumes fewer resources than the other methods, its performance is inferior to that of both the other baseline methods and our method.

\section{Conclusion}
In this study, we proposed Semantic Graph Density (SGD), a novel method for Uncertainty Quantification (UQ) in short-form QA. SGD incorporates fine-grained semantic relationships and adjusts edge contributions within semantic graphs, enabling more precise uncertainty quantification. Our experiments on four free-form QA datasets demonstrate that SGD outperforms baseline methods. These results highlight the potential of SGD to enhance the reliability of LLMs, providing a promising direction for addressing hallucination in LLMs. Future work may explore extending SGD to other NLP tasks and integrating it with real-time decision-making systems.

\begin{acknowledgements}
We thank the area chair and reviewers for their constructive feedback, which significantly improved the clarity and quality of this paper.
This work is supported by the General Program of the National Natural Science Foundation of China (No. 62372459).
\end{acknowledgements}

\bibliography{uai2025-template}

\newpage

\onecolumn

% Solution: https://tex.stackexchange.com/questions/507795/revtex-multiple-authors-with-no-affiliations-how-to-put-equal-contribution-as
% Avoid printing two `\thanks`s
\emptythanks
% Create a place to save the current number of in-text footnotes
\newcounter{footnotecounter}
% Save the current number of in-text footnotes
\setcounter{footnotecounter}{\value{footnote}}
% Reset the footnote counter
\setcounter{footnote}{0}


\title{Enhancing Uncertainty Quantification in \\ Large Language Models through Semantic Graph Density\\(Supplementary Material)}
\maketitle
% This Supplementary Material should be submitted together with the main paper.

\appendix

\section{Bias Arising from Neglect of Answer Probabilities in Graph-Based Methods}\label{sec:bias}
Different sampled answers may have varying probabilities, which can be calculated using the numeric outputs (token-level logits) provided by LLMs. Answers with higher probabilities are considered more representative in UQ, compared to answers with lower probabilities \citep{geng2024survey}.

Table~\ref{tab:example_combined222} shows the responses of the same LLM to two questions. For both cases, the sample answers consisted of one ``\textcolor{brown}{\textbf{No}}'' and two ``\textcolor{brown}{\textbf{Yes}}'' responses.
However, the model’s confidence in the ``\textcolor{brown}{\textbf{Yes}}'' sampled answer is \textit{\textbf{notably higher in the first case}}, where the probabilities for ``\textcolor{brown}{\textbf{Yes}}'' and ``\textcolor{brown}{\textbf{No}}'' are \textcolor{brown}{\textbf{0.95}} and \textcolor{brown}{\textbf{0.15}}, respectively. In contrast, for the second case, the probabilities are \textcolor{brown}{\textbf{0.55}} and \textcolor{brown}{\textbf{0.45}}. It can be inferred that the model exhibits greater confidence in answering Question 1 compared to Question 2. Consequently, the uncertainty score assigned to Question 1 should be lower than that assigned to Question 2.

Existing graph-based methods \citep{lin2024generate,da2024llm} overlook answer probabilities and focus exclusively on LLM output text. Consequently, these methods would assign identical uncertainty scores to both questions, despite the sampled answers having different probabilities, which is problematic. 

\begin{table}[htbp]
    \centering
    \caption{Example Responses from BioASQ.} \label{tab:example_combined222}
    \begin{minipage}[t]{0.40\textwidth} % left table
        \centering
        \begin{tabular}{l}
 \toprule % from booktabs package
            % \textbf{Example Responses from NQ} \\
            % \midrule
            \textbf{Question 1} Has Denosumab (Prolia) \\ been approved by FDA ? \\
            % \midrule % from booktabs package
            \textbf{Ground Truth} \\
            Yes \\
                        \midrule
            \textbf{Model Answer} \\
            \textcolor{blue}{\textbf{Yes}} \textcolor{green}{\ding{52}} \\
            \midrule
            \textbf{Possible Answer, Probability} \\
            • $y^{(1)}$: \textcolor{brown}{\textbf{No, 0.15}} \\
            • $y^{(2)}$: \textcolor{brown}{\textbf{Yes, 0.95}} \\
            • $y^{(3)}$: \textcolor{brown}{\textbf{Yes, 0.95}} \\
            \bottomrule % from booktabs package
        \end{tabular}
        % \caption*{Example Responses from NQ}
    \end{minipage}%
    \hfill
    \begin{minipage}[t]{0.60\textwidth} % right table
        \centering
        \begin{tabular}{l}
 \toprule % from booktabs package
            % \textbf{Example Responses from NQ} \\
            % \midrule
            \textbf{Question 2} Is the ACE inhibitor \\ indicated for lung cancer treatment? \\
            % \midrule % from booktabs package
            \textbf{Ground Truth} \\
            No \\
                        \midrule
            \textbf{Model Answer} \\
            \textcolor{blue}{\textbf{Yes}} \textcolor{red}{\ding{56}} \\
            \midrule
            \textbf{Possible Answer, Probability} \\
            • $y^{(1)}$: \textcolor{brown}{\textbf{No, 0.45}} \\
            • $y^{(2)}$: \textcolor{brown}{\textbf{Yes, 0.55}} \\
            • $y^{(3)}$: \textcolor{brown}{\textbf{Yes, 0.55}} \\
            \bottomrule % from booktabs package
        \end{tabular}
        \caption*{}
    \end{minipage}
\end{table}








\section{Prompts for Obtaining Answers}\label{sec:adx_datasets}

We follow \cite{farquhar2024detecting} and use the following prompt template to obtain answers, including both the target answer (to evaluate the model's accuracy) and the possible answers (to measure the model's uncertainty), for datasets without context, such as NQ, TriviaQA, and BioASQ:


\begin{verbatim}
Question: {Example 1 Question}  
Answer: {Example 1 Answer}  
[Additional Examples]  
Question: {question}  
Answer:
\end{verbatim}


For CoQA, a dataset that includes context, we adopt a modified prompt template provided by \cite{lin2024generate}:


\begin{verbatim}
[The provided context paragraph]  
[Additional question-answer pairs]  
Q: [Provided question]  
A:
\end{verbatim}

Below are some example prompts for each dataset, formatted according to the template described above.

\begin{tcolorbox}[colback=white, colframe=black, arc=5mm, title=NQ, fonttitle=\bfseries]
\begin{tabular}{p{0.98\linewidth}}
\textbf{Question:} Can exosomes be detected in urine? \\
\textbf{Answer:} yes. \\

\textbf{Question:} Who wrote the music for Annie Get Your Gun? \\
\textbf{Answer:} Irving Berlin. \\

\textbf{Question:} What is the name of the speaker of parliament in Ghana? \\
\textbf{Answer:} Aaron Mike Oquaye. \\

\colorbox{gray!20}{\textbf{Additional Question-Answer Pairs}} \\

\textbf{Question:} When did Stevie Wonder release his first album? \\
\textbf{Answer:}  \\
\end{tabular}
\end{tcolorbox}


\begin{tcolorbox}[colback=white, colframe=black, arc=5mm, title=CoQA, fonttitle=\bfseries]
\begin{tabular}{p{0.98\linewidth}}

Once upon a time, in a barn near a farm house, there lived a little white kitten named Cotton. Cotton lived high up in a nice warm place above the barn where all of the farmer's horses slept. \colorbox{gray!20}{\textbf{Some contents are omitted here.}} 
``Don't ever do that again, Cotton!'' they all cried. ``Next time you might mess up that pretty white fur of yours and we wouldn't want that!'' 
Then Cotton thought, ``I change my mind. I like being special''. \\ \\

\textbf{Q}: What color was Cotton? \\
\textbf{A}: white. \\
\colorbox{gray!20}{\textbf{Additional Question-Answer Pairs}} \\
\textbf{Q}: Where did she live? \\
\textbf{A}:
\end{tabular}
\end{tcolorbox}




\begin{tcolorbox}[colback=white, colframe=black, arc=5mm, title=BioASQ, fonttitle=\bfseries]
\begin{tabular}{p{1.0\linewidth}}
% \hline
\textbf{Question:} In which cell organelle is the SAF-A protein localized? \\
\textbf{Answer:} the nucleus. \\
% \hline
\textbf{Question:} What is the role of IL-18BP? \\
\textbf{Answer:} IL-18 binding protein (IL-18BP) is a natural inhibitor of IL-18. The balance between IL-18 and IL-18BP has an important role in the inflammatory setting. \\

\textbf{Question:} Which is the genetic lesion associated with Huntington’s disease? \\
\textbf{Answer:} A CAG trinucleotide repeat expansion in the HD gene. \\
\colorbox{gray!20}{\textbf{Additional Question-Answer Pairs}} \\
\textbf{Question:} What is a exposome? \\
\textbf{Answer:} \\
\end{tabular}
\end{tcolorbox}





\begin{tcolorbox}[colback=white, colframe=black, arc=5mm, title=TriviaQA, fonttitle=\bfseries]
\begin{tabular}{p{0.98\linewidth}}
\textbf{Question:} Who discovered electromagnetic induction, so facilitating the transformer and dynamo? \\
\textbf{Answer:} Michael Faraday. \\

\textbf{Question:} Which card game, originating in Spain and introduced to England in 1861, is played between 2 persons with 2 packs of cards (with sixes and below removed) who are dealt 8 cards each? \\
\textbf{Answer:} Bezique. \\

\textbf{Question:} Whose twelfth studio album Magna Carta Holy Grail released in July has topped the charts on both sides of the Atlantic? \\
\textbf{Answer:} JAY-Z. \\

\colorbox{gray!20}{\textbf{Additional Question-Answer Pairs}} \\
\textbf{Question:} Which American won the Nobel Peace Prize in 2002? \\
\textbf{Answer:} \\
\end{tabular}
\end{tcolorbox}





\section{Example Responses}



Table~\ref{tab:example_combined} shows the responses of the Llama-3.1-8B model to two example questions, one from the NQ dataset (left) and one from the CoQA dataset (right).

\begin{table}[htbp]
    \centering
    \caption{Example Responses from NQ and CoQA.} \label{tab:example_combined}
    \begin{minipage}[t]{0.38\textwidth} % left table
        \centering
        \begin{tabular}{l}
            \toprule % from booktabs package
            \textbf{Example Responses from NQ} \\
            \midrule
            \textbf{Question} In the honour of which god \\ is anant chaturdashi celebrated ? \\
            % \midrule % from booktabs package
            \textbf{Ground Truth} \\
            Ganesh \\
            \textbf{Model Answer} \\
            4 gods \ding{56} \\
            \midrule
            \textbf{Possible Answers} \\
            • $y^{(1)}$: Hindu Lord Vishnu \\
            • $y^{(2)}$: 4 Avatars of Vishnu \\
            • $y^{(3)}$: Chaturbhuja or Kartikeya \\
            • $y^{(4)}$: 10 incarnations of Lord Vishnu \\
            • $y^{(5)}$: 12 Avatars of Lord Vishnu \\
            • $y^{(6)}$: the god of prosperity \\
            • $y^{(7)}$: 12-year-old Ganesh \\
            • $y^{(8)}$: 16 forms of Vishnu \\ 
            • $y^{(9)}$: 9 different deities of Hinduism \\
            • $y^{(10)}$: 16 forms of Vishnu \\ 
            \bottomrule % from booktabs package
        \end{tabular}
        % \caption*{Example Responses from NQ}
    \end{minipage}%
    \hfill
    \begin{minipage}[t]{0.62\textwidth} % right table
        \centering
        \begin{tabular}{l}
            \toprule % from booktabs package
            \textbf{Example Responses from CoQA} \\
            \midrule
            
            \textbf{Context} Once upon a time, in a barn near a farm house ... \\
            \textbf{Question} Whose paint was it? \\
            % \midrule % from booktabs package
            \textbf{Ground Truth} \\
            the old farmer's \\
            \textbf{Model Answer} \\
            the old farmer's \ding{52} \\
            \midrule
            \textbf{Possible Answers} \\
            • $y^{(1)}$: it was the old farmer's paint \\
            • $y^{(2)}$: the old farmer's paint \\
            • $y^{(3)}$: It was the old farmer's orange paint \\
            • $y^{(4)}$: the old farmer's paint \\
            • $y^{(5)}$: the old farmer's paint \\
            • $y^{(6)}$: the old farmer's paint \\
            • $y^{(7)}$: the farmer's \\
            • $y^{(8)}$: the old farmer's \\
            • $y^{(9)}$: the old farmer's paint \\
            • $y^{(10)}$: the old farmer's \\
            \bottomrule % from booktabs package
        \end{tabular}
        \caption*{}
    \end{minipage}
\end{table}






\section{Input Format for NLI Models to Measure Contextual Semantic Similarity}\label{prompt_nli}


Following \cite{lin2024generate}, we use the following format to obtain the output logits of the NLI model:

\verb|[x] [yi] [SEP] [x] [yj]|

\section{Prompts for LLM-based Correctness Checking}\label{prompt_correct}



We automatically determined whether the target answer is \textit{correct or incorrect} by using GPT-4-0613 to compare the given answer to the corresponding ground truth answers provided by the dataset. Following \cite{farquhar2024detecting}, we set the temperature to 0.01 in order to minimize randomness. We utilized the template provided by \cite{farquhar2024detecting} as follows:

\begin{verbatim}
We are assessing the quality of answer to the following question: {question}
The expected answer is(are): {reference answers}
The proposed answer is: {target answer}
Within the context of the question, does the proposed answer mean the same as the
expected answer? Respond only with yes or no.
\end{verbatim}

\paragraph{Performance of Correctness Checking.} 



In Note 6 of the supplementary materials of \citet{farquhar2024detecting}, the authors evaluated the agreement between GPT-4-0613 and human raters in assessing answer correctness. The results showed that (1) GPT-4-0613 agreed with two human raters at an average rate of 93\%, while the two raters agreed with each other at a similar rate of 92\%, and (2) compared to Llama-2-Chat-70B and GPT-3.5, GPT-4 demonstrated performance most closely aligned with human-level judgment.




\section{Baseline Implementation Details}\label{sec:baseline_details}
\begin{itemize}
\item \textbf{Semantic Entropy (SE)} \citep{farquhar2024detecting}, \textbf{Discrete Semantic Entropy (DSE)} \citep{farquhar2024detecting}. SE measures the entropy of the meaning distribution of free-form responses to questions. Specifically, SE groups the sampled answers that share the same meaning and computes the cluster-wise predictive entropy to quantify semantic uncertainty. For our study, we use the SE implementation from the Nature publication \citep{farquhar2024detecting} rather than the version from the ICLR publication \citep{kuhn2023semantic}. In accordance with the recommendation of \citet{farquhar2024detecting}, we use GPT-3.5-turbo-0125 to determine whether two answers entail one another. If two answers entail each other, they are considered to share the same meaning. DSE is an approximation of SE designed for black-box LLMs. Unlike SE, DSE estimates the probability of each meaning from the frequency with which it occurs among the sampled answers.

\item \textbf{Kernel Language Entropy (KLE)} \citep{nikitin2024kernel}. KLE treats each answer as a mixed state in quantum mechanics, where each pure state represents a distinct ``semantic meaning''. It then calculates the von Neumann entropy of the mixed state that corresponds to all sampled answers. Specifically, KLE constructs a semantic graph based on the fine-grained semantic relationships among answers, computes a semantic kernel $K$ over this graph, and calculates the von Neumann entropy of the kernel. We particularly employ the variant $\text{KLE}(K_{HEAT})$, which has demonstrated superior performance compared to other KLE variants.

\item \textbf{Sum of Eigenvalues of the Graph Laplacian (EigV)} \citep{lin2024generate}, \textbf{The Degree Matrix (Deg)} \citep{lin2024generate}, and \textbf{Eccentricity (Ecc)} \citep{lin2024generate}. These metrics construct a weighted semantic graph based on the semantic similarity among the sampled answers. EigV approximates the number of connected components by analyzing the eigenvalues of the graph Laplacian. Deg uses the degree matrix to quantify the diversity of the answers, while Ecc calculates the eccentricity of the spectral embedding, capturing the spread of the answers in the semantic space.
\item \textbf{Directed Uncertainty Evaluation (D-UE)} \citep{da2024llm}. D-UE captures the semantic relationships between answers using a bidirectional approach and quantifies uncertainty by analyzing directional instability. Specifically, D-UE constructs a directional graph based on the entailment logits from the NLI model. It then applies Random Walk Laplacian analysis, considering the asymmetric properties of the constructed directed graph. Finally, uncertainty is aggregated through the eigenvalues derived from the Laplacian process.
\item \textbf{Number of Semantic Sets (NSS)} \citep{kuhn2023semantic}. NSS clusters the sampled answers that share the same meaning and uses the number of clusters, which represents the number of distinct meanings conveyed by the prompt, as an uncertainty measurement.
\item \textbf{Semantic Embedding Uncertainty (SEU)} \citep{grewal2024seu}. SEU uses the negative average pairwise cosine similarity of the embeddings of the possible answers as an uncertainty measure. The embeddings are obtained using a pretrained embedding model; in this implementation, we use all-mpnet-base-v2\footnote{\url{https://huggingface.co/sentence-transformers/all-mpnet-base-v2}}. SEU argues that leveraging the embedding model to capture semantic similarities can achieve smoother and more robust estimation of semantic uncertainty than NLI models.


\end{itemize}


\section{Comparison with Non-Sampling-Based Methods}
The main comparison focuses on sampling-based baselines. In this section, we additionally compare against three non-sampling-based methods, which are briefly described below. The last two methods are verbalization-based, i.e., the LLM is prompted to express its uncertainty in words.
\begin{itemize}
    \item Perplexity: Perplexity is the exponential of the average per-token negative log-likelihood of a sequence, i.e., the negative log-likelihood of the sequence normalized by its length.
    \item Post-hoc Verbalized Uncertainty (PH-VU) \citep{linteaching}: Instructing an LLM to evaluate the confidence in the accuracy of its previously generated answer, and using the negative value of this confidence as the uncertainty score. We use the following prompt: {\textit{Q: \{question\} A: \{the best generation\}. The proposed answer is true with a confidence value (0-100) of}}
    \item In-line Verbalized Uncertainty (IL-VU) \citep{xiong2024canllm}: Directing an LLM to provide an answer along with its confidence score, and using the negative value of this score as the uncertainty score. We use the following prompt provided by \cite{xiong2024canllm}: {\textit{Read the question, provide your answer, and your confidence in this answer. Note: The confidence indicates how likely you think your answer is true. Use the following format to answer: Answer and Confidence (0-100): [Your answer], [Your confidence level, please only include the numerical number in the range of 0-100] Question: \{question\} Answer and Confidence (0-100):}}    
\end{itemize}

We follow the setup of the main experiments in our paper. The experimental results are shown in Tables~\ref{tab:nonsbmtd1} and~\ref{tab:nonsbmtd2}. The results indicate that each variant of our method consistently and significantly outperforms these non-sampling-based approaches.


\begin{table}[!h]
    \centering
    \caption{Results with Llama-3.1-8B (AUROC).} \label{tab:nonsbmtd1}
    \begin{tabular}{cccccc}
        \toprule 
          &  Perplexity & PH-VU & IL-VU & $\text{SGD}_{\delta+P}$ (Ours) & $\text{SGD}_{s+P}$ (Ours)\\
        \midrule 
        NQ & 62.02±0.32	& 67.83±0.82 & 64.35±0.91 & 78.84±0.39 & \textbf{78.87±0.42} \\
        BioASQ & 63.96±0.51 & 68.24±0.48 & 68.52±0.62 & 84.93±0.42 & \textbf{85.42±0.48} \\
        TriviaQA & 62.08±0.21 & 67.38±0.35 & 70.25±0.41 & 86.47±0.13 & \textbf{87.17±0.12} \\
        \bottomrule 
    \end{tabular}
\end{table}

\begin{table}[!h]
    \centering
    \caption{Results with Mistral-7B-v0.3 (AUROC).} \label{tab:nonsbmtd2}
    \begin{tabular}{cccccc}
        \toprule 
          &  Perplexity & PH-VU & IL-VU & $\text{SGD}_{\delta+P}$ (Ours) & $\text{SGD}_{s+P}$ (Ours)\\
        \midrule 
        NQ & 60.42±0.58	& 64.91±1.04 & 63.06±0.88 & 77.68±0.60 & \textbf{78.39±0.60} \\
        BioASQ & 63.74±0.46 & 66.05±0.62 & 68.10±0.74 & 83.88±0.60 & \textbf{84.56±0.54} \\
        TriviaQA & 62.21±0.31 & 70.68±0.53 & 73.69±0.69 & 85.48±0.26 & \textbf{86.27±0.27} \\
        \bottomrule 
    \end{tabular}
\end{table}

\end{document}
