\documentclass[accepted]{uai2025} 
\usepackage[american]{babel}
\usepackage{natbib} 
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} 
\usepackage{booktabs}
\usepackage{tikz}
\usetikzlibrary{bayesnet}
\usepackage{bbm}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{comment}
\usepackage{float}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}

\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{example}{Example}[section]

\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\title{Measuring IIA Violations in Similarity Choices with Bayesian Models}


\author[1]{\href{mailto:<correahs@cos.ufrj.br>?Subject=Your UAI 2025 paper}{Hugo~Sales~Corrêa}}
\author[2]{Suryanarayana~Sankagiri}
\author[1]{Daniel~Figueiredo}
\author[2]{Matthias~Grossglauser}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer and Systems Engineering\\
    Federal University of Rio de Janeiro (UFRJ)\\
    Brazil
}
\affil[2]{%
    School of Computer and Communication Sciences\\
    Ecole Polytechnique Fédérale de Lausanne (EPFL)\\
    Switzerland
}

\graphicspath{ {./images/} }
  
\begin{document}
\maketitle

\begin{abstract}
\input{uai2025/paper_sections/abstract}
\end{abstract}

\section{Introduction}\label{sec:introduction}
\input{uai2025/paper_sections/introduction}

\section{Models and methods}\label{sec:models_methods}

This section presents the statistical tests we use to quantify IIA violations in data. Before we introduce these methods, we present some relevant notation.  

Consider a scenario where a set of similarity questions is presented to a set of participants. We formalize this scenario as follows. Let $T$ be a set of items (photos, people, countries, etc.) and $\overline{Q}$ a set of similarity questions. Every question $Q \in \overline{Q}$ has a target item $t_Q \in T$ and a choice-set $C_Q \subseteq T$ with cardinality $|C_Q| \geq 2$. Note that choice-set sizes of different similarity questions in $\overline{Q}$ can be different. 

Let $P$ denote a set of participants. When presented with a question $Q \in \overline{Q}$, a participant $p \in P$ must choose the item in the choice-set $C_Q$ that is most similar to the target $t_Q$. The response of a participant is represented by a random variable $R_{pQ}$ which follows a categorical distribution $\pi_{pQ}$ over the choice-set $C_Q$: 
$R_{pQ} \sim \text{Cat}(\pi_{pQ})$.

Note that the above formulation is very general as it allows each participant to have a unique response distribution over the choice-set for every similarity question. However, the dependence on the participants can be dropped by assuming a homogeneous population (all participants have the same response distribution) or by marginalizing the participants. For the latter, assume participant $p$ is chosen randomly from $P$ according to some probability distribution. The marginal response to a question $Q$ is given by
$R_Q \sim \text{Cat}(\pi_Q)$, where $\pi_Q = \mathbb{E}_p[\pi_{pQ}]$.

With this notation in place, the Independence of Irrelevant Alternatives (IIA) property for similarity choice models can be defined as follows.

\begin{definition}[Independence of Irrelevant Alternatives (IIA)]
IIA holds for a question set $\overline{Q}$ if for any questions $Q, Q' \in \overline{Q} $ with $t_Q = t_{Q'}$,
$$k, k' \in C_Q \cap C_{Q'} \implies \frac{\pi_{Qk}}{\pi_{Qk'}} = \frac{\pi_{Q'k}}{\pi_{Q'k'}} ,$$
where $\pi_{Qk}$ is the probability that a participant chooses item $k$ in question $Q$.
\end{definition}
The definition requires questions to have the same target; it is not reasonable for IIA to hold over choice-sets with different targets (since the target can significantly influence the choices). 

The IIA assumption implies that $\pi_Q$, for all questions $Q$ having the same target, can be fully specified with $|T| - 1$ parameters, one per item not including the target (see further explanation in Appendix~\ref{app:IIA}). Under IIA, it is sufficient to specify a similarity score $s_k$ for every item $k \in T \setminus \{\ell\}$ to a fixed target $\ell$, independent of $Q$. Therefore, without loss of generality, the response probability vector can be represented as follows:
\begin{equation}
\pi_{Qk}(\mathbf{s}) = \frac{e^{s_k}}{\sum_{k' \in C_Q} e^{s_{k'}}} \; ; 
\label{eq:pi_s}
\end{equation}
that is, the probability of choosing item $k$ from choice-set $C_Q$ is proportional to $e^{s_k}$. This implies having a BTL model for question sets sharing the same target; question sets with different targets have different BTL parameters. Thus, IIA must be assessed in question sets that have the same target. 



\subsection{Testing for IIA}

Consider a question set $\overline{Q}$ where all questions have the same target, and a set of participants $P$ with $|P| = n$. Assume that participants provide responses to these questions, and let $r_{pQ} \in \{1, \ldots, |C_Q|\}$ be the response of participant $p$ to question $Q$, \emph{i.e.}, a realization of $R_{pQ}$. 
%\surya{I really don't think it is necessary to have separate notation for the random variable and its realization}

The likelihood of question $Q$ given the similarity vector $\mathbf{s}$ is given by
$$ L_Q(\mathbf{s}) = \prod_{k \in C_Q} (\pi_{Qk}(\mathbf{s}))^{a_{Qk}},$$
where $a_{Qk} = \sum_{p \in P} \mathbbm{1}(r_{pQ} = k)$ is the total number of participants whose response is $k$ to question $Q$. The combined log-likelihood for all questions in the question set is given by
    \begin{align}
%        L(\mathbf{\mathbf{s}}) &= \prod_{Q \in \overline{Q}}  L_Q(\mathbf{s}), \,\,\, \text{and} \\
        \log L(\mathbf{s}) &= \sum_{Q \in \overline{Q}} \log L_Q(\mathbf{s}). \label{eq:mnl_mle}
    \end{align}
Let $\hat{\mathbf{s}} = \arg\max_\mathbf{s} \log L(\mathbf{s})$, namely the Maximum Likelihood Estimator (MLE). Since all questions are being jointly considered, the value for $\hat{\mathbf{s}}$ will be a tradeoff between the questions. If IIA holds then $\pi_{Qk}(\hat{\mathbf{s}}) \approx n^{-1}a_{Qk}$ for all $Q$ and $k$, since the probabilities obtained from the MLE should be sufficiently close to their empirical ratios. However, if IIA does not hold, the empirical ratios can be far from the MLE probabilities. %Pearson's $\chi^2$ statistical test can be used determine if IIA holds on a given dataset \cite{Lehmann2022}. 

\subsubsection{A Classical goodness of fit test}
The above intuition can be formalized as a goodness of fit test (GFT). The null hypothesis is that the data is generated by a similarity choice model satisfying IIA, \textit{i.e.}, the true probabilities $\pi_Q$ can be parametrized by $\mathbf{s}$ via \eqref{eq:pi_s}. The alternate hypothesis is that the distributions $\pi_Q$ lie in some larger parameter space, possibly the unconstrained parameter space defined as the product of $(|C_Q|-1)$-simplices, one for each question $Q$.

% Pearson's statistic can be used to determine if the categorical results observed for different classes are statistically equivalent (i.e., they all follow the null hypothesis). Here, a class corresponds to a question and an observation corresponds to the response of a participant. Moreover, the null hypothesis under consideration is IIA, namely that responses follow $\pi_{Qk}(\mathbf{s})$ for some fixed $\mathbf{s}$.  

Consider Pearson's  $\chi^2$ test statistic, which is given by
\begin{equation}
    D(\mathbf{s}) = \sum_{Q \in \overline{Q}} \sum_{k \in C_{Q}}\frac{(n\pi_{Qk}(\mathbf{s}) - a_{Qk})^2}{n\pi_{Qk}(\mathbf{s})}. \label{eq:chi2}
\end{equation}
Under the null hypothesis, $D(\hat{\mathbf{s}})$ converges in distribution to $\chi^2_\nu$, where $\nu = \sum_{Q \in \overline{Q}} (|C_Q|-1) - (|T| - 2)$ is the total number of degrees of freedom. This is because in an unrestricted model, each question has $|C_Q|-1$ parameters, while under IIA there are $|T| - 2$ parameters (since one item in $T$ is the target, and the probability of an item is one minus the sum of the others). In contrast, if the probabilities do not follow \eqref{eq:pi_s} for any $\mathbf{s}$, $D(\hat{\mathbf{s}})$ is likely to be large. We calculate the $p$-value as the probability of drawing a sample from $\chi^2_\nu$ equal to or larger than $D(\hat{\mathbf{s}})$. If this $p$-value is low, the observed choices are unlikely to have been generated by an IIA-compliant model.

The described test can be seen as an approximation to a likelihood-ratio test between the BTL model and an unrestricted model~\citep{Lehmann2022}, having independent parameters for each question $Q \in \overline{Q}$. In the rest of the paper, we will refer to this test as the goodness of fit test, or GFT for short.

\begin{comment}
\subsubsection{Mcfadden-Tye-Train test}

We also apply the Mcfadden-Tye-Train test, a more explicit approximate likelihood-ratio test. It's basic idea is to obtain two MLE models, one by maximizing Equation \ref{eq:mnl_mle} on the full dataset of responses $\{a_Q\}_{Q \in \bar{Q}}$, and another on a reduced dataset $\{b_Q\}_{Q \in \bar{Q}}$. The counts $b_Q$ are obtained from $a_Q$ by excluding some item $l \in \{1, \ldots, |C_Q|\}$. If IIA holds, then $b_Q$ should have a distribution that maintains the count Let $\hat{\mathbf{s}}_a$ be the MLE parameters found in the original dataset $\{a_Q\}_{Q \in \bar{Q}}$, but with $s_l$ excluded. Also define $\hat{\mathbf{s}}_b$ as the MLE found on the reduced dataset $\{b_Q\}_{Q \in \bar{Q}}$. The likelihood ratio test is performed by evaluating both MLE estimates on the reduced dataset $\{b_Q\}_{Q \in \bar{Q}}$.

\begin{equation}
R = - 2\big[ \log L_b(\hat{\mathbf{s}}_a) - \log L_b(\hat{\mathbf{s}}_b)\big],
\end{equation}
where $L_b$ denotes the likelihood on $\{b_Q\}_{Q \in \overline{Q}}$. Under the null hypothesis, $R$ also converges to a $\chi^2_\nu$ distribution, but now with $\nu = K-2$. 
\end{comment}

\subsubsection{Combining Multiple statistics}
\label{sec:combining}

Consider the partition of a general question set $\overline{Q}$ by targets such that all questions in the subsets $\overline{Q}_1, \ldots, \overline{Q}_m$ of the partition share the same target. Note that the GFT can be applied to each question set, and thus each question set will have a $p$-value. However, one of our goals is to test for IIA violations in the dataset as a whole, and therefore these multiple $p$-values must be aggregated. One approach is to consider the minimum $p$-value to reject the null hypothesis. Using the minimum, the null hypothesis is rejected when at least one $p$-value is below the significance threshold. To avoid this approach leading to a high chance of a Type 1 error, Bonferroni Correction~\citep{Wasserman2004} is used to reduce the significance threshold from $\alpha$ to $\alpha/m$. 

Alternatively, the statistics computed on each question set can be added into a single value. Let $D_1, \ldots, D_m$ be the $\chi^2$ statistics for the respective question sets. The joint null hypothesis is that IIA holds for all question sets. Under the null, all $D_i$'s are approximately $\chi^2_{\nu_i}$ distributed. With the additional assumption that $D_1, \ldots, D_m$ are mutually independent, the aggregate statistic $D = \sum_i D_i$ will also be approximately $\chi^2_\nu$ distributed, with degrees of freedom $\nu = \sum_i \nu_i$ where $\nu_i$ is the degrees of freedom for the statistic $D_i$. Both approaches are considered in the numerical analysis that follows.

\subsection{Posterior Predictive Checks}

Posterior Predictive Checks (PPC) is a Bayesian diagnostic tool for assessing discrepancies between a Bayesian model and data~\citep{gelman1996posterior}. PPC is better thought of as an \textit{assessment}, rather than a test, which is geared towards checking \textit{usefulness}, rather than correctness. This is a relevant distinction given that IIA violations have already been demonstrated \citep{tversky1977features}. Being a Bayesian method, it is fundamentally different from classic $\chi^2$ tests, and thus serves as an alternative to measure IIA violations in data. In what follows, a brief introduction to PPC is provided.

%For a Bayesian version of the tests, we decided to apply Posterior Predictive Checks (PPC), instead of the direct analogue of likelihood ratio tests, which would be Bayesian Model Selection. In our scenario, given the previous accounts of IIA violations from \cite{tversky_features_1977} and others, we do not expect IIA to truly hold. Our inquiry is only about the criticality of assuming IIA in a model: are context effects so prevalent that such an assumption is detrimental to applications? This kind of question is very well suited to PPCs, which in essence, are diagnostic tools for discrepancies between model and data. Let us introduce the basic mechanics of PPC.

Let $\mathbf{y}$ be the observable data. A Bayesian generative model for $\mathbf{y}$ is given by
\begin{align*}
   p(\mathbf{y}) = \int_\theta p(\mathbf{y} \mid \theta) p(\theta) \, \text{d}\theta.
\end{align*}

The factorized model above implies a two-step data generative procedure: first sample $\theta$ with density $p(\theta)$, then use it to sample $\mathbf{y}$ with $p(\mathbf{y} \mid \theta)$. If, however, the observed data $\mathbf{y}^{\text{obs}}$ is given, Bayes' rule can be used to infer the likely value of $\theta$ to have generated $\mathbf{y}^{\text{obs}}$. In other words, we can calculate (and sample from) $p(\theta \mid \mathbf{y}^{\text{obs}})$. From the sampled values of $\theta$ given $\mathbf{y}^{\text{obs}}$, we can then generate replicate datasets $\mathbf{y}^{\text{rep}}$ with
\begin{equation}
p(\mathbf{y}^\text{rep} \mid \mathbf{y}^{\text{obs}}) = \int_\theta p(\mathbf{y}^\text{rep} \mid \theta) p(\theta \mid \mathbf{y}^{\text{obs}}) \, \text{d}\theta.
\end{equation}
If $\mathbf{y}^{\text{obs}}$ has indeed been generated by the assumed model, then $\mathbf{y}^{\text{rep}}$ should ``look like'' $\mathbf{y}^{\text{obs}}$. In PPC, this similarity translates to there being a relevant aspect of the data, represented by a statistic $T(\mathbf{y}^{\text{obs}})$, that any useful model needs to capture. Thus, under a useful model, $T(\mathbf{y}^{\text{rep}}) \approx T(\mathbf{y}^{\text{obs}})$. As shown in \cite{gelman1996posterior}, the Bayesian approach also allows $T$ to have $\theta$ as an extra argument. Finally, the posterior predictive $p$-value is defined as follows:
\begin{equation}
p_{\text{ppc}} = \mathbb{P}(T(\mathbf{y}^{\text{rep}}, \theta) \geq T(\mathbf{y}^{\text{obs}}, \theta) \mid \mathbf{y}^{\text{obs}}).
\end{equation}

In practice, $p_{\text{ppc}}$ is approximated by simulation, through Algorithm \ref{alg:ppc}.

\begin{algorithm}[H]
        \caption{Posterior Predictive Check}
        \label{alg:ppc}
        \begin{algorithmic}[1]
        \STATE \textbf{Input:} Data $\mathbf{y}^{\text{obs}}$, posterior $p(\theta \mid \mathbf{y}^{\text{obs}})$, model $p(\mathbf{y} \mid \theta)$, and statistic $T(\mathbf{y}, \theta)$.        
        \FOR{$i = 1$ to $N$}
            \STATE Sample $\theta^{(i)} \sim p(\theta \mid \mathbf{y}^{\text{obs}})$.
            \STATE Sample replicated data $\mathbf{y}^{\text{rep},(i)} \sim p(\mathbf{y} \mid \theta^{(i)})$.
            \STATE Compute statistic for observed data: $T(\mathbf{y}^{\text{obs}}, \theta^{(i)})$.
            \STATE Compute stat. for replicated data: $T(\mathbf{y}^{\text{rep},(i)}, \theta^{(i)})$.
        \ENDFOR
        \STATE Calculate the posterior predictive $p$-value:
        \[
        p_{\text{ppc}} = \frac{1}{N} \sum_{i=1}^N \mathbbm{1}\big(T(\mathbf{y}^{\text{rep},(i)}, \theta^{(i)}) \geq T(\mathbf{y}^{\text{obs}}, \theta^{(i)})\big)
        \]
        
        \STATE \textbf{Return:} $p_{\text{ppc}}$.
        \end{algorithmic}
\end{algorithm}

\begin{comment}
\begin{itemize}
    \item Algorithm 1
    \item PPC test for IIA: the Bayesian model, the t-statistic
    \item PPC test for user dependence: the Bayesian model, the t-statistic
    \item PPC test for a type of context effect?
\end{itemize}
\end{comment}

\subsubsection{PPC applied to choice models}

To test IIA with the PPC framework, we define a Bayesian version of the BTL model. For question sets $\overline{Q}_1, \ldots, \overline{Q}_m$, with targets $t_1, \ldots, t_m$, respectively,
\begin{align*}
    \sigma &\sim \text{HalfNorm}(2) \\
    s_{ik} &\sim \mathcal{N}(0, \sigma^2),\, i=1, \ldots, m, \,k \in T \setminus \{t_i\} \\
    a_{Q} &\sim \text{Mult}(n, \pi_Q(\mathbf{s}_i)), \, i=1, \ldots, m,\, Q \in \overline{Q}_i,
\end{align*}
that is, we define a half-normal hyperprior for the prior scale $\sigma$, and sample the similarity scores of items $k$ to target $t_i$ through a zero-mean Gaussian with standard deviation $\sigma$. Note that $\sigma$ is shared across all question sets $\overline{Q}_i$, making this a hierarchical model (see graphical representation in Figure~\ref{fig:graph_iia} in Appendix~\ref{sec:graphical_models}). 

When applied to survey data, the posterior distribution of $\sigma$ can be interpreted as the general magnitude of similarity scores. When $\sigma$ approaches 0, then most questions are answered close to uniformly at random. Conversely, if $\sigma$ is large, then one item is likely to stand out in each question. Having the generative model specified above, and having calculated the posteriors for all $s_{ik}$'s, we can then use the same statistic $D$ in Algorithm~\ref{alg:ppc} to obtain a Bayesian version of the goodness of fit test.

\subsection{Population homogeneity}\label{sec:pop_homogeneity_theory}

A common assumption in the study of context effects and IIA is that of population homogeneity. In essence, participants are statistically equivalent in their similarity judgement of the questions they respond to. It is also known that models like the Mixed Multinomial Logit model (MMNL) can violate IIA just by accounting for population heterogeneity \citep{mcfadden2000mixed, train2009discrete}. Thus, violations of IIA measured on real data can also be due to population heterogeneity, and not necessarily context effects induced by the choice-sets (see Appendix~\ref{app:heterogeneity} for a simple example). Therefore, an additional step when quantifying IIA violations is assessing population homogeneity. If the population is indeed homogeneous and IIA is violated, this provides stronger evidence that relative similarity to the target depends on the choice-set. A statistical test based on PPC to measure population homogeneity is presented in Section~\ref{sec:pop_homogeneity}. 


\section{Analysis of Synthetic Data}
\label{sec:synthetic}

Testing for IIA in similarity choices requires data where multiple questions share the same target. Moreover, the choice-sets of such questions must also overlap. In what follows we propose a model for generating synthetic data with such characteristics. This same data format will also be used in the user experiments to be presented.

\subsection{Generative model}
Let $Q^0$ denote a similarity question with a target $t_{Q^0}$ and choice-set $C_{Q^0} = \{c_1, c_2, c_3, c_4\}$. Question $Q^0$ is used to create four other similarity questions with the same target: $Q^{i}, i=1,\ldots,4$, where the choice-set $C_{Q^{i}} = C_{Q^0} \setminus \{ c_i \}$, namely dropping item $c_i$ once from the original choice-set. For example, the question $Q^{1}$ has choice-set $C_{Q^{1}} = \{c_2, c_3, c_4\}$. These five questions form a question set denoted by $\overline{Q}$. In what follows, three models are presented to generate $\pi_{Q^ik}$, namely the probability that item $c_k$ is chosen by a participant when presented question $Q^i$. 
%This idea to leave one item out of questions to build a question set to test for IIA has been adopted in the literature since the seminal work of Tversky~\cite{tversky1977features}. 

%Our synthetic dataset will consist of $m$ different questions sets: $\overline{Q}_1, \Bar{Q}_2, \ldots \Bar{Q}_m$.

{\bf IIA compliant.} Recall that under IIA, it is sufficient that every item has a similarity score to the target. Let $s_k \sim \mathcal{N}(0, \sigma^2)$ be a normally distributed and independent random variable for every $k=1,\ldots,4$. Given $s_k$, the following choice model is considered:
\begin{align}
    \pi_{Q^ik}(\mathbf{s}) = \frac{e^{s_k}}{\sum_{k' \in C_{Q^i}} e^{s_{k'}}}, \; k \in C_{Q^i}, \; i = 0,\ldots,4 
    \label{eq:IIA}
\end{align}
Note that every item $c_k$ is associated to a similarity score $s_k$, independently of the choice-set.

{\bf Additive perturbation to IIA.} In order to induce IIA violations, it is sufficient that the similarity scores of items to the target depend on the choice-set. The following model adds a perturbation to the baseline similarity scores. Let $\varepsilon^{i}_k \sim \mathcal{N}(0, \sigma_p^2)$ be a normally distributed and independent random variable for every $k=1,\ldots,4$ and $i=1,\ldots,4$. Note that the single parameter controlling $\varepsilon^{i}_k$ is $\sigma_p$. Conditioned on $\varepsilon^{i}_k$, the following choice model is considered:
\begin{align}
    \pi_{Q^ik}(\mathbf{s}) = \frac{e^{s_k + \varepsilon^{i}_k}}{\sum_{k' \in C_{Q^i}} e^{s_{k'}+\varepsilon^{i}_{k'}}}, \; k \in C_{Q^i}, \; i = 1,\ldots,4 
    \label{eq:additive}
\end{align}
Note that $Q^0$ is not perturbed. Moreover, if $\sigma_p = 0$, the additive perturbation becomes zero and the IIA compliant model is recovered; if $\sigma_p$ increases to large enough values the choice probabilities become relatively independent of each other. Thus, $\sigma_p$ is a parameter that controls how strongly the additive model induces IIA violations. The additive perturbation model is a general description of IIA violations, without any particular mechanism for inducing context effects. In fact, even when an alternative perturbation model is used for generating data, fitting the additive perturbation model to the synthetic data results in an estimated positive $\sigma_p$ (see Appendix~\ref{app:additive_sim}).


\begin{comment}
Assume that each question $Q$ is answered by $n$ people. Further, assume that each person makes a choice using the same probability distribution \textit{i.e.}, the population is homogenous. In this case, the statistics for $Q$ will follow a multinomial distribution $\textsf{Mult}(n, \pi)$, where, $\pi$ is a distribution over the choice-set $C$. We adopt the following Bayesian model for $\pi$:
\begin{align}
    s_j \sim \mathcal{N}(0, 2), \ 
    \pi_j = \frac{e^{s_j}}{\sum_{j'} e^{s_{j'}}}, \, j \in \{1, \ldots, 4\}. \label{eq:base}
\end{align}
Each $s_j$ can be interpreted as a measure of the similarity between the target $t$ and the choice item $c_j$, and users answer the similarity choice query in accordance with this similarity measure.

Next, we extend the model to account for the subquestions. Continue to assume that each subquestion is answered by $n$ people, who are homogenous in their similarity assessments. Thus, the statistics for each $Q^{-k}$ will follow a multinomial distribution $\textsf{Mult}(n, \pi^{-k})$, where, $\pi^{-k}$ is a distribution over the choice-set $C^{-k}$. We now specify three models for generating $\pi^{-k}$.

The first model, $\texttt{IIA}$, assumes the data satisfies the IIA assumption. Therefore, 
\begin{align}
    \pi^{-k}_j &= \frac{e^{s_j}}{\sum_{j' \neq k} e^{s_{j'}}}, \, &j, k \in \{1, \ldots, 4\}, \ j \neq k,\label{eq:IIA}
\end{align}
where each $s_j$ is the same as in $\eqref{eq:base}.$ In other words, it assumes that the similarity measure between the target and choice items is not affected by the choice-set.

The second model, $\texttt{AdditivePerturbation}$, perturbs the similarity measures in the subquestions relative to their original values as follows:
\begin{align}\label{eq:additive_perturbation1}
    \varepsilon^{-k}_j &\sim \mathcal{N}(0, \sigma_p),  \\
    \pi^{-k}_j = &\frac{e^{s_j+\varepsilon^{-k}_j}}{\sum_{j' \neq k} e^{s_{j'}+\varepsilon^{-k}_{j'}}}, \, j, k\in \{1, \ldots, 4\}, \ j \neq k \label{eq:additive_perturbation2}
\end{align}
Thus, the IIA assumption is violated in this model. The parameter $\sigma_p$ controls the strength of the perturbation. When the perturbation is large, it is possible that the order of similarity among the items changes. A larger $\sigma_p$ implies that the IIA violations are more easily detectable from empirical data.

The third model, $\texttt{MultiplicativePerturbation}$, perturbs the similarity measures in the subquestions in a different way:
\begin{align}\label{eq:multiplicative_perturbation1}
    \varepsilon^{-k} &\sim \mathcal{N}(1, \sigma_c),  \\
    \pi^{-k}_j = &\frac{e^{\varepsilon^{-k}s_j}}{\sum_{j' \neq k} \varepsilon^{-k}e^{s_{j'}}}, \, j, k\in \{1, \ldots, 4\}, \ j \neq k \label{eq:multiplicative_perturbation2}
\end{align}
In this model too, the IIA assumption is violated, with the degree of violation being specified by the parameter $\sigma_c$. For small values of $\sigma_c$, $\varepsilon^{-k}$ is likely to be close to one, which means the order of similarity among the choice items is likely to be retained. When $\sigma_c$ is large, there is a high chance that $\varepsilon^{-k}$ is negative. In this case, the similarity judgements are completely reversed, thereby exhibiting a gross violation of IIA.
\end{comment}

\subsection{Numerical evaluation}

Consider $m=100$ different question sets $\overline{Q}$, each generated independently by the generative models previously defined. Moreover, assume that each question in a question set is presented to $n=30$ simulated participants who all provide a simulated answer according to the choice probabilities defined by the respective model. 
%Note that for each question, the distribution of the number of participants choosing each item follows a multinomial distribution with parameters $n$ and $\pi(Q)$. 
Let $\sigma = 2$ for the IIA compliant model. Last, since the dataset is random, so are the $p$-values; the entire experiment is therefore independently repeated 30 times, and the averages of the minimum and aggregate $p$-values are presented. 

%The results of an experiment of a question set $\Bar{Q}$ can be organized in a {\em contingency table} where rows corresponds to questions ($Q^i, i=0,\ldots,4)$, columns correspond to items ($c_k, k=1,\ldots,4)$, and a table entries correspond to the number of participants that chose item $c_k$ in question $Q^i$, namely $a_{Q^ik}$. The contingency table is the basis for computing the MLE under the null hypothesis and for running the $\chi^2$ statistical test for IIA violations. Recall that each question set will yield a $p$-value but these can be combined to yield a single $p$-value for the dataset (see Section~\ref{sec:combining}). 


\begin{figure}[t]
\centering
\includegraphics[width=1.0\linewidth]{additive_lineplot.pdf}
\caption{$p$-values obtained by the statistical tests for IIA violations as a function of $\sigma_p$ for the additive perturbation model.}
\label{fig:test_additive}
\end{figure}

Figure~\ref{fig:test_additive} shows the $p$-values for data generated by the additive perturbation model as a function of $\sigma_p$, for both the minimum and aggregate $p$-values. Note that as $\sigma_p$ increases, the $p$-value for both statistical tests decreases, eventually crossing the significance threshold of 0.0005 or 0.05 for the minimum and aggregate cases, respectively. However, crossing the significance threshold in the minimum test requires a larger perturbation (around $\sigma_p = 0.35$ and $\sigma_p = 0.45$ for GFT and PPC, respectively) than in the aggregate test (around $\sigma_p = 0.2$ and $\sigma_p = 0.3$ for GFT and PPC, respectively). In essence, this is the amount of perturbation required for IIA to be rejected. Interestingly, the $p$-values for both tests decay relatively similarly with $\sigma_p$, validating one another. Note that for PPC, when $\sigma_p \geq 0.6$ the $p$-value is zero since all samples from the posterior in the simulation were rejected. Last, an interesting phenomenon occurs at $\sigma_p = 0$ in the aggregate test; here, the $p$-value for the classical GFT is slightly larger than that of PPC, indicating that the null hypothesis is less likely to be rejected under GFT than under PPC. 

\begin{figure}[t]
\centering
\includegraphics[width=1.0\linewidth]{additive_rejections.pdf}
\caption{Number of rejections as a function of the selection threshold $\alpha$ for different $\sigma_p$.}
\label{fig:rej_additive}
\end{figure}

The GFT and PPC tests can also be used to determine if a particular question set violates the null hypothesis. A selection threshold $\alpha$ can be applied to each question set, and question sets with a $p$-value below $\alpha$ are rejected under the null. Figure~\ref{fig:rej_additive} shows the number of rejections as a function of $\alpha$ for different $\sigma_p$ for both tests. Note that for $\sigma_p=0$, the number of rejections under GFT grows linearly with the threshold $\alpha$ as expected (under the null hypothesis, the $p$-values are uniformly distributed in the limit $n \rightarrow \infty$). However, PPC rejects fewer question sets for smaller values of $\alpha$. As $\sigma_p$ increases, GFT rejects more with very small $\alpha$ values and for $\sigma_p = 0.8$ around 60 question sets are rejected as soon as $\alpha$ is non-zero. PPC is slower to start rejections but it is faster to terminate rejecting all question sets. PPC rejects all question sets before $\alpha = 1$ while GFT requires $\alpha = 1$ to reject all. Thus, there is a tradeoff in these two statistical tests in the context of IIA. 



\section{Experimental results}\label{sec:experiments}

In order to test for IIA in similarity choices made by people, two different experiments have been designed in the form of web surveys. The items appearing in the similarity questions of the surveys are images of dishes, fruits, snacks and food items in general. The set $T$ of 100 items used in the surveys was selected by manually curating the CROCUFID dataset \citep{CROCUFID}, so as to achieve the following properties:
\begin{enumerate}
    \item Variety: western and eastern food dishes, sweet and salty snacks, fruits, etc.
    \item Compositionality: items have single or multiple ingredients or combinations. For instance, meatball with mashed potatoes versus meatball alone.
    \item Perspective: the same item can appear from different visual perspectives. For instance, a whole loaf of bread versus bread slices. 
\end{enumerate}
Similarity judgement between the curated items can be drawn in many ways,  such as using ingredients, color, taste, and even culture associated with the items. Thus, the experiment serves as a prototypical setting for studying complex similarity judgements, and in particular, for testing for IIA. 

The web surveys designed have the same general structure: A participant provides her responses to 20 similarity questions; each question is comprised of one target food item displayed on the top of the screen and a choice-set displayed on the bottom (with three or four options); we ask ``Which option is most similar to the food item on top?'' to which the participant must respond by selecting exactly one item from the choice-set, before moving on to the next question (revising answers by returning to previous questions is also not allowed). The Prolific\footnote{Prolific is an online research platform with over 200k registered participants: \url{https://www.prolific.com/}} platform was used to solicit paid participants for the surveys, and no demographic filters were used when soliciting participants. On average, a participant completed the survey in 3 minutes\footnote{Excluding the time to log in the system and read the instructions.}. We provide a screenshot of the survey website in Appendix~\ref{sec:survey_website}.

\subsection{Handcrafted Dataset}

The first experiment is inspired by the one used by Tversky to show IIA violations and illustrate the {\em diagnosticity principle} in similarity judgements~\citep{tversky1977features}. In such an experiment, questions are generated in pairs that have the same target and a single item difference in their choice-set. More precisely, $Q_a$ and $Q_b$ have the same target $t$ and choice-sets $C_{Q_a} = \{c_1, c_2, c_3\}$ and $C_{Q_b} = \{c_1, c_2, c_4\}$, respectively. Moreover, the questions should be designed such that $c_1$ and $c_2$ are comparatively more similar to $t$, and item $c_3$ or $c_4$ is more similar to $c_1$ or $c_2$, respectively, but also dissimilar from $t$. The general idea is that $c_3$ and $c_4$ change the context for the question, and can thus change the ratios of responses between $c_1$ and $c_2$ in the two questions (thus, violating IIA). %Figure~\ref{fig:question_0058} shows an example of a question pair used in the survey.

\begin{comment}
\begin{figure}
    \centering
    \includegraphics[width=1.05\linewidth]{handcrated_example_0336.png}
    \caption{Caption}
    \label{fig:enter-label}
\end{figure}
\end{comment}

\begin{figure}
    \centering
    \includegraphics[width=1.1\linewidth]{handcrated_example_0058.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting that item; the blue curve shows the distribution of the (posterior) number of choices.}
    \label{fig:question_0058}
\end{figure}

A dataset consisting of 20 question pairs ($Q_a, Q_b$) was manually built by the authors using the 100 curated food items. Each food item appears at most once in a survey (either as the target or in the choice-set) in order to minimize dependencies between question pairs. A participant in the survey was either presented with $Q_a$ or $Q_b$ but not both, for all 20 question pairs (see Appendix~\ref{sec:survey} for all twenty questions). The version $Q_a$ or $Q_b$ of a question pair was randomly chosen for each participant, as well as the order in which the participant answers the 20 questions\footnote{One question was used as a honey pot to flag spurious participants, and is not considered in the analysis.}. The total number of participants was 207.

Figure~\ref{fig:question_0058} illustrates the question pair with the smallest $p$-values for both GFT and PPC. Note that adding a red-coloured fruit (strawberry) in $Q_a$ increases the choices for the green fruit (kiwi), while adding a green-coloured fruit (pear) in $Q_b$ increases the choices of the red fruit (raspberry). This is a good example of what \citet{tversky1977features} calls the {\em diagnosticity principle} and a clear violation of IIA. Moreover, note that the posterior distribution for the number of times an item is chosen in that choice-set under IIA is not a good model, as indicated by its distance from the actual number of choices (red vertical bars).

\begin{comment}
    Individual results for handcrafted dataset:
    {'min_ppc': 0.006583333333333333, 'min_gft': 0.000516026123911077, 'agg_ppc': 0.041166666666666664, 'agg_gft': 1.4808368181607826e-05}
\end{comment}

The minimum $p$-values for GFT and PPC were 0.00052 and 0.0066, and thus the IIA is rejected by GFT ($\alpha=0.05/19 = 0.0026$). The aggregate $p$-values for GFT and PPC were 0.000015 and 0.041, and thus IIA is rejected by both tests. 

\begin{figure}
    \centering
    \includegraphics[width=1\linewidth]{handcrafted_survey_pvalues.png}
    \caption{$p$-values obtained by PPC and GFT for each question set in the handcrafted dataset sorted by GFT value. Diagonal line corresponds to uniform distribution under IIA hypothesis.}
    \label{fig:handcrafted_pvalues}
\end{figure}

Figure~\ref{fig:handcrafted_pvalues} shows the $p$-values obtained for both tests for all questions in the survey (sorted by GFT). As with the synthetic dataset, GFT has smaller values than PPC. Note that under the joint null hypothesis (IIA), the $p$-values for GFT follow a uniform distribution, and thus the empirical CDF of $p$-value samples should follow a diagonal line, as indicated in the plot. The measured GFT $p$-values are below this diagonal line corroborating the rejection of the null hypothesis. 

\begin{figure}[h]
    \centering
    \includegraphics[width=1\linewidth]{posteriors_additive_model_handcrafted_survey.png}
    \caption{Posterior distributions for $\sigma$ and $\sigma_p$ given the handcrafted survey dataset. The mean values for $\sigma$ and $\sigma_p$ are 2.1 and 0.29, respectively.}
    \label{fig:additive_posterior_handcrafted}
\end{figure}

Besides testing for the IIA hypothesis, the additive perturbation model was also fitted to the handcrafted dataset. A $p$-value of 0.254 was obtained with PPC, thus implying this model can better represent this dataset (and not be rejected). Figure~\ref{fig:additive_posterior_handcrafted} shows the posterior distributions for $\sigma$ and $\sigma_p$ given the dataset. Interestingly, the posterior for $\sigma_p$ falls within $[0.12, 0.48]$ with high probability indicating it is an important component of the model. Moreover, the ratio between the average $\sigma_p$ and average $\sigma$ is $0.29/2.1 = 0.138$, indicating its relative magnitude is not insignificant. 

Interestingly, the average $\sigma_p$ when fitting the additive model to IIA compliant simulated data was 0.049, indicating this parameter plays a small role in this scenario (where IIA is present) but not on real data (see Figure~\ref{fig:additive_posterior_iia} in the Appendix). 

\subsection{Randomized Dataset}

The second experiment was designed to have a very different flavor. In contrast to manually curating question pairs, question sets were randomly determined using the curated food items. In particular, a total of 100 question sets were generated, each having a different target (thus, every food item in $T$ served as a target). For every target, a question $Q^0$ was generated by randomly selecting four food items for its choice-set. From $Q^0$, four questions were created by removing one of the items in its choice-set at a time, identical to the procedure described in Section~\ref{sec:synthetic}. Thus, $Q^i, i=1,\ldots,4$ have choice-sets with size 3.
%Note that this dataset has no cognitive biases in the construction of the choice-sets for the different targets. In particular, some question sets are rather strange where the similarity between the target and the choices are far from clear (see Figure~\ref{})! Can IIA be violated in such a randomized scenario? \surya{Rewrite the last few sentences.}

While the total number of questions in this dataset is 500 (100 question sets each with 5 questions), the survey of a participant had only 20 questions, randomly chosen from the set of 500. However, in every participant survey, items in the target or choice-sets only appeared once. Last, every question received at least 18 responses, and 30 on average. 
%\surya{This paragraph could be written a bit more clearly.}

\begin{figure}
    \centering
    \includegraphics[width=0.9\linewidth]{uai2025/images/random_survey_pvalues.png}
    \caption{$p$-values obtained by PPC and GFT for each question set in the randomized dataset sorted by GFT value. Diagonal line corresponds to uniform distribution under IIA hypothesis.}
    \label{fig:random_pvalues}
\end{figure}

\begin{comment}
Individual results for randomized dataset:
{'min_ppc': 0.011,
'min_mtt': 0.3296757363448152,
'min_gft': 0.00026558998609916374,
'agg_ppc': 0.005666666666666667,
'agg_mtt': 1.0,
'agg_gft': 2.0607242932986737e-05}
\end{comment}

The minimum $p$-values for GFT and PPC were 0.00027 and 0.011, and thus the IIA is rejected by GFT ($\alpha=0.05/100 = 0.0005$). The aggregate $p$-values for GFT and PPC were 0.00002 and 0.0057, and thus IIA is rejected by both tests. The individual $p$-values per question set are shown in Figure~\ref{fig:random_pvalues}.


\begin{figure}[h]
    \centering
    \includegraphics[width=1\linewidth]{posteriors_additive_model_random_survey.png}
    \caption{Posterior distributions for $\sigma$ and $\sigma_p$ after fitting the random survey data. The mean values for $\sigma$ and $\sigma_p$ are 1.6 and 0.21, respectively.}
    \label{fig:additive_posterior_random}
\end{figure}

The additive perturbation model was also fitted to the randomized dataset and a $p$-value of 0.518 was obtained with PPC, implying this model can better represent this dataset (and not be rejected). Figure~\ref{fig:additive_posterior_random} shows the posterior distributions for $\sigma$ and $\sigma_p$ given this dataset. 
Interestingly, the posterior for $\sigma_p$ falls within $[0.14, 0.29]$ with high probability indicating its importance in fitting this model. The ratio between the average $\sigma_p$ and average $\sigma$ is $0.22/1.6 = 0.138$. Notably, this ratio is the same as for the handcrafted dataset. This result suggests that there is significant deviation from IIA even among randomly sampled similarity questions.


\begin{comment}
\subsection{The Data Collection Process}
\begin{itemize}
    \item How much data
    \item How we collect it
    \item Rationale behind the collection procedure
\end{itemize}

\subsection{Results}
\begin{itemize}
    \item PPC rejects IIA on random data
    \item PPC does not reject user dependence, \textit{i.e.}, suggests user homogeneity -- motivate by the possibility of lack of diagnosticity principle. IIA can be violated without diagnosticity if we have used dependence. Do we have it? 
    \item PPC strongly rejects IIA on handcrafted data, because these questions were indeed designed to induce context effects
    \item Baselines don't say anything conclusive? (too many $p$-values, too little data?)
\end{itemize}
\end{comment}

\section{Testing for population homogeneity}\label{sec:pop_homogeneity}

In this section, we develop a statistical test for population homogeneity (PH) based on the PPC framework. The motivation behind this test is to investigate whether a heterogeneous population is a significant factor behind the observed IIA violations (see Section~\ref{sec:pop_homogeneity_theory}). Our null hypothesis is that the respondents form a homogeneous population, as similarity comparisons are not too subjective (unlike preferences).
%Indeed, compared to preferences (judgements of taste), similarity comparisons are not as subjective, making it reasonable to assume that all users are statistically similar in their responses.

Suppose one has a survey of questions $\overline{Q}$. For each question $Q \in \overline{Q}$, one has a baseline distribution $\pi_Q$ specifying probabilities of responses for each question. Suppose a new participant takes the survey, and we want to test whether they follow the baseline distribution in their responses, or whether they display anomalous behaviour.
Let $I_p$ denote the {\em information content} (IC) of participant $p$, given by the negative log-probability of its selections, \textit{i.e.},
\begin{equation}
    I_p = - \sum_{Q \in \overline{Q}} \log \pi_{Qr_{pQ}}.
\end{equation}

A participant $p$ that answers according to the distribution $\pi_Q$, for all $Q \in \overline{Q}$, will have an $I_p$ whose expected value is the sum of entropies of each $\pi_Q$. A participant with a response distribution that is significantly different would have a much larger $I_p$. Therefore $I_p$ is a useful statistic to test whether a new participant $p$ follows the pre-specified parameters $\pi_Q$. The statistical test we propose is an extension of this basic idea to a set $P$ of participants and unknown parameters $\pi_Q$. We use the responses of the participants themselves to estimate the parameters, and then aggregate the $I_p$s of each participant into a single statistic, as we will see below.

Consider the experiment using the randomized dataset and a question set $\overline{Q}$ composed of all questions with four choices, $Q^0$. A total of 148 participants provide responses to these 100 questions and each participant answers 20 questions. Recall that each question has a unique target, and thus the similarity of items to the target can be treated independently (each item in each question has a similarity value). Thus, the MLE will simply be the empirical proportion of each chosen item $\pi_{Q^0k}(\hat{\mathbf{s}}) = n^{-1}a_{Q^0k}$, for $k \in C_{Q^0}$.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\linewidth]{../images/ic_distribution2.png}
    \caption{Distribution of the information content of participants.}
    \label{fig:IC}
\end{figure}

Figure~\ref{fig:IC} shows the distribution of the information content of the 148 participants. Note that the distribution appears skewed to the right, as indicated by two $I_p$'s higher than 25.

We define the test statistic for PPC to be the difference between the maximum and minimum information content among participants, namely
\[ T = \max_{p \in P} I_p - \min_{p \in P} I_p. \] We reject the null hypothesis (that the population is homogeneous) if $T$ is larger than what is expected under the null. We use the same PPC framework as before (Algorithm~\ref{alg:ppc}).
% To calculate the distribution of $T$ under the null hypothesis, the posterior predictive checks can be adopted.
%one can either employ permutation tests, by shuffling the responses of different participants to the same question, or employ posterior predictive checks.
% Using the responses of participants that have answered the same question, the posterior distribution $p(\pi \mid R^\text{obs})$ can be computed and this enables the execution of Algorithm \ref{alg:ppc}. 

PPC returns a $p$-value of 0.0315, thus rejecting PH. However, this is not surprising as the distribution of IC indicated the presence of an outlier. Moreover, when the responses of the single outlier participant are removed from the dataset, PPC returns a $p$-value of 0.27. Therefore, the null hypothesis cannot be rejected. This analysis suggests that the population is fairly homogeneous. Thus, the violations in IIA we observe are likely to stem from context effects.

\begin{comment}
In the case of PPC, we use the formulation in terms of utilities to define a fully generative model.
\begin{align*}
    \sigma_u &\sim \text{HalfNorm}(2) \\
    u(Q_i) &\sim \mathcal{N}(0, \sigma_u) \\
    R_j(Q_i) &\sim \text{Cat}(\text{softmax} (u(Q_i)))\, , \text{for } j=1, \ldots, N.
\end{align*}

Once we collect participant responses $ R^\text{obs} = \{R^\text{obs}_j(Q_i)\}_{i, j}$, then we obtain the posterior distribution $p(\pi \mid R^\text{obs})$ that enables us to execute Algorithm \ref{alg:ppc}. On Figure \ref{fig:IC}, we show an example of the calculation of $T$ for some sampled $\pi$ from the posterior distribution. We see that the replicated data fails to obtain a statistic higher than the true data, indicating a possible rejection of the PPC, if such a pattern holds in other sampled values for $\pi$.
\end{comment}

\begin{comment}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\linewidth]{../images/ic_hists.png}
    \caption{Ic values, both for observed and replicated data, with some sampled $pi$ from the posterior.}
    \label{fig:IC}
\end{figure}
Indeed, when we run the full algorithm, we obtain a $p$-value of 0.0315, thus rejecting participant homogeneity, as we have the presence of one outlier participant. 

(\textcolor{red}{for MATT, DANIEL and SURYA: the one we reviewed the responses last year}). 
\end{comment}

\begin{comment}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\linewidth]{../images/ic_full_stat.png}
    \caption{$p$-value calculations}
    \label{fig:IC2}
\end{figure}

However, when we exclude the outlier participant from the dataset, the resulting $p$-value is 0.27. Therefore, the null hypothesis cannot be rejected.
\end{comment}

\begin{comment}
\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\linewidth]{../images/ic_stat_fixed.png}
    \caption{$p$-value calculations after excluding outlier.}
    \label{fig:IC2}
\end{figure}
\end{comment}

\begin{comment}
\surya{I would skip the part from here onwards.}
It is crucial to note, however, that the failed rejection of the null hypothesis does not completely eliminate the possibility of user heterogeneity. Consider for instance, a mixture of four kinds of users, one that always selects the first option, one that always selects the second, and so on. If they all represent 1/4 of the population each, then the resulting distribution for all questions will be uniform: 1/4 for each option. If all questions follow an uniform distribution, then $I_j$ will be constant for all users $j$, equalling the maximum entropy  $100 \log(4)$ for 100 questions. Consider the following simulating procedure:

\begin{align*}
\pi_j(Q_i) &\sim \text{SymDir}(\alpha_j) \\
R_j(Q_i) &\sim \text{Cat}(\pi_j(Q_i))
\end{align*}

The extreme example corresponds to the limit $\alpha \rightarrow^+ 0$, where all $\alpha_j$'s equal $\alpha$. In the case where $\alpha_j$'s are different, simulated data shows that our test indeed has power to reject the null hypothesis. we generated data with $\alpha_1 = 0.1$ and $\alpha_2 = 1.1$ and obtained through Algorithm 1 a $p$-value of 0.000125.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\linewidth]{../images/ic_stat_sim.png}
    \caption{$p$-value calculations for simulated data, with $\alpha_1 = 0.1$ and $\alpha_2 = 1.1$, a 1/2 split between the two mixtures, 20 total questions and 100 total participants.}
    \label{fig:IC2}
\end{figure}
\end{comment}


\section{Summary and Future Work}
\label{sec:summary}



In this paper, we argue that it is important to test for the validity of the independence of irrelevant alternatives (IIA) property in similarity choice data. We also discuss that existing tests are not suitable for this purpose.
We propose two methods for this task, (1) the aggregation of goodness-of-fit statistics and (2) the application of posterior predictive checks (PPC) to a hierarchical Bayes model. Both tests give us a single $p$-value indicating the prevalence of IIA violations across the entire dataset. Moreover, an extension of the Bayesian model (the additive perturbation model) allows us to measure the strength of the IIA violations over the full dataset (see Figure \ref{fig:additive_posterior_handcrafted} and the accompanying discussion). Finally, we demonstrate the flexibility of the Bayesian model by using it to develop a test for population homogeneity. 

We apply these methods to similarity choice data that we collect through online anonymized surveys. The main findings of this paper indicate that IIA violations on similarity choice data are prevalent even under randomly generated questions. Indeed, this effect is as prominent in randomly generated questions as it is in handcrafted questions designed specifically to induce context effects. Further experiments confirm that population heterogeneity is not a factor causing these violations. Thus, our work provides convincing evidence that similarity choice data exhibits context effects. It motivates the development of richer choice models that can incorporate such effects,  perhaps by modelling known cognitive phenomena. In addition, this work also highlights the potential pitfalls of collecting similarity choice data with large choice-sets and breaking them down into triplets, as commonly done with the artist similarity dataset \citep{ellis2002quest} and the food similarity dataset \citep{wilber2014cost}.

%  With IIA violations, breaking choice-sets can lead to poor performance in downstream tasks. 


\begin{comment}
\begin{itemize}
    \item Studying context effects in similarity choice is nontrivial because different targets leads to a different problem altogether
    \item PPC is a great tool in this context
    \item We show IIA is violated, not only in handcrafted examples like Tversky, but also in random examples
    \item This shows that it may be possible to learn them through a model like Tversky's
    \item This could have important ramifications in search and recommender engines that learn from comparison data.
\end{itemize}
\end{comment}

\begin{acknowledgements} 
This work received financial support through research grants from CNPq
(402689/2019-4 and 310742/2023-4) and FAPERJ (E-26/200.483/2023), and from the Swiss National Science Foundation (SNSF) under grant IZBRZ2-186313.
%Briefly acknowledge people and organizations here. \emph{All} acknowledgements go in this section.
\end{acknowledgements}

\bibliography{uai2025}

\newpage 

\onecolumn

\title{Measuring IIA Violations in Similarity Choices with Bayesian Models\\(Supplementary Material)}
\maketitle

% This Supplementary Material should be submitted together with the main paper.

\appendix

\section{IIA implies a model with at most $|T|-1$ parameters}
\label{app:IIA}

The IIA assumption implies that $\pi_Q$, for all questions $Q$ having the same target can be fully specified with at most $|T|-1$ parameters, one per item in the set $T$ excluding the target. To see this is true, let $Q^*$ be a question with a choice-set that has all items except its target $t_{Q^*} = \ell$, thus $C_{Q^*} = T \setminus \{ \ell \}$. For any question $Q$ with the same target $\ell$, we have that 
$$ \frac{\pi_{Qk}}{\pi_{Q^*k}} = \frac{\pi_{Qk'}}{\pi_{Q^*k'}}, \, \forall k, k' \in C_{Q}. $$
These equalities across all item pairs $k, k' \in C_{Q}$ imply that the response probabilities in $Q^*$ provide the response probabilities for questions $Q$ as follows
$$ \pi_{Qk} = \frac{\pi_{Q^*k}}{\sum_{l\in C_Q} \pi_{Q^*l}}. $$

Thus, assuming that IIA holds, it is sufficient to specify a similarity score $s_k$ for every item $k \in T \setminus \{\ell\}$ to a fixed target $\ell$, independent of the question $Q$. For instance, one can take $s_k = \log \pi_{Q^*k}$. Therefore, without loss of generality, the response probability can be represented by the following model parametrized by the similarity vector $\mathbf{s}$:
\begin{equation}
\pi_{Qk}(\mathbf{s}) = \frac{e^{s_k}}{\sum_{k' \in C_Q} e^{s_{k'}}} \; , 
\label{eq:pi_s2}
\end{equation}
specifying that the probability of choosing item $k$ from choice-set $C_Q$ is proportional to $e^{s_k}$. 

\section{Violating IIA with population heterogeneity}
\label{app:heterogeneity}

In order to illustrate how a mixture of two populations can violate IIA, consider a question set with two questions $Q_1$ and $Q_2$ with the following choice-sets: $C_{Q_1} = \{a, b, c\}$, and $C_{Q_2} = \{a, b, d\}$. Consider two participant populations $p_1$ and $p_2$, each of them homogeneous but with different preferences, as shown in Table~\ref{tab:heterogeneous}. 

\begin{table}
        \centering
        \begin{tabular}{c|c|c|c}
             & $p_1$ & $p_2$ & mixt. $p_1$ $+$ $p_2$\\
             \hline 
             $(a, b, c)$ & 0.4, 0.6, 0  & 0.09, 0.01, 0.9 & 0.25, 0.3, 0.45 \\
             $(a, b, d)$ & 0.2, 0.3, 0.5 & 0.9,  0.1,  0  & 0.55, 0.2, 0.25 \\
        \end{tabular}
        \caption{IIA violation example, due just to population heterogeneity.}
        \label{tab:heterogeneous}
\end{table}
Note that for both $p_1$ and $p_2$ the odds between items $a$ and $b$ are invariant in the two questions (2:3 and 9:1, respectively). Thus, each population conforms to IIA. However, under an equal mixture of $p_1$ and $p_2$ (i.e., 50\% each), the response probabilities for each question will change, as shown in Table~\ref{tab:heterogeneous}. In the mixed population, the ratios between items $a$ and $b$ in the two questions are no longer equal, violating IIA. 

\section{Numerical methods}
\label{app:numerical_methods}

We used PyMC~\citep{AbrilPla2023} to implement our Bayesian models and estimate the model posteriors by MCMC sampling, using the NUTS algorithm~\citep{NUTS}. 

For executing the goodness of fit test with the $\chi^2$ statistic, we obtained the MLE executing a simple gradient descent algorithm. We used a learning rate of 0.005, and a stopping criterion of a less than $10^{-4}$ improvement in the log-likelihood of the parameters. If for a given question set $\overline{Q}$ with the same target $t$, some item $k$ was never selected, i.e. $a_{Qk} = 0, \forall Q \in \overline{Q}$, then we excluded item $k$ as an option from the data. That way, we have a bounded optimization problem without the need for regularization, which was not used. 

\section{Graphical model diagrams}\label{sec:graphical_models}

First, we show in Figure~\ref{fig:graph_iia} the plate notation for the Bayesian BTL model. The standard deviation $\sigma$ of similarity scores $s$ is sampled from a half-normal distribution with parameter $\alpha_\sigma$. For each target $t_i, i=1, \ldots, m$, we sample $s_{ik}$ from a normal distribution with standard deviation $\sigma$. For a given question $Q$ with target $t_i$, the number of participants that selected option $k$ will be sampled from a multinomial distribution parametrized by the softmax of all similarities $s_{ik}$ with $k \in C_Q$.

\begin{figure}[h]
    \centering
    \begin{tikzpicture}

        % Nodes
        \node[latent] (sigma) {$\sigma$};
        \node[latent, below=1.4cm of sigma] (s) {$s_{ik}$};
        \node[latent, right=2.3cm of s] (a) {$a_Q$};
        \node[const, above=1.2cm of sigma] (asigma) {$\alpha_\sigma$};

        \factor[above=0.6cm of sigma]     {sigma-f}     {right:HalfNormal} {} {} ; %
        \factor[above=0.8cm of s]     {s-f}     {right:Normal} {} {} ; %
        \factor[left=1.2cm of a]     {a-f}     {above:Multinomial} {} {} ; %

        % Edges
        \factoredge {asigma} {sigma-f} {sigma} ; %
        \factoredge {sigma} {s-f} {s};
        \edge {s} {a};

        % Plates
        \plate {plateK} {(s)} {$k \in T \setminus \{t_i\}$} ;
        \plate {plateQ} {(a)} {$Q \in \overline{Q}_i$};
        \plate[inner sep=.1cm] {plateI} {(plateQ) (plateK)} {$i=1,\dots,m$};

    \end{tikzpicture}
    \caption{Graphical model representation for the IIA model}
    \label{fig:graph_iia}
\end{figure}

\begin{figure}[h]
    \centering
    \begin{tikzpicture}

        % Nodes
        \node[latent] (sigma) {$\sigma$};
        \node[latent, right=3.5 cm of sigma] (sigma_p) {$\sigma_p$};
        \node[latent, below=1.4cm of sigma] (s) {$s_{ik}$};
        \node[latent, below=1.4cm of sigma_p] (p) {$\varepsilon_{Qk}$};
        \node[latent, below=1.5cm of a-f] (a) {$a_Q$};
        \node[const, above=1.2cm of sigma] (asigma) {$\alpha_\sigma$};
        \node[const, above=1.2cm of sigma_p] (bsigma) {$\beta_\sigma$};

        \factor[above=0.6cm of sigma]     {sigma-f}     {right:HalfNormal} {} {} ; %
        \factor[above=0.8cm of s]     {s-f}     {right:Normal} {} {} ; %
        \factor[above=0.6cm of sigma_p]     {sigmap-f}     {right:HalfNormal} {} {} ; %
        \factor[above=0.8cm of p]     {sp-f}     {right:Normal} {} {} ; %
        \factor[right=0.88cm of s]     {a-f}     {above:Multinomial} {} {} ; %

        % Edges
        \factoredge {asigma} {sigma-f} {sigma} ; %
        \factoredge {sigma} {s-f} {s};
        \factoredge {bsigma} {sigmap-f} {sigma_p} ; %
        \factoredge {sigma_p} {sp-f} {p};
        \factoredge {s} {a-f} {a};
        \factoredge {p} {a-f} {a};

        % Plates
        \plate {plateK} {(s)} {$k \in T \setminus \{t_i\}$} ;
        \plate {plateP} {(p)} {$Q \in \overline{Q}_i, k \in C_Q$}
        \plate[inner sep=.1cm] {plateQ} {(a)} {$Q \in \overline{Q}_i$};
        \plate[inner sep=.1cm] {plateI} {(plateQ) (plateK) (plateP)} {$i=1,\dots,m$};

    \end{tikzpicture}
    \caption{Graphical model representation for the additive perturbation model}
    \label{fig:graph_add_pert}
\end{figure}

In Figure~\ref{fig:graph_add_pert}, we show the plate notation for the additive perturbation model. In addition to $\sigma$ and the subsequent $s_{ik}$'s, we also have per question/item perturbation terms $\varepsilon$. Similar to $s_{ik}$, the noises $\varepsilon_{Qk}$ are sampled from a normal distribution, whose standard deviation $\sigma_p$ is sampled from a half-normal hyper-prior controlled by $\beta_\sigma$. For a given target $t_i$, all questions containing a certain item $k$ will attribute to it the same similarity score $s_{ik}$, but every question $Q \in \overline{Q}_i$ will have a distinct perturbation term $\varepsilon_{Qk}$ added to that similarity. The perturbed similarities will then be put through a softmax to determine the parameters of the multinomial distribution generating outcomes $a_Q$.


\section{Model Posteriors}

\subsection{Additive perturbation model applied to simulated datasets}
\label{app:additive_sim}

We fitted the additive perturbation model to IIA-compliant simulated data with ground truth $\sigma=2$. We set the $\sigma$ hyper-prior parameter at $\alpha_\sigma = 1.5$ and the $\sigma_p$ hyper-prior parameter at $\beta_\sigma = 1$. The posterior distribution was estimated by executing the NUTS algorithm with 4 chains and 40000 samples each (burn-in of 20000). By applying PPC, we obtained a $p$-value of 0.5001, thus implying the model to be a good fit to the data. Moreover, the posterior $\sigma_p$ average was 0.058, while the ground-truth of $\sigma$ was recovered. This simulation shows that the additive perturbation model is well-behaved and identifies the lack of IIA violations. See Figure~\ref{fig:additive_posterior_iia}. 

\begin{figure}[h]
    \centering
    \includegraphics[width=0.9\linewidth]{posteriors_additive_model_iia_sim.png}
    \caption{Posterior distributions for $\sigma$ and $\sigma_p$ after fitting the additive perturbation model to the IIA compliant simulated data.}
    \label{fig:additive_posterior_iia}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[width=0.9\linewidth]{posteriors_additive_model_add_sim.png}
    \caption{Posterior distributions for $\sigma$ and $\sigma_p$ after fitting the additive perturbation model to the additive perturbation simulated data.}
    \label{fig:additive_posterior_add_sim}
\end{figure}


We fitted the additive perturbation model to IIA-violating (from additive perturbation) simulated data, with ground truths $\sigma = 2$ and $\sigma_p = 0.2$. Again, we set the $\sigma$ hyper-prior parameter at $\alpha_\sigma = 1.5$ and the $\sigma_p$ hyper-prior parameter at $\beta_\sigma = 1$. The posterior distribution was estimated by executing the NUTS algorithm with 4 chains and 20000 samples each (burn-in of 10000). Through PPC we obtained a $p$-value of 0.6, thus implying the model to be a good fit to the data, unsurprisingly. Moreover, the posterior averages of $\sigma$ and $\sigma_p$ matched the ground truths. See Figure~\ref{fig:additive_posterior_add_sim}.

We also fitted the additive perturbation model to data generated with the multiplicative perturbation model (defined in Appendix~\ref{app:multiplicative}) and found that when $\sigma_m$ is high, we get an estimated positive $\sigma_p$. More precisely, we generated 100 questions, with 30 responses each, following the logic described in Section~\ref{sec:synthetic}. With $\sigma_m = 0.1$, the posterior distribution of $\sigma_p$ had a 2.5th percentile of 0.027 (averaged over 10 runs), which is close to 0; however, when we increased $\sigma_m$ to 0.2 and 0.3, we obtained 2.5th percentiles of 0.08 and 0.24, respectively, which are more distant from 0. In all cases,  $\sigma=2$ was used.


\section{Multiplicative perturbation model}
\label{app:multiplicative}

{\bf Multiplicative perturbation to IIA.} Similar to the additive model, this model also adds perturbations to the original similarity scores. However, it does so using a single noise parameter per question in a multiplicative fashion. Thus, it is a simpler alternative model to induce IIA violations. Let $\varepsilon^{i} \sim \mathcal{N}(1, \sigma_m)$ be a normally distributed and independent random variable for every $i=1,\ldots,4$. Given $\varepsilon^{i}$, the following Bayesian choice model is considered:
\begin{align}
    \pi_{Q^ik}(\mathbf{s}) = \frac{e^{s_k \varepsilon^{i}}}{\sum_{k' \in C_{Q^i}} e^{s_{k'} \varepsilon^{i}}}, \; k \in C_{Q^i}, \; i = 1,\ldots,4 
    \label{eq:multiplicative}
\end{align}
Note that $Q^0$ is not perturbed. Moreover, if $\sigma_m = 0$, the multiplicative perturbation variable becomes one and the IIA compliant model is recovered; note that a large and positive $\varepsilon^{i}$ will magnify the similarity score differences, and thus violate IIA, but will preserve the preference ordering among the choice-set; a negative $\varepsilon^{i}$ will invert the ordering. Thus, $\sigma_m$ is a knob that controls how strongly the multiplicative perturbation model induces IIA violations. Last, note that the two perturbation models (additive and multiplicative) are relatively different in their mechanism to induce IIA violations.

\begin{figure}[h]
\centering
\includegraphics[width=0.55\linewidth]{multiplicative_lineplot.png}
\caption{$p$-values obtained by the statistical tests for IIA violations as a function of $\sigma_p$ for the multiplicative perturbation model.}
\label{fig:test_multiplicative}
\end{figure}

Figure~\ref{fig:test_multiplicative} shows the $p$-values for the multiplicative perturbation model as a function of $\sigma_p$, for both the minimum and aggregate $p$-values. Again, note that as $\sigma_p$ increases the $p$-values decrease, eventually crossing the significance threshold. Interestingly, for both minimum and aggregate cases, a smaller value for $\sigma_p$ is required to cross the significance threshold, in comparison to the additive model (see Fig.~\ref{fig:test_additive}). This suggests that the multiplicative model introduces stronger violations of IIA for the same $\sigma_p$ (although the two models are not directly comparable). Again, both GFT and PPC behave relatively similarly in both cases.

\subsection{Multiplicative perturbation model applied to randomized dataset}

We fitted the multiplicative perturbation model to the randomized survey dataset, and obtained a $p$-value of 0.066 with PPC, failing to reject the model. The $p$-value is, however, low enough for us to infer that the multiplicative model is unlikely to explain the range of context effects in the data. Figure~\ref{fig:multiplicative_posterior_random} shows the posterior distribution for both $\sigma$ and $\sigma_p$. Note that their mean values are $1.6$ and $0.16$, respectively, indicating that $\sigma_p$ contributes to explaining the dataset, as is the case for the additive perturbation model.

\begin{figure}[h]
    \centering
    \includegraphics[width=0.9\linewidth]{posteriors_multiplicative_model_random_survey.png}
    \caption{Posterior distributions for $\sigma$ and $\sigma_p$ after fitting the multiplicative perturbation model to the randomized dataset.}
    \label{fig:multiplicative_posterior_random}
\end{figure}


\begin{comment}
\subsection{Multiplicative perturbation model applied to handcrafted dataset}

\begin{figure}
    \centering
    \includegraphics[width=1\linewidth, draft]{posteriors_multiplicative_model_random_survey.png}
    \caption{Posterior distributions for $\sigma$ and $\sigma_p$ after fitting the handcrafted survey data.}
    \label{fig:additive_posterior_random}
\end{figure}
\end{comment}

\section{The survey website}\label{sec:survey_website}


\begin{figure}[H]
    \centering
    \includegraphics[width=0.5\linewidth]{survey_screenshot.png}
    \caption{Screenshot of a typical survey question asked to participants on Prolific.}
    \label{fig:survey_screenshot}
\end{figure}

\section{Question pairs in the handcrafted dataset}\label{sec:survey}
The following figures, combined with Figure~\ref{fig:question_0058}, display the question pairs (and the response statistics) that were asked in the handcrafted dataset.
\begin{figure}
    \centering
    \includegraphics[width=0.65\linewidth]{handcrated_example_0042.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0042}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0055.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0055}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0058.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0058dup}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0149.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0149}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0244.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0244}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0289.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0289}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0305.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0305}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0329.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0329}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0331.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0331}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0336.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0336}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0346.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0346}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0353.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0353}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0364.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0364}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0389.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0389}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0523.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0523}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0525.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0525}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0542.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0542}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0589.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0589}
\end{figure}

\begin{figure}
    \centering
    \includegraphics[width=0.8\linewidth]{handcrated_example_0865.png}
    \caption{Example of a question pair from the survey. Vertical red lines in the plots indicate the number of participants selecting each item; blue curves show the distribution of the (posterior) predicted counts.}
    \label{fig:question_0865}
\end{figure}

\end{document}
