\documentclass[accepted]{uai2025} % after acceptance, 

\usepackage[american]{babel}

\usepackage{mathtools}
\usepackage{nicefrac}
\usepackage[ruled,vlined]{algorithm2e}
\SetKwInput{KwInit}{Init}
% \usepackage{algpseudocode}
%% Some suggested packages, as needed:
\usepackage{natbib} 
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amsthm,amssymb,amsmath,amsfonts} 
% \usepackage{subfig}
% \usepackage{subcaption}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}

\theoremstyle{definition} % Non-italic text
\newtheorem{definition}{Definition}
\newtheorem{example}{Example}
\theoremstyle{remark} % For remarks
\newtheorem{remark}{Remark}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{graphicx}

\usepackage{lipsum}
\usepackage{wrapfig}
\usepackage{subfigure}
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Privacy-Preserving Neural Processes for Probabilistic User Modeling}

\author[1,2]{Amir Sonee
\thanks{These authors contributed equally.}
\thanks{Currently with Imperial College London but the work was done at University of Manchester.}
}
\author[1]{Haripriya Harikumar\protect\footnotemark[1]}
\author[3]{Alex Hämäläinen}
\author[3]{Lukas Prediger}
\author[1,3,4]{Samuel Kaski}

\affil[1]{%
    Department of Computer Science\\
    University of Manchester\\
    Manchester, UK
}
\affil[2]{%
    Department of Computing\\
    Imperial College London\\
    London, UK
}
\affil[3]{%
    Department of Computer Science\\
    Aalto University, Espoo\\
    Finland
}
\affil[4]{
    ELLIS Institute Finland, Helsinki, Finland
}


\begin{document}
\maketitle

\begin{abstract}
Uncertainty-aware user modeling is crucial for designing AI systems that adapt to users in real-time while addressing privacy concerns. This paper proposes a novel framework for privacy-preserving probabilistic user modeling that integrates uncertainty quantification and differential privacy (DP). Building on neural processes (NPs), a scalable latent variable probabilistic model, we enable meta-learning for user behaviour prediction under privacy constraints. By employing differentially private stochastic gradient descent (DP-SGD), our method achieves rigorous privacy guarantees while preserving predictive accuracy. Unlike prior work, which primarily addresses privacy-preserving learning for convex or smooth functions, we establish theoretical guarantees for non-convex objectives, focusing on the utility-privacy trade-offs inherent in uncertainty-aware models. Through extensive experiments, we demonstrate that our approach achieves competitive accuracy under stringent privacy budgets. Our results showcase the potential of privacy-preserving probabilistic user models to enable trustworthy AI systems in real-world interactive applications.
\end{abstract}

\section{Introduction}
Understanding and modeling user behaviour \citep{user-modeling-recommender20, user-modelling-recomm} is essential for designing adaptive AI systems in real-world interactive scenarios. Many recent AI-assistant applications, e.g. \citep{de2024preference,moon2023amortized,oulasvirta2022computational}, are based on employing parametric user simulators to reason about humans via Bayesian inference. Using such simulators allows data-efficient modeling, but their computational cost has typically prevented their practical application. Fortunately, more recent works \citep{diff_user_models_UAI,moon2023amortized} have proposed integrating amortized inference to mitigate this issue, enabling real-time user modeling via precomputations. 

Despite the proven effectiveness of these user modeling approaches, their lack of formal privacy guarantees limits their deployment in sensitive environments. Existing approaches to privacy-preserving user modeling primarily rely on Federated Learning (FL) \citep{mcmahan2017communication,liu2024cross,liu2023federated} or meta-learning \citep{MAML17}. FL enables distributed centralised training while preserving data locality. However, the FL setup, as shown in Fig.~\ref{fig:fl-dpdum} (left), imposes significant communication and infrastructure overhead, and often results in global models that underperform on user-specific tasks, particularly in heterogeneous environments with high variability in user behaviour.
Differential Privacy meta-learning \citep{DP-GBML20, tasklevel-DP22}, on the other hand, protects task-specific updates in a federated setting but has not been adapted to uncertainty-aware user models.
\begin{figure*}
    \centering
    \subfigure{%
        \includegraphics[width=0.28\textwidth]{fig/FL.png}%
    }
    \hfill
    \subfigure{%
        \includegraphics[width=0.28\textwidth]{fig/DPUM.png}%
    }
    \hfill
    \subfigure{%
        \includegraphics[width=0.37\textwidth]{fig/fine_level_view.png}%
    }
    \caption{Comparison of Federated Learning (FL)-based user modeling (left) and our proposed  Privacy-preserved Probabilistic User Modeling (middle). In FL, a central server aggregates model updates from multiple environments, requiring continuous communication and centralized coordination. In contrast, our framework enables privacy-preserving probabilistic user modeling, where environments can request and share differentially private user models without direct data exchange (a fine-level view of the ${\sf Env}$ and $\sf Env^\prime$ is shown on the right).}
    \label{fig:fl-dpdum}
\end{figure*}

To address these challenges, we propose a novel privacy-preserving probabilistic user modeling framework. Our proposed approach leverages Neural Processes for their ability to perform amortized inference, enabling fast, few-shot personalization and uncertainty-aware predictions at the user level. NPs naturally support modeling structured user behavior without requiring centralized orchestration, making them especially well-suited for decentralized privacy-preserving settings. Our framework integrates differentially-private stochastic gradient descent (DP-SGD)~\citep{DP-deep-learning16} with privacy loss distribution (PLD) \citep{Privacy-Accounting-POPETS22} accounting and applies them to Neural Process (NP) \citep{neural-process18} based differentiable user modeling inspired by \citet{diff_user_models_UAI}. Unlike prior privacy-aware meta-learning approaches, our method provides tight privacy guarantees through PLD-based composition accounting, offering more accurate privacy loss estimation than moment-based techniques \citep{DP-deep-learning16} and Rényi DP \citep{Reyni-DP17,Individual-privacy-Reyni21}. Additionally, unlike PATE \citep{PATE18}, which requires large datasets for teacher-student learning, our approach is well-suited for small-data, high-sensitivity environments.

To concretize the impact of our contribution, consider a scenario where AI assistants in multiple healthcare centers personalize treatment plans. 
Direct data sharing is infeasible due to privacy constraints, and while FL could train a global model via encrypted updates, we do not use this centralized FL coordination mechanism. In contrast, our approach leverages meta-learning with neural processes, where each assistant trains a privacy-protected amortized surrogate user model, which can be selectively shared upon request, enabling optional collaboration (not necessarily bi-directional) without direct data exchange. Also, these healthcare centers have the autonomy to decide whether to use the shared surrogate model as it is or further fine-tune it with their local data for enhanced performance. For example, an assistant trained on diabetic patients' data can share its surrogate model with another assistant treating metabolic disorders, enabling effective and voluntary collaboration without exposing sensitive data. An elaborated real-world example is discussed in Section~\ref{problem statement}.

\begin{table*}[]
\centering
\begin{tabular}{@{}lcccccc@{}}
\toprule
\multicolumn{1}{c}{\textbf{\begin{tabular}[c]{@{}c@{}}Method\end{tabular}}} & 
\textbf{\begin{tabular}[c]{@{}c@{}}Model\end{tabular}} & 
\textbf{\begin{tabular}[c]{@{}c@{}} Update Flow\end{tabular}} &  
\textbf{\begin{tabular}[c]{@{}c@{}}Data\\requirement\end{tabular}} &
\textbf{\begin{tabular}[c]{@{}c@{}}Co-ordinator\end{tabular}} & 
\textbf{\begin{tabular}[c]{@{}c@{}}Privacy\end{tabular}} & 
\textbf{\begin{tabular}[c]{@{}c@{}}User-\\adaptability\end{tabular}}
\\ \midrule
FL  & Global & Bi-directional & Large & Central Server & None & None\\ 
Cross-silo FL & Global & Bi-directional &Large & Central Server & record/user-specific &None\\ 
Personalized FL & Local & Bi-directional &Large & Central Server & None &None\\ 
Neural Process (Ours) & Local & Local &Small & None & Environment-specific &High\\ 
\bottomrule
\end{tabular}
\caption{Key differences of our proposed work with existing approaches.}
\end{table*}

The main contributions of this paper are:
\begin{itemize}
\item Privacy-preserving probabilistic user modeling via DP-SGD and PLD-based privacy accounting for neural processes, ensuring strong privacy guarantees with adaptive user representations.
\item New utility and privacy bounds under non-convex optimization, advancing theoretical insights in privacy-aware meta-learning.
\item Scalable and efficient training across diverse user tasks, maintaining high accuracy even under stringent privacy constraints.
\item Empirical validation on cognitively justified user modeling tasks, demonstrating competitive accuracy compared to non-private baselines.
\end{itemize}
Our extensive experiments on diverse user modeling tasks, including grid-world navigation and cognitively justified menu search modeling under various privacy regimes validate our approach, showing its effectiveness in real-world privacy-sensitive applications. 

\section{Related works}
User modeling \citep{fischer2001user,strouse2021collaborating} in AI-driven systems requires both uncertainty quantification and privacy guarantees to ensure adaptive and trustworthy interactions while protecting sensitive data. Traditional approaches rely on centralized aggregation, posing significant privacy risks. Recent research in privacy-preserving learning has explored federated learning (FL) \citep{mcmahan2017communication}, differential privacy (DP) \citep{Algorithmic-Foundations-of-DP-14}, and meta-learning \citep{MAML17, tasklevel-DP22} to address these challenges.

Federated learning \citep{mcmahan2017communication,liu2023federated} allows decentralized learning while keeping data local, but centralized aggregation remains a privacy risk. In \citep{liu2023federated}, a personalized federated learning technique enhances user adaptability by leveraging hierarchical structures for improved task-specific generalization. Cross-silo FL \citep{liu2024cross} deals with record-level personalized differential privacy, enabling flexible privacy-utility trade-offs while preserving decentralized model training. Personalized FL allows tailored user models but still relies on continuous communication, limiting adaptability. Our approach eliminates this centralized aggregation by enabling privacy-preserving surrogate user models that can be shared securely across environments.

Meta-learning \citep{MAML17} enables fast adaptation to new tasks by leveraging prior knowledge. DP meta-learning \citep{DP-GBML20} and clustering-based regularization \citep{tasklevel-DP22} for task adaptation protect task-specific updates in a federated setting but do not consider the uncertainty-awareness needed to model user behaviours probabilistically. We extend this by integrating probabilistic NPs with DP, ensuring flexible, privacy-preserving few-shot adaptation.
Neural Processes \citep{neural-process18} provide a probabilistic approach to adaptive user modeling \citep{diff_user_models_UAI}, but existing work does not account for privacy risks. Our method incorporates DP-SGD \citep{DP-SGD13} with Privacy Loss Distribution \citep{Privacy-Accounting-POPETS22}, enabling scalable, private uncertainty estimation for interactive AI systems.
So, we introduce a novel privacy-preserving framework for adaptive and uncertainty-aware user modeling, facilitating efficient decentralized learning with strong privacy guarantees in complex, non-convex settings. Unlike federated learning, our approach eliminates centralized coordination, enabling AI assistants to securely exchange privacy-protected representations rather than raw model updates.
\section{Preliminaries}
In this section, we introduce the details needed to formalize and understand our proposed method \emph{privacy protected  probabilistic user modeling}. We first examine this problem from the general perspective of building privacy-protected differentiable surrogates for behavioural models. Then, we cast it into differentially private learning of neural processes, and finally discuss the \emph{user-level DP} notion to address the privacy of users' behavioural datasets. 

\subsection{Privacy-Protected Probabilistic User Modeling}\label{problem statement} 

We consider a setting, as shown in Fig.~\ref{fig:fl-dpdum} (middle), consisting of a set of collaborative environments. Fig.~\ref{fig:fl-dpdum} (right) shows a detailed view of two such environments, ${\sf Env}$ (base) and $\sf Env'$ (target), each with an AI-assistant, ${\sf AS}$ and $\sf AS^\prime$ respectively, trying to simulate the behaviour of its set of users in some decision-making tasks. Users are characterised by their internal states $\theta_u$ following the distribution $p(\theta_u)$, $u\in\mathcal{U}$, which govern their behaviour in a specific task described by parameters $\theta_t$, $t\in\mathcal{T}$, following distribution $p(\theta_t)$. A specific user with parameters $\theta_u$ and task parameters $\theta_t$ is described by the parameter $\theta=(\theta_u,\theta_t)\in\Theta$ with distribution $p\left(\theta\right)$. For a given parameter $\theta$, the behavioural policy of a specific user in a task is described by the implicit stochastic process $\pi_\theta$ drawn from the family of probabilistic user models $P(\pi|\theta)$ reflecting uncertainties in the behaviour of users.

Based on this user model, each user in $\sf Env$ who makes a decision about a task, executes a stochastic behavioural policy $\pi_{\theta}$ on the state $\mathbf{s}_{\theta} \in \mathcal{S}$ of the environment to generate an action $\mathbf{a}_{\theta} = \pi_{\theta}(\mathbf{s}_{\theta}) \in \mathcal{A}$. The sequence of states and actions generated over a horizon of length $n_\theta$ constitutes the user's dataset as $\mathcal{D}_{\theta}=\bigcup_{i=1}^{n_\theta}{\left\{\mathbf{d}_{\theta}^i\right\}}$, $\mathbf{d}_{\theta}^i=(\mathbf{s}_{\theta}^i,\mathbf{a}_{\theta}^i)\in\mathcal{S}\times\mathcal{A}$, which is observed by $\sf AS$. The database of all users' trajectories is $\mathcal{D}=\cup_{\theta}\mathcal{D}_{\theta}$. With $\mathcal{D}$, $\sf AS$ learns a surrogate user model, enabling computation of the posterior predictive distribution $Q(\pi(\mathbf{s}^\prime)|\mathbf{s},\mathbf{a})=\int p(\theta|\mathbf{s},\mathbf{a})P(\pi(\mathbf{s}^\prime)|\theta)d\theta$ over behaviour policy $\pi\sim P$. Surrogates enable better assistance by anticipating the behaviour of a new user in a new task based only on few observed steps of a trajectory, the so-called \emph{context set} $\mathcal{D}^{\sf C}_{\theta}=\{\mathbf{d}_{\theta}^i\}_{i=1}^{m_{\theta}}$.
Our goal is now to have $\sf AS$ help $\sf AS^\prime$ learn better surrogates of the user model, from $\sf Env$, in similar or related tasks. This could be done, in principle, by sharing parameters of the surrogate user model trained by $\sf AS$ using database $\mathcal{D}$ from $\sf Env$. However, to prevent extracting users' sensitive information from parameters of the surrogate model \citep{fredrikson2015model,inferenceattack}, $\sf AS$ learns and transfers a privacy-protected variant of the surrogate model parameters to $\sf AS^\prime$ for utilisation toward learning new surrogates on $\sf Env^\prime$.

An illustrative real-world example of this setting is two healthcare centres (environments), each with their own patients, treatment team and AI-assistant. Each treatment team (users) performs the task of giving personalised treatment for specific types of diseases. The treatment team needs to choose actions, including prescription of various types of medication, radiotherapy, etc., by observing the electronic health records (EHR) and health status of patients (corresponding to the states), the centre's medical equipment and also the specialists' knowledge of the specific disease. The AI-assistant of each centre helps their experts design effective treatments for the patients with similar or related diseases. To this end, the two AI-assistants interact to learn from each other's experience by exchanging information on the diagnostic and treatment actions taken by experts without revealing sensitive information. This sensitive information includes the state of the environment, i.e., the health status and records of the patients under treatment by specialists, which also reflect their personal identity and are therefore considered private.

Formally, the objective of $\sf AS$ is to ensure privacy while learning predictive distribution $Q_\omega$, parameterised by $\omega\in\Omega\subset\mathbb{R}^{d}$, defining a distribution over function $\pi_{\theta}(\mathbf{s})$ for target states $\mathbf{s}\in\mathcal{D}_{\theta}^{\sf T}=\{\mathbf{s}_{\theta}^i\}_{i=m_{\theta}+1}^{n_{\theta}}$ given $\mathcal{D}^{\sf C}_{\theta}$ where $\pi_{\theta}\sim P$ and $\pi_{\theta}:\mathcal{S}\rightarrow\mathcal{A}$. The intention is for the learning to adapt well when predicting actions of a new population of users in similar/related tasks and for the target states of $\sf Env^\prime$. Toward this end, $\sf AS$ learns a privacy-protected model parameter $\omega\in\Omega$ that helps $\sf AS^\prime$ learn a low risk $\omega^\prime$ when testing on a new population of users in similar/related tasks. These two goals can be viewed as the training and testing phases of a meta-learning algorithm for few-shot prediction of users' behavioural policies that are performed by $\sf AS$ and $\sf AS^\prime$, respectively. While the former trains and transfers a privacy-protected parameter $\omega$ of surrogate user models from $\sf Env$, the latter receives and adapts it for learning new surrogate user models that can predict on target data from $\sf Env^\prime$.

We cast the problem of privacy-protected user modeling into privacy-preserving meta-learning for few-shot stochastic regression of users' behavioural policies. Toward this end, we consider differentially-private learning of neural process models as a latent variable probabilistic model.
\subsection{Differential Privacy in User Modeling}
\subsubsection{Definition of Differential Privacy}
Formally, DP is defined as follows:
\begin{definition}[$(\varepsilon,\delta)$-DP mechanism]
A probabilistic mechanism $\mathcal{M}:\mathcal{D}\rightarrow\mathcal{O}$ is $(\varepsilon,\delta)$-differentially private if for some $\varepsilon\geq0$ and $\delta\in[0,1]$, any measurable subset $\Bar{\mathcal{O}}\subseteq\mathcal{O}$ and for all neighbouring datasets $\mathcal{D}^\prime,\mathcal{D}^{\prime\prime}\in\mathcal{D}$ differing in just one $\mathcal{D}_\theta$, 
\begin{equation}\label{eq:DP-def}
    {\sf Pr}\left[\mathcal{M}(\mathcal{D}^\prime)\in \bar{\mathcal{O}}\right] \leq e^\varepsilon {\sf Pr}\left[\mathcal{M}(\mathcal{D}^{\prime\prime}) \in \bar{\mathcal{O}} \right] + \delta .
\end{equation}
\end{definition}

\subsubsection{User-Level Differential Privacy}
$\sf AS$ runs a privacy-preserving mechanism $\mathcal{M}$ during training to satisfy \emph{user-level DP}. This matches our threat model (see Sec. \ref{problem statement}) in contrast to record-level DP which would only protect a single trajectory \citep{User-Level-Privacy-NIPS21,User-level-DP-22,DP-model-personalization21}.
The notion of user-level privacy enters via the definition of the neighbouring relationship of the datasets. We consider two datasets $\mathcal{D}^\prime=\{\mathcal{D}^\prime_{\theta_u}\}_{\theta_u}$ and $\mathcal{D}^{\prime\prime}=\{\mathcal{D}^{\prime\prime}_{\theta_u}\}_{\theta_u}$ as neighbouring if for some $\theta_u\in\Theta$, $\mathcal{D}^\prime$ and $\mathcal{D}^{\prime\prime}$ differ only in the dataset related to a specific user, i.e., $\mathcal{D}_{\bar{\theta}_u}^\prime=\mathcal{D}_{\bar{\theta}_u}^{\prime\prime}$, $\forall \bar{\theta}_u\neq\theta_u$. This guarantees that $\sf AS^\prime$ cannot distinguish if any specific user dataset was utilised. The $\varepsilon$ is the bound on the privacy loss of database $\mathcal{D}$ and $\delta$ denotes a small amount of slack in terms of the probability mass difference of regions where this $\varepsilon$ bound may be violated.
\begin{algorithm}
\caption{DP-SGD Training of Neural Process}\label{alg:dp-sgd}
\DontPrintSemicolon
\KwIn{Population of users $\mathcal{U}$ with distribution $p(\theta_u)$ and corresponding task parameter distribution $p(\theta_t)$ constituting the user-task parameters $\theta=(\theta_u,\theta_t)$ with distribution $p(\theta)$}
\KwIn{behaviour generative process $p(\pi|\theta)$}
\KwIn{Step size hyperparameter $\gamma$, Clipping bound $c$, Number of iterations $T$, User sampling rate $q\in(0,1]$, Privacy budget $\delta \ll 1/|\mathcal{U}|$ and $\varepsilon$}
\KwInit{Encoder $\psi$ and decoder $\phi$ for ${\rm h}_\psi$ and ${\rm g}_\phi$}
Compute required privacy noise: $\sigma \gets \texttt{privacy\_oracle}(T, q, \varepsilon, \delta)$\;
\For{$t = 1$ \KwTo $T$}{
    Sample, with probability $q$, a batch $\mathcal{B}_l$ of user-task parameters\;
    \ForEach{$\theta \in \mathcal{B}_l$}{
        Generate a trajectory $\mathcal{D}_\theta$ of length \(n_\theta\)\;
        Split $\mathcal{D}_\theta$ into $\mathcal{D}_\theta^{\sf C}$ (context) and $\mathcal{D}_\theta^{\sf T}$ (target)\;
        Encode $\mathcal{D}_\theta^{\sf C}$ as $\mathbf{w}=\frac{1}{|\mathcal{D}^{\sf C}_\theta|} \sum_{\mathbf{d}_\theta\in\mathcal{D}^{\sf C}_\theta} {\rm h}_\psi(\mathbf{d}_\theta)$\;
        Compute user-specific gradients: $\mathbf{g}_\theta = (\mathbf{g}_{\theta,\psi} ,\mathbf{g}_{\theta,\phi}) = \nabla_{\psi, \phi}{\rm L}_\theta(\psi,\phi)$\;
        Clip gradients: $\bar{\mathbf{g}}_{\theta,\psi} = \min\{1, c/\|\mathbf{g}_{\theta,\psi}\|\} \mathbf{g}_{\theta,\psi}$\;
        Perturb gradient: $\tilde{\mathbf{g}}_{\theta,\psi} = \bar{\mathbf{g}}_{\theta,\psi} + \mathcal{N}(0, c^2 \sigma^2 \mathbb{I})$\;
    
    }
    Average gradients: $\tilde{\mathbf{g}}_{\psi} = \frac{1}{|\mathcal{B}_l|} \sum_{\theta \in \mathcal{B}_l} \tilde{\mathbf{g}}_{\theta,\psi}$,
     $\mathbf{g}_{\phi} = \frac{1}{|\mathcal{B}_l|} \sum_{\theta \in \mathcal{B}_l} \mathbf{g}_{\theta,\phi}$\;
    Update parameters: $\psi\gets \psi - \gamma \tilde{\mathbf{g}}_{\psi}$, $\phi\gets \phi - \gamma \mathbf{g}_{\phi}$\;
}
\end{algorithm}
\subsection{Differentially Private Neural Process}\label{sec:DP-NP}
Neural Processes are computationally-efficient models, combining Gaussian processes and neural networks, which we use to approximate parameterised surrogates that compute an amortised version of the predictive posterior distribution as $Q_\omega\left(\pi_\theta(\mathcal{D}_\theta^{\sf T})|\mathcal{D}_\theta^{\sf C},\mathcal{D}_\theta^{\sf T}\right)=\int q_\psi(\mathbf{z}|\mathcal{D}_\theta^{\sf C})p_\phi(\pi_\theta(\mathcal{D}_\theta^{\sf T})|\mathcal{D}_\theta^{\sf T},\mathbf{z})d\mathbf{z}$, \(\omega=(\psi,\phi)\), of the user behaviour $\pi_\theta$ at target unseen states. This is performed by considering a joint Gaussian distribution over the values of the policy $\pi_\theta$ at target states of interest, given the context dataset, followed by a NN to learn the mean and variance parameters of the surrogate distributions in the following three steps that build the NP model \citep{neural-process18}:
\begin{enumerate}
    \item A parameterised \emph{encoder} ${\rm h}_\psi:\mathcal{S}\times\mathcal{A}\rightarrow\mathcal{R}$, $\mathcal{R}\subseteq\mathbb{R}^e$, embedding samples $(\mathbf{s}_\theta^i,\mathbf{a}_\theta^i)$ of observed context dataset $\mathcal{D}^{\sf C}_\theta$ to a fixed-dimension representation $\mathbf{r}_\theta^i={\rm h}_\psi(\mathbf{s}_\theta^i,\mathbf{a}_\theta^i)$ of user and task parameters $\theta$.
    \item A permutation invariant \emph{aggregator}, ${\rm Agg}$, computing an order-invariant global representation $\mathbf{r}_\theta={\rm Agg}(\{\mathbf{r}^i_\theta\}_i)=\sum_{i=1}^{m_\theta}\mathbf{r}_\theta^i/m_\theta$ of the context dataset. This parameterises generation of a global latent variable $\mathbf{z}\sim\mathcal{N}(\mu(\mathbf{r}_\theta),\sigma(\mathbf{r}_\theta))$.
    \item A parameterised \emph{decoder} ${\rm g}_\phi:\mathcal{S}\times\mathcal{Z}\rightarrow\mathcal{A}$ predicting the actions for states of interest in target set $\mathcal{D}^{\sf T}_\theta$ as $\mathbf{a}={\rm g}_\phi(\mathbf{s},\mathbf{z})$, $\mathbf{s}\in\mathcal{D}_\theta^{\sf T}$. 
\end{enumerate}

The NP model builds a surrogate posterior $p_\psi(\mathbf{z}|(\mathbf{s}_\theta,\mathbf{a}_\theta))$ and likelihood $p_\phi(\mathbf{a}_\theta|\mathbf{s}_\theta,\mathbf{z})$, by implementing the mappings of encoder ${\rm h}_\psi$ and decoder ${\rm g}_\phi$ with optimisable parameters $\psi$ and $\phi$. The goal is to optimise the loss function ${\rm L}(\psi,\phi) ={\sf E}_{\theta\sim p(\theta),\pi_\theta\sim p(\pi|\theta)} {\rm L}_{\theta}\left( \psi,\phi\right)$ with respect to parameters $\psi$ and $\phi$ for generalisation of the surrogates over the user/task population.
${\rm L}_{\theta}(\psi,\phi)={\sf E}_{(\mathbf{s}_\theta,\mathbf{a}_\theta)\sim \pi_\theta}[\mathcal{L}(\pi_\theta(\mathbf{a}_\theta|\mathbf{s}_\theta),Q_{\omega}(\mathbf{a}_\theta|\mathbf{s}_\theta,\mathcal{D}_\theta^{\sf C}))]$ is the loss function for specific user-task parameters, 
and $\mathcal{L}$ is the loss between simulated and ground-truth behaviours of the users formulated in terms of the evidence lower bound (ELBO) as
\begin{align}\label{eq:ELBO}
  &\mathcal{L}(\pi_\theta(\mathbf{a}_\theta|\mathbf{s}_\theta),Q_{\omega}(\mathbf{a}_\theta|\mathbf{s}_\theta,\mathcal{D}_\theta^{\sf C}))\\
  &\!=\!{\sf E}_{q_\psi(\mathbf{z}|\mathcal{D}_\theta)}\!\!\left[\sum_{i=m_\theta+1}^{n_\theta}\!\!\!\log( p_\phi(\mathbf{a}_\theta^i|\mathbf{s}^i_\theta,\mathbf{z}))\!+\!\log\!\left(\frac{q_\psi(\mathbf{z}|\mathcal{D}_\theta^{\sf C})}{q_\psi(\mathbf{z}|\mathcal{D}_\theta)}\!\right)\!\right]\!\!.\nonumber  
\end{align}
Here, $q_\psi(\mathbf{z}|\cdot)$ is a variational posterior of the latent variable $\mathbf{z}$ given the respective dataset. 

$\sf AS$ performs differentially-private optimisation for this problem to protect privacy of the users' datasets $\mathcal{D}$ while transferring the optimised parameters $\psi$ and $\phi$ to $\sf AS^\prime$ for meta-testing. These parameters are utilised by $\sf AS^\prime$ as an initialisation that performs well when running the few-shot optimisation process on new user/task populations while not leaking sensitive information about the users' data to $\sf AS^\prime$.   
\section{Method and Non-convex Theoretical Guarantees}
In this section, we propose an algorithm to address the optimization process of the privacy-protected user modeling problem based on training a NP model \citep{neural-process18} using DP-SGD \citep{DP-deep-learning16}. DP-SGD provides a scalable and computationally-efficient differentially-private method to accomplish stochastic gradient descent for non-convex optimisation and is generally applicable with privacy properties that are well understood. This proposes building privacy-protected surrogates while being differentiable and generalisable over user/task population. This will extend the non-private solution for user modeling problem discussed by \citet{diff_user_models_UAI}. 

Our main contribution is to apply this method in a new application, with provable statistical guarantees, for DP few-shot stochastic prediction toward probabilistic user modeling. We present results on utility and privacy guarantees, as well as their trade-off, of the proposed algorithm as a solution for DP meta-learning of the (non-convex) ELBO loss function. For this, we make use of an optimality result of the ELBO given by \citet{ELBO-VAE2023}.

\subsection{DP-SGD training of neural processes} \label{dp-sgd-np}
For training the NP, a set of policies $\{\pi_\theta\}_\theta$ are sampled from distribution \(P\), and are executed by the simulator at the base environment to generate pairs of states and actions $\mathcal{D}_\theta=\{(\mathbf{s}_\theta^i,\mathbf{a}_\theta^i)\}_i$ with $\mathbf{a}_\theta^i=\pi_\theta(\mathbf{s}_\theta^i)$. During training, this dataset is split into a context dataset $\mathcal{D}_\theta^{\sf C}=\{(\mathbf{s}_\theta^i,\mathbf{a}_\theta^i)\}_{i=1}^{m_\theta}$ and target dataset $\mathcal{D}_\theta^{\sf T}=\{(\mathbf{s}_\theta^i,\mathbf{a}_\theta^i)\}_{i=m_\theta+1}^{n_\theta}$ that are fed into the encoder and decoder of the NP as described in Subsection~\ref{sec:DP-NP}. We utilise the DP-SGD algorithm for training this NP as described in Alg.~\ref{alg:dp-sgd}. First, a batch of users is sampled, with fixed batch size and sampling rate of $q$, and then the gradient is computed for each user within the batch using samples of data points. Second, the gradient components \(\mathbf{g}_{\theta,\psi}\), with respect to weights of encoder, are clipped to bound $\ell_2$-norm sensitivity followed by perturbation with a calibrated Gaussian noise vector to guarantee privacy. It should be noted that encoder weights are required to preserve privacy as they are subsequently utilised to generate a latent variable for use at the decoder during testing. Accordingly, the gradient components with respect to the weights of the decoder are not obfuscated. To characterise the noise level $\sigma$ required to achieve the desired level of privacy parameters in Alg.~\ref{alg:dp-sgd}, we resort to a recently established principled way for privacy accounting, based on PLD, over subsequent accesses to the dataset through training iterations.
This privacy accountant enables establishing tight upper bounds on the real privacy loss \citep{Privacy-Accounting-POPETS22} to \emph{efficiently compute} a tight $\delta$ given the privacy loss budget $\varepsilon$ (and the hyperparameters of the composed privacy mechanisms over $T$ training iterations, $\boldsymbol{\Xi}$), i.e., $\delta(\varepsilon; \boldsymbol{\Xi})$. In our case, $\boldsymbol{\Xi} = (T, \sigma,q)$. 

 We can use these techniques to determine the noise level $\sigma$ for a set of desired privacy parameters $(\varepsilon, \delta)$ and known number of iterations $T$.\footnote{By inverting $\delta(\varepsilon; T, \sigma, q)$ for $\sigma$ using any (numerical) root-finding technique.} In Alg.~\ref{alg:dp-sgd}, we abstract away the details of this and simply assume that we have a function $\texttt{privacy\_oracle}(T, q, \varepsilon, \delta)$ that outputs a suitable $\sigma$. We choose $\delta = |\mathcal{U}|^{-2}$ following the common advice that $\delta \ll |\mathcal{U}|^{-1}$  as it would otherwise allow the leakage of arbitrary raw data points through the algorithm \citep{Algorithmic-Foundations-of-DP-14}. Alg.~\ref{alg:dp-sgd} is an instantiation of DP-SGD to train an NP using the PLD accountant. The model weights $\psi$ optimise the loss of generalising the privacy-protected user surrogates over the user population while $\phi$ optimises the loss of generalisation over the task of predicting unseen states.
These weights are transferred to $\sf AS^\prime$ in $\sf Env^\prime$ for meta-testing on a new user-task population, where they are used to predict actions for new target datasets containing states of interest.


\subsection{Utility and Privacy Guarantees}
In this section, we present our results on the utility and privacy guarantees, as well as their trade-off, for the proposed Alg.~\ref{alg:dp-sgd}, which leverages the DP-SGD method \citep{DP-deep-learning16} for training the NP model \citep{neural-process18}.
For the utility guarantee, we derive a new bound on the norm of stochastic gradient vectors, based on the stochastic non-convex optimization analysis of \citet{stochastic-nonconvex-optimization,cpsgd-nips18}, to account for the noise variance of the perturbed gradients with respect to weights $\psi$. This enables us to investigate the convergence of the proposed algorithm for DP-SGD learning of the NP toward building privacy-protected probabilistic user models. We also discuss how the perturbed gradients affect the convergence of the algorithm compared with non-private gradients. 

These results show that we are able to build differentiable user models that can efficiently adapt to similar or related users/tasks while preserving the privacy of users’ datasets for further use in another environment. The impact of this is to facilitate faster adaptation by releasing a privacy-protected experience from one environment to another.

\begin{theorem}[Utility Guarantee]\label{Thrm1}
Suppose the neural network has differentiable non-linearities contributing to the $\lambda$-smoothness of the loss function ${\rm L}$ with bounded gradients $\|\nabla_{(\psi,\phi)}{\rm L}(\psi,\phi)\|\leq c$, $\forall \psi,\phi$, and the step sizes $\gamma_\psi^t=\gamma_\phi^t=\gamma=\min\left\{\frac{1}{\lambda},\frac{\sqrt{2\Delta{\rm L}^\ast}}{c\,\sigma_\psi\sqrt{\lambda d_\psi T}}\right\}$, $t\in[T]$. Then the gradients of the loss function after $T$ iterations of learning can be bounded as
\begin{eqnarray}
    &&\!\!\!\!\!\!\!\!{\sf E}\left[\|\nabla_\psi{\rm L}(\psi^t,\phi^t)\|^2+\|\nabla_\phi{\rm L}(\psi^t,\phi^t)\|^2\right]\nonumber\\
    &&\!\!\!\leq\frac{2\Delta{\rm L}^{\ast}\lambda}{T}+2\sqrt{\frac{2\lambda\Delta{\rm L}^\ast}{T}}c\sqrt{d_\psi} \sigma_{\psi} \label{eq:utility_guarantee}
\end{eqnarray}
where ${\sf E}\left[\|\tilde{\mathbf{g}}_\psi-\nabla_\psi{\rm L}(\psi,\phi)\|^2\right]\leq d_\psi c^2\sigma_\psi^2$ and $d_\psi$ is the dimension of the encoder weight vector.
$\Delta{\rm L}^\ast$ is such that ${\rm L}(\psi^0,\phi^0)-L^\ast\leq\Delta{\rm L}^\ast$ with $L^\ast={\sf E}_{\theta\sim p(\theta)}[L_\theta^\ast]$ being the optimal loss.  $L_\theta^\ast$ is given by the sum of entropy values of the prior and surrogates for the posterior and likelihood as   
\begin{equation}
\!L_\theta^{\!\ast}\!=\!\frac{1}{2n_\theta}\!\sum_{j=1}^{d_z}\!\log\!\left(\!\frac{\varsigma_{j}^2(\mathcal{D}^{\sf C})}{\varsigma_{j}^2(\mathcal{D})}\!\right)\!\!+\!\frac{1}{2n_\theta}\!\!\sum_{i=1}^{n_\theta-m_\theta}\!\sum_{j=1}^{d_\mathcal{A}}\log\!\left( 2\pi e\tau_{j}^2(\mathbf{a},\!\mathbf{s}^i)\!\right)    
\end{equation}

$\left(\varsigma_{1}^2,\ldots,\varsigma_{d_z}^2\right)$ and $\left(\tau_{1}^2,\ldots,\tau_{d_\mathcal{A}}^2\right)$ correspond to the diagonal elements of the encoder and decoder covariance matrices, respectively. {\rm See Appendix \ref{sec:proof-thm1} for the proof}.
\end{theorem}

The utility guarantee implies that the gradient norm (as used for the convergence analysis of non-convex stochastic optimisation \citep{stochastic-nonconvex-optimization}) decreases as the number of training iterations increases, and hence the training method drives the model toward the optimal value. At the same time, increasing the noise variance to enforce stricter privacy slows down the convergence.

Next, we present privacy guarantees based on the \emph{subsampled Gaussian} privacy-preserving mechanism. We leverage recent PLD results for accounting privacy loss over compositions of elementary mechanisms \citep{Privacy-Accounting-POPETS22} to show that, for any $\sigma$ computed by $\texttt{privacy\_oracle}(T, q, \varepsilon, \delta)$, the algorithm is $(\varepsilon,\delta)$-DP. We consider the substitution neighbouring relation and fixed-size batch sampling (with rate $q$) without replacement to derive this result. 

Privacy accountants enable us, first, to find the tightest privacy parameter \(\delta(\varepsilon)\) as a function of the total privacy leakage budget \(\varepsilon\) over \(T\) iterations. Second, this helps adjust the amount of additive artificial noise such that convergent learning is guaranteed without exceeding the given privacy leakage budget \(\varepsilon\) under \(T\)-fold sequential composition of the privacy-preserving mechanisms during continual observation of the mechanism output. It should be noted that, compared to other accountants, PLD-based accounting has been found superior by \citet{Privacy-Accounting-POPETS22}, as it provides tighter bounds on the DP parameters through high-accuracy estimation of the overall privacy parameters after \(T\)-fold sequential composition, as is the case in iterative learning methods \citep{Privacy-classes19}. 

\begin{theorem}[Privacy Guarantee] \label{Thm2}
DP-SGD training of the NP is user-level $(\varepsilon,\delta)$-DP, using the subsampled Gaussian mechanism with noise level $\sigma=\delta^{-1}(\varepsilon,\delta;T,q)$, and fixed batch sampling of rate $q$ at each iteration of the $T$-iteration learning procedure in Alg.~\ref{alg:dp-sgd}. {\rm Proof is provided in Appendix \ref{sec:app-privacy}}.
\end{theorem}

Finally, to find the trade-off between utility and privacy guarantees in this problem, we first provide an analytical upper bound on the variance of the noise that satisfies the privacy budget. To this end, first, we use the known result that subsampling amplifies privacy \citep{What-can-we-learn-privately-08}, to analytically bound $\delta(\varepsilon)$ of the subsampled mechanism with that of the pure Gaussian mechanism \citep{Privacy-classes19},
\begin{eqnarray}\label{eq:ub-delta-subsamp}
    \delta(\varepsilon; T, q, \sigma)\!\!\!\!\!\!&\leq&\!\!\!\!\!\!\frac{1}{2}\!\left[{\sf erfc}\!\left(\!\frac{\varepsilon\sigma\!-\!T/2\sigma}{\sqrt{2T}}\!\right)\!\!-\!e^\varepsilon {\sf erfc}\!\left(\! \frac{\varepsilon\sigma\!+\!T/2\sigma}{\sqrt{2T}} \!\right)\!\right]_{\!+} \nonumber\\
    \!\!\!\!\!&\leq&\!\!\!\!\!\frac{1}{2}{\sf erfc} \left(\frac{\varepsilon\sigma - T/2\sigma}{\sqrt{2T}}\right).
\end{eqnarray}
The above holds for $q \leq \frac{e^{2\varepsilon} - 1}{2e^{2\varepsilon} - 1}$; see Appendix~\ref{sec:app-mechanism-bound} for further details. 
Then, the upper bound on $\sigma$ is obtained by solving \eqref{eq:ub-delta-subsamp} for $\sigma$ as 
\begin{eqnarray}
    \sigma_\psi=\sigma_\phi\!\leq\!\frac{\sqrt{T}}{\sqrt{2}\varepsilon}\!\left[{\sf erfc}^{-1}\left(2\delta\right)\!+\!\sqrt{\varepsilon\!+\!\left({\sf erfc}^{-1}\!\left(2\delta\right)\right)^2}\right]. \label{eq:sigma_upper_bound}
\end{eqnarray}
\begin{figure}  
\centering
\includegraphics[height=0.3\textwidth]{fig/ex1_acc_vs_users_alleps_v1.png}
\caption{Performance of user models trained with different number of users ($500$ to $15,000$) in a non-differentially private setting (black) and differentially private setting with $\varepsilon =1~({\rm red}), 3~({\rm orange}), 5~({\rm green})$, and $10~({\rm blue})$.}
\label{fig:accuracy-users-seen-ex1}
\end{figure}

\begin{figure*}
\centering
\subfigure{\includegraphics[width=0.30\textwidth]{fig/ex1_500_acc_vs_ntraj.png}}
\hfill
\subfigure{\includegraphics[width=0.30\textwidth]{fig/ex1_2000_acc_vs_ntraj.png}}
\hfill
\subfigure{\includegraphics[width=0.30\textwidth]{fig/ex1_5000_acc_vs_ntraj.png}}
\caption{Accuracy in terms of the number of observed behaviours for non-private (black) and private models under privacy budgets $\varepsilon = 1, 3, 5,$ and $10$, for neural process user models trained with $500$ (left), $2000$ (middle), and $5000$ (right) users.}
\label{fig:accuracy-task-numb}
\end{figure*}
Accordingly, we can now state the following result for the trade-off rule by inserting into Eq.~\eqref{eq:utility_guarantee}:
\begin{corollary}[{\bf Trade-off analysis}]\label{Trade-off-result}
Supposing the ELBO function is $\lambda$-smooth and has bounded gradients, then for the step sizes $\gamma_\psi$ and $\gamma_\phi$ as in Thm.~\ref{Thrm1}, the gradients of the loss function can be bounded as
\begin{align}
    &{\sf E}\left[\|\nabla_{(\psi,\phi)}{\rm L}(\psi^t,\phi^t)\|^2\right]\nonumber\\
    &\!\!\leq\!\!\frac{2\Delta{\rm L}^{\ast}\lambda}{T}\!+\!\frac{2c\sqrt{\lambda\Delta{\rm L}^\ast d_\psi}}{\varepsilon}\!\left[{\sf erfc}^{-1}\!\!\left(2\delta\right)\!+\!\!\sqrt{\!\varepsilon\!+\!\left({\sf erfc}^{-1}\!\left(2\delta\right)\right)^{\!2}}\right].
\end{align}
\end{corollary}


\section{Experiments}
In this section, we include three experiments for comparing the accuracy of the DP-protected user model surrogates against their non-DP counterparts under various privacy budgets. We provide analysis on the method's performance in terms of accuracy (not expressed in percentage) for different numbers of new users ($\#$ users) seen as well as the amount of their user behaviour data seen. We use a fixed clipping bound of 
$c=2$ as a hyperparameter in our experiments. 
\subsection{Gridworld Environment}
\begin{figure*}
\centering
\subfigure{\includegraphics[width=0.30\textwidth]{fig/500usersdplots.png}}
\hfill
\subfigure{\includegraphics[width=0.30\textwidth]{fig/5000usersdplots.png}}
\hfill
\subfigure{\includegraphics[width=0.30\textwidth]{fig/10000usersdplots.png}}
\caption{Convergence of the differentially private neural process during training with different numbers of users: 500 (left), 5000 (center), 10,000 (right), under privacy budgets ($\varepsilon({\rm eps})=1~({\rm red}), 3~({\rm orange}), 5~({\rm green}), 10~({\rm blue})$). As the number of users increases, convergence is achieved faster (note the different x-axis scales).}
\label{fig:loss-convergence}
\end{figure*}
The first experiment is the benchmark setting used by \citet{diff_user_models_UAI}, evaluating the method's ability to act as a surrogate for a model describing simulated agents (users) on a $10 \times 10$ gridworld environment. The gridworld environment is defined as a POMDP with deterministic transition dynamics. Here, the behaviour of agents in the agent population is assumed to emerge as a result of Monte Carlo Tree Search (MCTS)-based optimization. Each agent is defined by an individual reward function, observations and MCTS parameters. This results in a wide variety of different behaviours over the full population. We train the method on data generated by sampling individual users, as detailed in Section~\ref{problem statement}.
The environment state captures the current location of the user, which the user can change by moving into one of its adjacent states. The reward function, unique to each user, always assigns one environment state with a positive reward and a negative reward. The initial user location for each episode is chosen at random. Further details are included in Appendix \ref{Exp1-details}.
The modeling task for the AI assistant is to predict the subsequent behaviour of a user, given access to some previous context behaviour of that particular user in different tasks, and the behaviour during the current task. The context observations correspond to the trajectories of state-action pairs performed by the user. Each task is assumed to correspond to an episode in the same environment (sampled independently), but each starting from a different initial state. The number of context trajectories and trajectory length can vary between each agent and task. All other information, such as the reward function and MCTS parameters, are hidden from the assistant.


\subsubsection{On users seen}
 Fig.~\ref{fig:accuracy-users-seen-ex1} shows the performance of the models when trained with different numbers of users. The fewer the users, the larger the accuracy gap between the non-private model and the private models with tighter privacy restrictions. However, it is also clear that, even with little data, the models with looser privacy restrictions perform comparably to the non-private model. As the number of users grows, the gap between the non-private neural process and the private ones with tighter privacy restrictions shrinks significantly. The models trained with $5000$ users achieve comparable performance across the tighter and looser differential privacy budgets. The differentially private models trained with more than $5000$ users behave similarly to the non-private model. This means that one can guarantee privacy while retaining the utility of a non-private neural process. The non-DP neural process shows diminishing benefits from additional training users, as evident from Fig.~\ref{fig:accuracy-users-seen-ex1}. This reduces the risk of transferring user models trained in one environment to another, even under a strict privacy regime.
\subsubsection{On behaviours seen}
Fig. \ref{fig:accuracy-task-numb} shows the generalization of the user models trained with $500$, $2000$ and $5000$ users and prediction of their  behaviours. Unsurprisingly, as the number of observed behaviours per user increases, the accuracy also increases for all privacy budgets and for all the user models.

For smaller $\varepsilon$, the difference from the non-private model is larger. However, this gap tends to decrease as the number of trajectories increases. The performance of the private models also becomes quite similar to that of the non-private models as we increase the number of users used to train the user models. 
It is worth noticing that the amount of data received from an individual user is very small, and training models with data only from individual users results in significantly poorer performance, as implied by the decreasing modeling performance when the number of users decreases.
\begin{figure*}
\centering
\subfigure{\includegraphics[width=0.30\textwidth]{fig/500_meangradnorms_500epochs.png}}
\hfill
\subfigure{\includegraphics[width=0.30\textwidth]{fig/5000_meangradnorms_100epochs.png}}
\hfill
\subfigure{\includegraphics[width=0.30\textwidth]{fig/10000_meangradnorms_40epochs.png}}
    \caption{Norm of the gradient vs.\ training epochs: convergence of the norm of the gradient in the differentially private neural process during training with different numbers of users: 500 (left), 5000 (middle), and 10000 (right), under privacy budgets ($\varepsilon=1$ (red), 3 (orange), 5 (green), 10 (blue)). As the number of users increases, convergence is achieved faster (note the different x-axis scales). The model trained with 500 users converges more slowly; however, convergence can still be achieved with more training epochs.}
    \label{fig:theorem-proof}
\end{figure*}
\subsubsection{On convergence} 
The utility bound of Theorem~\ref{Thrm1} implies that the norm of the gradient (as used for convergence analysis of non-convex stochastic optimization \citep{stochastic-nonconvex-optimization}) decreases as the number of training iterations (training epochs) increases. This is evident from Fig.~\ref{fig:theorem-proof} and Fig.~\ref{fig:loss-convergence}, where both the norm of the gradient and the training loss converge (stabilize) as the number of training epochs increases. The initial increase in the gradient norm is associated with stability issues early in training. However, as training progresses, the norm values gradually converge. It is also important to note that the convergence becomes slower when we target stricter privacy, which is expected in differential privacy. Also, the trend of the gradient norm across epochs aligns with the loss curves, supporting our observation that models with fewer users require more training epochs to converge, while increasing the number of users leads to faster convergence.
\subsection{Menu Search Environment}
In our second experiment, we test our method with simulated users following the Menu Search model of \citet{kangasraasio2017inferring}.
This cognitive model captures the generative process behind human search behaviour, when they are searching for a specific item in a computer dropdown menu. 
\begin{figure} 
\centering
\includegraphics[height=0.3\textwidth]{fig/ex2_acc_vs_users_alleps.png}
\caption{Performance of user models trained with different number of users ($50$ to $1800$) in a non-differentially private setting (black) and differentially private setting with $\varepsilon =1~({\rm red}), 3~({\rm orange}), 5~({\rm green})$, and $10~({\rm blue})$.}
\label{fig:accuracy-users-seen-exp2}
\end{figure}
This model is based on \textit{computational rationality} \citep{gershman2015computational} and models how the user behaviour emerges as optimal behaviour that is constrained by the cognitive limitations of the users. For further details of the menu search environments, see Appendix \ref{Exp2-details} and \ref{Exp3-details} of the supplementary material and \citep{diff_user_models_UAI,kangasraasio2017inferring}.

Similarly as in the first experiment, the tasks consider modeling the behaviours of individual users, here generated by the Menu Search model. In each task, observations come from multiple simulations performed by each user in different menu layouts. We evaluate the method's ability to model the user behaviour in previously unseen menu layouts.

We train user models with $50$, $100$, $150$, $300$, $600$, $900$, $1200$, $1500$, and $1800$ users under strict to modest privacy budgets of $\varepsilon=1, 3, 5, 10$ and a non-DP neural process model for each. The test accuracy obtained when the models are presented with $5000$ new (test) users is shown in Fig.~\ref{fig:accuracy-users-seen-exp2}. In this setup, there is no major gap in accuracy between the non-DP and DP neural processes. It is also important to note that the DP-based user models trained with $50$ users, under both tight and relaxed privacy budgets, even have slightly better test accuracy. 
\subsubsection{Menu Search AI-assistant}
We examine the integration of a differentially private AI assistant into an interactive system within a simulated environment (see Appendix~\ref{Exp3-details} and \citet{diff_user_models_UAI}). Our study enhances a structured search space with hierarchical navigation and adaptive guidance. The assistant models user intent through observed behaviours while balancing assistance and autonomy via interaction constraints. Using data from 300 users, we compare accuracy between non-DP ($0.718$) and DP models with $\varepsilon=1, 3, 5, 10$ ($0.685$, $0.689$, $0.67$, $0.681$, respectively). Results show minimal accuracy differences across privacy settings.

\textbf{Limitations and Challenges:} The reported results use ACNPs \citep{kim2019attentive}, although ANPs achieve the highest accuracy in the experiments of \citet{diff_user_models_UAI}. Our experiments identified instability caused by exploding gradients, even in non-DP training. The introduction of differential privacy further exacerbated this issue, intensifying gradient explosions and significantly degrading model performance. While the choice of the NP design does not affect our conclusions, future work might want to consider recent alternatives such as Transformer Neural Processes \citep{nguyen2022transformer} due to the known stability issues with the NP-ELBO loss.

\section{Conclusion}
We proposed a privacy-preserving probabilistic user modeling framework that integrates neural processes with DP-SGD, enabling differentially private few-shot predictions while maintaining real-time inference. Our approach balances privacy and utility, achieving competitive accuracy under strict privacy constraints across diverse user modeling tasks.
Empirical results showed that as user data increases, the performance gap between private and non-private models diminishes, supporting the feasibility of privacy-aware surrogates. Additionally, we established theoretical guarantees on privacy-utility trade-offs in non-convex optimization.
This work advances privacy-conscious AI assistants for sensitive applications like healthcare and personalization, with potential for further improvements in adaptability and robustness.

\begin{acknowledgements} 
The authors thank Jonathan Taylor from Research IT at the University of Manchester for his support in developing the code for model training and evaluation, and all anonymous reviewers for their constructive feedback. The authors also would like to acknowledge the assistance given by Research IT and the use of the Computational Shared Facility at The University of Manchester. This work was supported by the Research Council of Finland Flagship programme: Finnish Center for Artificial Intelligence FCAI and decisions 358958, 359567. Amir Sonee, Haripriya Harikumar, and Samuel Kaski were supported by the UKRI Turing AI World-Leading Researcher Fellowship, [EP/W002973/1].  
\end{acknowledgements}

\bibliography{uai2025-template}

\clearpage
\appendix

\thispagestyle{empty}
\interdisplaylinepenalty=2500

\newpage

\onecolumn
\begin{center}
    \LARGE \textbf{Supplementary Material}
\end{center}
The proofs of our theorems are provided in Sections~\ref{sec:proof-thm1} through~\ref{sec:app-mechanism-bound}, and the experiment details are presented in Section~\ref{Exp-details}.

\section{Proof of Theorem \ref{Thrm1}}
\label{sec:proof-thm1}

In this section, the convergence of the DP-SGD meta-learning algorithm is discussed for learning the non-convex ELBO loss function to build a DP probabilistic user model. 
To this end, we need to investigate the gradient of the loss function ${\sf E}\left[\nabla{\rm L}(\psi^t,\phi^t)\right]$ or the optimality gap ${\sf E}[{\rm L}\left(\psi^T,\phi^T\right)]-{\rm L}(\psi^\ast,\phi^\ast)$ after $T$ rounds of training iterations. As the loss function is not convex in terms of the parameters $\psi$ and $\phi$, the common analysis of DP-SGD meta-learning for convex functions \citep{tasklevel-DP22,DP-GBML20} cannot be extended to this setting straightforwardly. We provide a convergence analysis based on the DP-SGD method of \citet{cpsgd-nips18} for stochastic non-convex optimization as studied by \citet{stochastic-nonconvex-optimization}. We modify this analysis to consider the optimization in terms of both $\psi$ and $\phi$. 

Considering the ELBO function is $\lambda$-smooth (having Lipschitz continuous gradients), i.e. $\forall\psi,\psi^\prime\in\mathbb{R}^{d_\psi}$ and $\phi,\phi^\prime\in\mathbb{R}^{d_\phi}$ we have $\left\|\nabla{\rm L}(\psi^\prime,\phi^\prime)-\nabla{\rm L}(\psi,\phi)\right\|\leq\lambda\|(\psi^\prime,\phi^\prime)-(\psi,\phi)\|$, the gap of loss function for two consecutive iterations can be formulated as follows considering parameter updates $\psi^{t+1}=\psi^{t}-\gamma_\psi\tilde{\mathbf{g}}_\psi^t$, $\phi^{t+1}=\phi^{t}-\gamma_\phi\mathbf{g}_\phi^t$ and
$\boldsymbol{\xi}_\psi^t\triangleq\nabla_\psi{\rm L}(\psi^t,\phi^t)-\tilde{\mathbf{g}}_\psi^t$, $\boldsymbol{\xi}_\phi^t\triangleq\nabla_\phi{\rm L}(\psi^t,\phi^t)-\mathbf{g}_\phi^t$ at iteration \(t+1\), where \(\nabla_\psi\) and \(\nabla_\phi\) denote the gradients with respect to weights \(\psi\) and \(\phi\), respectively.  
\begin{eqnarray}
    {\rm L}\left(\psi^{t+1},\phi^{t+1}\right)-{\rm L}\left(\psi^t,\phi^t\right)&\stackrel{(a)}\leq&\nabla_\psi{\rm L}^{\sf T}(\psi^t,\phi^t)(\psi^{t+1}-\psi^t)+\nabla_\phi{\rm L}^{\sf T}(\psi^t,\phi^t)(\phi^{t+1}-\phi^t)\nonumber\\
    &&+\frac{\lambda}{2}\|\psi^{t+1}-\psi^t\|^2+\frac{\lambda}{2}\|\phi^{t+1}-\phi^t\|^2\nonumber\\
    &\stackrel{(b)}=&-\gamma_{\psi}\nabla_\psi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\tilde{\mathbf{g}}_\psi^t-\gamma_{\phi}\nabla_\phi{\rm L}^{\sf T}(\psi^t,\phi^t)\mathbf{g}_\phi^t\nonumber\\
    &&+\frac{\lambda}{2}\gamma_\psi^2\|\tilde{\mathbf{g}}_\psi^t\|^2+\frac{\lambda}{2}\gamma_\phi^2\|\mathbf{g}_\phi^t\|^2\nonumber\\
    &\leq&-\gamma_{\psi}\nabla_\psi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\left(\nabla_\psi{\rm L}(\psi^t,\phi^t)-\boldsymbol{\xi}_\psi^t\right)+\frac{\lambda}{2}\gamma_\psi^2\|\nabla_\psi{\rm L}(\psi^t,\phi^t)-\boldsymbol{\xi}_\psi^t\|^2\nonumber\\
    && -\gamma_{\phi}\nabla_\phi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\left(\nabla_\phi{\rm L}(\psi^t,\phi^t)-\boldsymbol{\xi}_\phi^t\right)+\frac{\lambda}{2}\gamma_\phi^2\|\nabla_\phi{\rm L}(\psi^t,\phi^t)-\boldsymbol{\xi}_\phi^t\|^2\nonumber\\
    &=&-\gamma_{\psi}\left(1-\frac{\lambda\gamma_\psi}{2}\right)\|\nabla_\psi{\rm L}(\psi^t,\phi^t)\|^2+\gamma_\psi(1-\lambda\gamma_\psi)\nabla_\psi{\rm L}^{\sf T}(\psi^t,\phi^t)\boldsymbol{\xi}_\psi^t+\frac{\lambda}{2}\gamma_\psi^2\|\boldsymbol{\xi}_\psi^t\|^2\nonumber\\
    &&-\gamma_{\phi}\left(1-\frac{\lambda\gamma_\phi}{2}\right)\|\nabla_\phi{\rm L}(\psi^t,\phi^t)\|^2+\gamma_\phi(1-\lambda\gamma_\phi)\nabla_\phi{\rm L}^{\sf T}(\psi^t,\phi^t)\boldsymbol{\xi}_\phi^t+\frac{\lambda}{2}\gamma_\phi^2\|\boldsymbol{\xi}_\phi^t\|^2
    % \nonumber\\
\end{eqnarray}
where $(a)$ follows by the smoothness condition, $(b)$ follows by the SGD updates of learning rates $\gamma_\psi$ and $\gamma_\phi$. 
% and $(b)$ follows by assuming $\lambda\gamma_\psi\leq1$ and $\lambda\gamma_\phi\leq1$. 
Considering $\gamma=\max\{\gamma_\psi,\gamma_\phi\}\leq1/\lambda$, and taking the summation over the whole iterations, then the sum of the norm of the gradient vectors with respect to both parameters can be bounded as
\begin{eqnarray}
    &&\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\gamma\left( 1-\frac{\lambda\gamma}{2}\right)\sum_{t=1}^T\left(\|\nabla_\psi{\rm L}(\psi^t,\phi^t)\|^2+\|\nabla_\phi{\rm L}(\psi^t,\phi^t)\|^2\right)\nonumber\\
    &&\leq \sum_{t=1}^T\left[{\rm L}\left(\psi^t,\phi^t\right)-{\rm L}\left(\psi^{t+1},\phi^{t+1}\right)+\gamma_\psi(1-\lambda\gamma_\psi)\nabla_\psi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\boldsymbol{\xi}_\psi^t\right.\nonumber\\
    &&\left.\qquad\quad+\gamma_\phi(1-\lambda\gamma_\phi)\nabla_\phi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\boldsymbol{\xi}_\phi^t+\frac{\lambda\gamma^2}{2}\left(\|\boldsymbol{\xi}_\psi^t\|^2+\|\boldsymbol{\xi}_\phi^t\|^2\right)\right]\nonumber\\
    &&=\sum_{t=1}^T\left[\gamma_\psi(1-\lambda\gamma_\psi)\nabla_\psi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\boldsymbol{\xi}_\psi^t+\gamma_\phi(1-\lambda\gamma_\phi)\nabla_\phi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\boldsymbol{\xi}_\phi^t\right.\nonumber\\
    &&\left.\qquad\quad+\frac{\lambda\gamma^2}{2}\left(\|\boldsymbol{\xi}_\psi^t\|^2+\|\boldsymbol{\xi}_\phi^t\|^2\right)\right]+{\rm L}\left(\psi^0,\phi^0\right)-{\rm L}\left(\psi^T,\phi^T\right)\nonumber\\
    &&\leq\sum_{t=1}^T\left[\gamma_\psi(1-\lambda\gamma_\psi)\nabla_\psi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\boldsymbol{\xi}_\psi^t+\gamma_\phi(1-\lambda\gamma_\phi)\nabla_\phi{\rm L}^{\sf T}\left(\psi^t,\phi^t\right)\boldsymbol{\xi}_\phi^t\right.\nonumber\\
    &&\left.\qquad\quad+\frac{\lambda\gamma^2}{2}\left(\|\boldsymbol{\xi}_\psi^t\|^2+\|\boldsymbol{\xi}_\phi^t\|^2\right)\right]+{\rm L}\left(\psi^0,\phi^0\right)-{\rm L}\left(\psi^\ast,\phi^\ast\right)
\end{eqnarray}

According to the perturbation in Alg.~\ref{alg:dp-sgd}, $\tilde{\mathbf{g}}_\psi^t$ and $\mathbf{g}_\phi^t$ are unbiased estimators of the true gradients and so ${\sf E}[\boldsymbol{\xi}_\psi^t]={\sf E}[\boldsymbol{\xi}_\phi^t]=0$. Taking the expectation of both sides of the above inequality, we have
\begin{eqnarray}
    {\sf E}\left[\|\nabla_\psi{\rm L}(\psi^t,\phi^t)\|^2 \right.\!\!\!\!&+&\!\!\!\! \left.\|\nabla_\phi{\rm L}(\psi^t,\phi^t)\|^2\right]\nonumber\\
    &\leq&\frac{2}{T\gamma(2-\lambda\gamma)}\left(\left({\rm L}\left(\psi^0,\phi^0\right)-{\rm L}\left(\psi^\ast,\phi^\ast\right)\right)+\frac{\lambda}{2}\gamma^2T{\sf E}\left[\|\boldsymbol{\xi}_\psi^t\|^2+\|\boldsymbol{\xi}_\phi^t\|^2\right]\right)\nonumber\\
    &\leq& \frac{2}{T\gamma(2-\lambda\gamma)}\left(\Delta{\rm L}^{\ast}+\frac{\lambda}{2}\gamma^2Tc^2 d_\psi\sigma_\psi^2\right)\nonumber\\
    &\stackrel{(c)}\leq& \frac{2}{T\gamma}\left(\Delta{\rm L}^{\ast}+\frac{\lambda}{2}\gamma^2Tc^2 d_\psi \sigma_\psi^2\right)\nonumber\\
    &=&\frac{2}{T\gamma}\Delta{\rm L}^{\ast}+\lambda\gamma c^2  d_\psi\sigma_\psi^2\nonumber\\
    &\stackrel{(d)}\leq&\frac{2}{T}\Delta{\rm L}^{\ast}\max\left\{\lambda,\frac{\sigma_\psi\sqrt{T}}{\tilde{\Delta}_\psi}\right\}+\lambda\frac{\tilde{\Delta}_\psi}{\sigma_\psi\sqrt{T}}c^2d_\psi\sigma_\psi^2\nonumber\\
    &\leq&\frac{2}{T}\Delta{\rm L}^{\ast}\lambda+\frac{2\sigma_{\psi}\Delta{\rm L}^\ast}{\sqrt{T}\tilde{\Delta}_\psi}+\lambda\frac{\tilde{\Delta}_\psi}{\sqrt{T}}c^2d_\psi\sigma_\psi\nonumber\\
    &=&\frac{2}{T}\Delta{\rm L}^{\ast}\lambda+\frac{\sigma_{\psi}}{\sqrt{T}}\left(\frac{2\Delta{\rm L}^\ast}{\tilde{\Delta}_\psi}+\lambda\tilde{\Delta}_\psi c^2d_\psi\right)\nonumber\\
    &\stackrel{(e)}=&\frac{2}{T}\Delta{\rm L}^{\ast}\lambda+2\sqrt{\frac{2\lambda\Delta{\rm L}^\ast}{T}} c\sqrt{d_\psi}\sigma_{\psi}
\end{eqnarray}
where $(c)$ follows since $\gamma\leq 1/\lambda$ implies $2-\lambda\gamma\geq 1$, $(d)$ follows by the assumption of $\gamma=\min\{\frac{1}{\lambda},\frac{\tilde{\Delta}_\psi}{\sigma_\psi\sqrt{T}}\}$ for the step size, and $(e)$ follows by choosing $\tilde{\Delta}_\psi=\frac{1}{c}\sqrt{2\Delta{\rm L}^\ast/(\lambda d_\psi)}$, which minimises and hence tightens the bound. ${\rm L}\left(\psi^\ast,\phi^\ast\right)$ is the optimal value of the loss function that can be related to the parameters of the surrogates as follows.

In our setting, ELBO can be specified and simplified to account for the encoding and decoding mappings of the user modeling problem as
\begin{eqnarray}\label{eq:ELBO-user-modelling}
    \mathcal{L}
    &=&{\sf E}_{\mathbf{z}\sim q_{\psi}(\mathbf{z}|\mathcal{D}_\theta)}\left[\sum_{i=m_\theta+1}^{n_\theta}\log\left( p_\phi(\mathbf{a}_\theta^i|\mathbf{s}_\theta^i,\mathbf{z})\right)+\log\left(\frac{q_\psi\left(\mathbf{z}|\mathcal{D}_\theta^{\sf C}\right)}{q_\psi(\mathbf{z}|\mathcal{D}_\theta)}\right)\right]\nonumber\\
    &=&{\sf E}_{\mathbf{z}\sim q_{\psi}(\mathbf{z}|\mathcal{D}_\theta)}\left[\sum_{i=m_\theta+1}^{n_\theta}\log\left( p_\phi(\mathbf{a}_\theta^i|\mathbf{s}_\theta^i,\mathbf{z})\right)+\log\left( q_\psi\left(\mathbf{z}|\mathcal{D}_\theta^{\sf C}\right)\right)\right]\!+\!H\left( q_\psi(\mathbf{z}|\mathcal{D}_\theta)\right)
\end{eqnarray}
where $H$ is the entropy function, and the surrogate posterior (Gaussian encoder) and likelihood (Gaussian decoder) are given by 
\begin{eqnarray}
    q_\psi\left(\mathbf{z}|\mathcal{D}_\theta^{\sf C}\right)&=&\mathcal{N}\left(\mu_\psi(\mathcal{D}_\theta^{\sf C}),\mathbf{K}_\psi(\mathcal{D}_\theta^{\sf C})\right)\nonumber\\
    p_\phi\left(\mathbf{a}_\theta^i|\mathbf{s}_\theta^i,\mathbf{z}\right)&=&\mathcal{N}\left(\nu_{\phi}(\mathbf{z},\mathbf{s}_\theta^i),\boldsymbol{\Sigma}_{\phi}(\mathbf{z},\mathbf{s}_\theta^i)\right).
\end{eqnarray}
The mean $\mu_\psi(\mathcal{D}_\theta^{\sf C})$ and covariance $\mathbf{K}_\psi(\mathcal{D}_\theta^{\sf C})$ of the encoder are parameterised by two NNs (neural network), ${\rm NN}_\mu$ and ${\rm NN}_\varsigma$ with parameters $\mathbf{V}$ and $\mathbf{T}$, respectively, consisting of all the weight matrices and biases of the corresponding NN for the mean and variance of the encoder:  
\begin{eqnarray}
    \mu_\psi(\mathcal{D}_\theta^{\sf C})&=&\frac{1}{m_\theta}\sum_{\mathbf{d}_\theta^i\in\mathcal{D}_\theta^{\sf C}} {\rm h}_\psi\left(\mathbf{d}_\theta^i\right),\ \ {\rm h}_\psi(\mathbf{d}_\theta^i)={\rm NN}_\mu\left( \mathbf{d}_\theta^i;\mathbf{V}\right) \nonumber\\
    \mathbf{K}_\psi(\mathcal{D}_\theta^{\sf C})&=&{\sf diag}\left(\varsigma_{1}^2(\mathcal{D}_\theta^{\sf C}),\ldots,\varsigma_{d_z}^2(\mathcal{D}_\theta^{\sf C})\right), \ \ \left(\varsigma_{1}^2,\ldots,\varsigma_{d_z}^2\right)^{\sf T}={\rm NN}_\varsigma(\mathcal{D}_\theta^{\sf C};\mathbf{T})
\end{eqnarray}
Accordingly, the parameters of the two NNs together constitute all the parameters of the encoder, $\psi=\left(\mathbf{V},\mathbf{T}\right)$. Similarly, the mean and variance of the decoder can be parameterised by two NNs, ${\rm NN}_\nu$ and ${\rm NN}_\tau$ with parameters $\mathbf{W}$ and $\mathbf{M}$, respectively, comprising all the weight matrices and biases of the corresponding conditional NN for the mean and variance of the decoder: 
\begin{eqnarray}
    \nu_\phi(\mathbf{z},\mathbf{s})&=&{\rm g}_\phi\left(\mathbf{z},\mathbf{s}\right),\ \ {\rm g}_\phi(\mathbf{z},\mathbf{s})={\rm NN}_\nu\left(\mathbf{z},\mathbf{s};\mathbf{W}\right) \nonumber\\
    \boldsymbol{\Sigma}_\phi(\mathbf{z},\mathbf{s})&=&{\sf diag}\left(\tau_{1}^2(\mathbf{z},\mathbf{s}),\ldots,\tau_{d_\mathcal{A}}^2(\mathbf{z},\mathbf{s})\right), \ \ \left(\tau_{1}^2,\ldots,\tau_{d_\mathcal{A}}^2\right)^{\sf T}={\rm NN}_\tau(\mathbf{z},\mathbf{s};\mathbf{M}). 
\end{eqnarray}
For the structure of the NN, a concatenation of linear mappings followed by point-wise nonlinear mappings is considered as ${\rm NN}(\mathbf{x},\mathbf{Y})=\mathbf{y}^l f\left(\mathbf{y}^{l-1}f\left(\ldots f\left(\mathbf{y}^0\mathbf{x}+\mathbf{b}^0\right)\ldots\right)+\mathbf{b}^{l-1}\right)+\mathbf{b}^l$
% \begin{eqnarray}\label{eq:nn-assumption}
%     {\rm NN}(\mathbf{x},\mathbf{Y})=\mathbf{y}^l f\left(\mathbf{y}^{l-1}f\left(\ldots f\left(\mathbf{y}^0\mathbf{x}+\mathbf{b}^0\right)\ldots\right)+\mathbf{b}^{l-1}\right)+\mathbf{b}^l
% \end{eqnarray}
where $\mathbf{y}^l$ and $\mathbf{b}^l$ are the weight matrix and bias of the layer $l$, and $f$ is the nonlinear pointwise function considered invariant between layers. Also, $\mathbf{x}$ and $\mathbf{Y}$ represent the input vector and the weights of the NN, respectively, and are set according to the encoder or decoder side, as inferred from the context. 

Applying this setting for the encoder and decoder to build the surrogate posterior and likelihood, the ELBO in \eqref{eq:ELBO-user-modelling} can be characterised as
\begin{eqnarray}\label{eq:ELBO-usermodelling}
    \mathcal{L}&=&{\sf E}_{\mathbf{z}\sim q_{\psi}(\mathbf{z}\mid\mathcal{D}_\theta)}\left[\sum_{i=m_\theta+1}^{n_\theta}\log\left( \mathcal{N}\left(\mathbf{a}_\theta^i\mid{\rm g}_\phi(\mathbf{z},\mathbf{s}_\theta^i),\boldsymbol{\Sigma}_{\phi}(\mathbf{z},\mathbf{s}_\theta^i)\right)\right)+\log\left(\mathcal{N}\left( \mathbf{z}\mid\sum_{i=1}^{m_\theta}\!{\rm h}_\psi(\mathbf{d}_\theta^i)/m_\theta, \mathbf{K}_\psi(\mathcal{D}_\theta^{\sf C})\right)\right)\right.\nonumber\\
    &&\left.\qquad\qquad\qquad -\log\left(\mathcal{N}\left( \mathbf{z}\mid\sum_{i=1}^{n_\theta}{\rm h}_\psi(\mathbf{d}_\theta^i)/n_\theta, \mathbf{K}_\psi(\mathcal{D}_\theta)\right)\right)\right]
\end{eqnarray}

By adapting the arguments in~\citep{ELBO-VAE2023}, it can be shown that, at the stationary points, the ELBO given in \eqref{eq:ELBO-usermodelling} converges to a sum of three entropies of the Gaussian distributions. Hence, the optimal value $L^{\ast}_\theta$ can be expressed in terms of the mean and variance parameters of the posterior and likelihood surrogates as
\begin{eqnarray}
    L^{\ast}_\theta&=&\frac{1}{n_\theta}\left[ H\left( q_\psi\left(\mathbf{z}|\mathcal{D}_\theta^{\sf C}\right)\right)-H\left( q_\psi\left(\mathbf{z}|\mathcal{D}_\theta\right)\right)\right] +\frac{1}{n_\theta}\sum_{i=1}^{n_\theta-m_\theta}H\left( p_\phi(\mathbf{a}_\theta^i|\mathbf{z},\mathbf{s}_\theta^i)\right)\nonumber\\
    &=&\frac{1}{2n_\theta}\sum_{j=1}^{d_z}\log\left(2\pi e\varsigma_{j}^2(\mathcal{D}_\theta^{\sf C})\right)-\frac{1}{2n_\theta}\sum_{j=1}^{d_z}\log\left(2\pi e\varsigma_{j}^2(\mathcal{D}_\theta)\right) +\frac{1}{2n_\theta}\sum_{i=1}^{n_\theta-m_\theta}\sum_{j=1}^{d_\mathcal{A}}{\sf E}_{q_\psi(\mathbf{z}|\mathcal{D}_\theta)}\left[\log\left( 2\pi e\tau_{j}^2(\mathbf{z},\mathbf{s}_\theta^i)\right)\right] \nonumber\\
    &=&\frac{1}{2n_\theta}\sum_{j=1}^{d_z}\log\left(\varsigma_{j}^2(\mathcal{D}_\theta^{\sf C})/\varsigma_{j}^2(\mathcal{D}_\theta)\right) +\frac{1}{2n_\theta}\sum_{i=1}^{n_\theta-m_\theta}\sum_{j=1}^{d_\mathcal{A}}{\sf E}_{q_\psi(\mathbf{z}|\mathcal{D}_\theta)}\left[\log\left( \tau_{j}^2(\mathbf{z},\mathbf{s}_\theta^i)\right)\right]
\end{eqnarray}
where $H(.)$ denotes the entropy function of the Gaussian distributions considered for the surrogates and the prior $\mathbf{z}\sim\mathcal{N}(0,\mathbf{I})$. 

\section{Details on privacy guarantee Theorem \ref{Thm2}} \label{sec:app-privacy}

We formally establish privacy bounds and find the required variance for the subsampled Gaussian privacy-preserving mechanism applied on the true gradients in Algorithm~\ref{alg:dp-sgd}. We leverage recent results for accounting privacy loss over compositions of elementary mechanisms \citep{Privacy-Accounting-POPETS22}. We consider the substitution neighbouring relation and fixed-sized batch sampling without replacement. For any $\sigma$, the privacy loss random variable for a single iteration $i$ can be established as
\begin{equation}
    \mathcal{L}^p_i = \mathcal{L}^p(Y_i) = \ln \left(\frac{q e^{\frac{2Y_i-1}{2\sigma^2}} + 1 - q}{q e^{\frac{-2Y_i-1}{2\sigma^2}} + 1 - q}\right),
\end{equation}
where $Y_i \sim q \mathcal{N}(1, \sigma^2) + (1-q) \mathcal{N}(0, \sigma^2)$ and $q$ is the subsampling ratio \citep{Sampling-without-replacment-accounting-19}.
For any $\varepsilon$ we then obtain a corresponding $\delta$ such that the algorithm is $(\varepsilon, \delta)$-DP from the following expectation that is derived from the hockey-stick divergence (cf.~\citep{Privacy-Accounting-POPETS22})
\begin{equation}
    \delta(\varepsilon; T, q, \sigma) = {\sf E}\left[ \max\left\{ 0, 1 - e^{\varepsilon - \sum_{i=1}^T \mathcal{L}^p_i } \right\}\right]. \label{eq:delta_alg}
\end{equation}
Analytical evaluation of this equation is intractable for the subsampled Gaussian mechanism, but a number of methods for numerical approximations of tight upper bounds have been developed in recent literature, e.g., \citep{Sampling-without-replacment-accounting-19, Privacy-Accounting-POPETS22}. To obtain the required $\sigma$ for desired values of $\varepsilon$, $\delta$, $q$ and $T$, we rely on a (numerical) root-finding technique to invert $\delta(\varepsilon; T, q, \sigma)$ with respect to $\sigma$.


\section{Details on the utility-privacy trade-off result of Corollary \ref{Trade-off-result}} \label{sec:app-mechanism-bound}
Privacy amplification by subsampling states that for any $(\varepsilon, \delta)$-DP mechanism $\mathcal{M}$, the composition $\mathcal{M} \circ \mathcal{S}_q$, where $\mathcal{S}_q$ is a subsampling routine that includes any element with probability $q$, is $(O(q)\varepsilon, O(q)\delta)$-DP. Intuitively, this is due to the privacy loss random variable being smaller for a given mechanism output value $x$ and thus more concentrated at smaller values. We further generalise this to arrive at a bound on $\delta$ for the same $\varepsilon$ as follows:

\begin{lemma}
    Let $\mathcal{L}^p_1 = \mathcal{L}^p_1(X_1)$ and $\mathcal{L}_2^p = \mathcal{L}_2^p(X_2)$ be privacy loss random variables where $X_1 \sim f_1$ and $X_2 \sim f_2$ for corresponding probability density functions $f_1, f_2$. If $x_0$ exists such that for all $x \geq x_0$ it holds that $\mathcal{L}_1^p(x) \leq \mathcal{L}_2^p(x)$, ${\sf sign}(\frac{d\mathcal{L}_1^p}{dx}(x)) = {\sf sign}(\frac{d\mathcal{L}_2^p}{dx}(x)) = \text{const}$, and $f_1(x){\sf sign}(\frac{d\mathcal{L}_1^p}{dx}(x)) \leq f_2(x) {\sf sign}(\frac{d\mathcal{L}_2^p}{dx}(x))$, then $\delta_{\mathcal{L}_1^p}(\mathcal{L}_2^p(x_0)) \leq \delta_{\mathcal{L}_2^p}(\mathcal{L}_2^p(x_0))$.
\end{lemma}
\begin{proof}
\begin{eqnarray}
    \delta_{\mathcal{L}_1^p}(\mathcal{L}_2^p(x_0)) &=& \int_{\mathcal{L}_2^p(x_0)}^\infty f_{\mathcal{L}^p_1}(s) \left(1 - e^{\mathcal{L}_2^p(x_0) - s}\right) ds \nonumber\\
    &=& \int_{\mathcal{L}_1^{p^{-1}}(\mathcal{L}_2^p(x_0))}^\infty f_1(x) {\sf sign}\left(\frac{d\mathcal{L}_1^p}{dx}(x)\right) \left(1 - e^{\mathcal{L}_2^p(x_0) - \mathcal{L}_1^p(x)}\right) dx \nonumber\\
    &\leq& \int_{\mathcal{L}_1^{p^{-1}}(\mathcal{L}_2^p(x_0))}^\infty f_2(x) {\sf sign}\left(\frac{d\mathcal{L}_2^p}{dx}(x)\right) \left(1 - e^{\mathcal{L}_2^p(x_0) - \mathcal{L}_1^p(x)}\right) dx \nonumber\\
    &=& \int_{\mathcal{L}_2^p(\mathcal{L}_1^{p^{-1}}(\mathcal{L}_2^p(x_0)))}^\infty f_{\mathcal{L}_2^p}(s) \left(1 - e^{\mathcal{L}_2^p(x_0) - \mathcal{L}_1^p(\mathcal{L}_2^{p^{-1}}(s))}\right) ds \\
    &\leq& \int_{\mathcal{L}_2^p(\mathcal{L}_1^{p^{-1}}(\mathcal{L}_2^p(x_0)))}^\infty f_{\mathcal{L}_2^p}(s) \left(1 - e^{\mathcal{L}_2^p(x_0) - s}\right) ds \nonumber\\
    &\leq& \int_{\mathcal{L}_2^p(x_0)}^{\mathcal{L}_2^p(\mathcal{L}_1^{p^{-1}}(\mathcal{L}_2^p(x_0)))} f_{\mathcal{L}_2^p}(s) \left(1 - e^{\mathcal{L}_2^p(x_0) - s}\right) ds \nonumber\\
    &&+ \int_{\mathcal{L}_2^p(\mathcal{L}_1^{p^{-1}}(\mathcal{L}_2^p(x_0)))}^\infty f_{\mathcal{L}_2^p}(s) \left( 1 - e^{\mathcal{L}_2^p(x_0) - s}\right) ds \nonumber\\
    &=& \int_{\mathcal{L}_2^p(x_0)}^\infty f_{\mathcal{L}_2^p}(s) \left( 1 - e^{\mathcal{L}_2^p(x_0) - s}\right) ds \nonumber\\
    &=& \delta_{\mathcal{L}_2^p}\left(\mathcal{L}_2^p(x_0)\right).\nonumber
\end{eqnarray}

The second inequality holds since for all $x \geq x_0$ and $s \geq \mathcal{L}_2^p(x_0)$:
\begin{eqnarray}
    -e^{\mathcal{L}_2^p(x_0) - \mathcal{L}_1^p\left(\mathcal{L}_2^{p^{-1}}(s)\right)} &\leq& -e^{\mathcal{L}_2^p(x_0) - s} \nonumber\\
    \Leftrightarrow \mathcal{L}_1^p(\mathcal{L}_2^{p^{-1}}(s)) &\leq& s \\
    \Leftrightarrow \mathcal{L}_1^p(x) &\leq& \mathcal{L}_2^p(x).\nonumber
\end{eqnarray}

The third inequality follows from $1 - e^{\mathcal{L}_2^p(x_0) - s} \geq 0$ for all $s \geq \mathcal{L}_2^p(x_0)$ and $\mathcal{L}_2^p(x_0) \leq \mathcal{L}_2^p(\mathcal{L}_1^{p^{-1}}(\mathcal{L}_2^p(x_0)))$, which in turn follows from the assumptions.
\end{proof}

With the above lemma in place, we now simply insert the privacy loss variables for subsampled and plain Gaussian mechanism, i.e., let
\begin{equation}
    \mathcal{L}_1^p = \mathcal{L}_1^p(X_1) = \ln \left(\frac{q e^{\frac{2X_1-1}{2\sigma^2}} + 1 - q}{q e^{\frac{-2X_1-1}{2\sigma^2}} + 1 - q}\right), \label{eq:privacy_loss_subsampled_gm}
\end{equation}
where $X_1 \sim f_1(x) = q \mathcal{N}(x|1, \sigma^2) + (1-q) \mathcal{N}(x|0, \sigma^2)$ for the subsampled Gaussian mechanism where sampling is done for a fixed batch without replacement. Further, let
\begin{equation}
    \mathcal{L}_2^p = \mathcal{L}_2^p(X_2) = \frac{1}{2\sigma^2}(X_2 - \frac{1}{2}) \label{eq:privacy_loss_gm}
\end{equation}
where $X_2 \sim f_2(x) = \mathcal{N}(x|1, \sigma^2)$, be the privacy loss random variable for the pure (not subsampled) Gaussian mechanism. We have that $\frac{d\mathcal{L}_1^p(x)}{dx} > 0$ and $\frac{d\mathcal{L}_2^p(x)}{dx} > 0$ for all $x$, so the third condition of the lemma reduces to $f_1(x) \leq f_2(x)$. This condition holds for all $x \geq \frac{1}{2}$:
\begin{eqnarray}
    f_1(x) &\leq& f_2(x) \nonumber\\
    \Leftrightarrow q \exp(-\frac{1}{2\sigma^2} (x - 1)^2) + (1-q) \exp(-\frac{1}{2\sigma^2} x^2) &\leq& \exp(-\frac{1}{2\sigma^2} (x-1)^2) \\
%    \exp(-\frac{1}{2\sigma^2} x^2) &\leq \exp(-\frac{1}{2\sigma^2} (x-1)^2) \\
    \Leftrightarrow (x-1)^2 &\leq& x^2.\nonumber
\end{eqnarray}

We further have that $\mathcal{L}_1^p(x) \leq \mathcal{L}_2^p(x)$ for all $x > x_0$ with
\begin{equation}
    x_0 = \sigma^2 \left( \ln \left(1 - q \left( 1 + e^{-\frac{1}{\sigma^2}} \right)\right) - \ln \left( 1 - 2q \right) \right) + \frac{1}{2}, \label{eq:x0}
\end{equation}
provided that $q < 1/2$:
\begin{align}
    \mathcal{L}_1^p(x) &\leq \mathcal{L}_2^p(x) \nonumber\\
    q \exp\left(\frac{2x - 1}{2\sigma^2}\right) + 1 - q &\leq q e^{-\frac{1}{\sigma^2}} + (1 - q) \exp\left(\frac{2x - 1}{2\sigma^2}\right) \nonumber\\
    1 - q \left(1 + e^{-\frac{1}{\sigma^2}}\right) &\leq (1 - 2q) \exp\left(\frac{2x - 1}{2\sigma^2}\right) \nonumber\\
    \exp\left(\frac{2x - 1}{2\sigma^2}\right) &\geq \frac{1 - q \left(1 + e^{-\frac{1}{\sigma^2}}\right)}{1 - 2q} \label{eq:l1_less_l2_explanation} \\
    \frac{1}{\sigma^2}\left(x - \frac{1}{2}\right) &\geq \ln \left(1 - q \left(1 + e^{-\frac{1}{\sigma^2}}\right)\right) - \ln \left(1 - 2q\right) \nonumber\\
    x &\geq \sigma^2 \left(\ln \left(1 - q \left(1 + e^{-\frac{1}{\sigma^2}}\right)\right) - \ln \left(1 - 2q\right)\right) + \frac{1}{2}. \nonumber
\end{align}

The condition $q < 1/2$ is required in \eqref{eq:l1_less_l2_explanation} to avoid flipping the sign there.

Inserting \eqref{eq:x0} into \eqref{eq:privacy_loss_gm} yields
\begin{equation}
    s_0 := \mathcal{L}_2^p(x_0) = \frac{1}{2} \left(\ln \left( 1 - q \left(1 + e^{-\frac{1}{\sigma^2}}\right) \right) - \ln \left( 1 - 2q \right) \right), \label{eq:bound_min_eps}
\end{equation}
and it can be shown that $s_0 \leq \frac{1}{2}\left(\ln (1-q) - \ln(1 - 2q)\right)$ for all $\sigma > 0$ (where equality is obtained in the limit of $\sigma \rightarrow 0$). Letting $s_1 \geq \frac{1}{2}\left(\ln (1-q) - \ln(1 - 2q)\right) \geq s_0$, solving for $q$ yields
\begin{equation}
    q \leq \frac{e^{2s_1} - 1}{2e^{2s_1} - 1},
\end{equation}
which reaches its maximum of $\frac{1}{2}$ as $s_1 \rightarrow \infty$.

Putting everything together, we arrive finally at
\begin{corollary}
\label{thm:plain_gm_bounds_subsampled_gm_delta}
For any $\varepsilon \geq 0$, the privacy profile $\delta_{\text{sGM}}$ for the subsampled Gaussian mechanism using sampling without replacement for a fixed batch size $qN$, where $N$ is the total number of samples in the dataset, is upper bounded by the privacy profile $\delta_{\text{GM}}$ of the plain Gaussian mechanism, $\delta_{\text{sGM}}(\varepsilon) \leq \delta_{\text{GM}}(\varepsilon)$, provided that 
\begin{equation}
    q \leq \frac{e^{2\varepsilon} - 1}{2e^{2\varepsilon} - 1} < \frac{1}{2}.
\end{equation}
\end{corollary}


\section{Experiment details}\label{Exp-details}
We acknowledge that the structure and content of the experimental details provided in Section~\ref{Exp-details} of our supplementary material, particularly Sections~\ref{Exp1-details} and~\ref{Exp2-details} (excluding Section~\ref{Exp1-comparison}), are closely inspired by the corresponding supplementary presentation in~\cite{hamalainen2023differentiable}. We gratefully recognize the original authors, whose work meaningfully guided the design and presentation of the experimental details in our supplementary materials.
\subsection{Experiment 1: Gridworld Environment}\label{Exp1-details}

The first experiment scenario considers modeling MCTS agents in a $10 \times 10$ gridworld environment used in \citep{diff_user_models_UAI}. The gridworld environment is defined as a POMDP with deterministic transition dynamics. The state and action spaces, including the transition function, are shared across the full agent population, while each agent has their individual reward and observation functions. 

The state space $\mathcal{S} = \{1,\dots,10\}^2$ is a set of possible agent locations in the grid. The action space $\mathcal{A} = \{ \text{up}, \text{down}, \text{left}, \text{right} \}$ corresponds to transitioning to grid states adjacent to the agent's current location. The transition function $\mathcal{T}$ handles the transitions accordingly, but does not allow the agent to exit the grid; any actions which would result in the agent exiting the grid do not cause state transitions.

Each agent is described by their parameters $\theta \sim p(\theta)$, which condition the generative process for their behaviour (MCTS) as $\pi \sim p(\pi \mid \theta)$. The parameters and their corresponding population-level distributions are included in Table~\ref{tab:ex11}. Here, the reward function corresponds to two reward states, one with a positive and one with a negative reward, which are sampled independently for each agent such that the positive state cannot be the same state as the negative one. The observation function corresponds to a binary value which determines if the agent's vision is limited by a circular vision horizon, which effectively blocks any perceptions about the reward states outside that horizon. For agents with this horizon, the radius of the horizon is further controlled by the ``tree depth'' parameter, which simultaneously controls the MCTS planning depth. Finally, the memory parameter controls if the agents are allowed to utilise the planning tree from previous time steps for subsequent time steps instead of always starting from scratch.

\begin{table}[h]
    \centering
    \caption{Uniform prior on user model parameters for the user population in the gridworld experiment.}
    \begin{tabular}{l|l}
        \textbf{User Parameter} & \textbf{Distribution} \\ \hline 
         Reward States ($x, y$) &  $\mathcal{U}\{1,\dots,10\}$ \\
         Observation function   &  $\mathcal{U}\{0,1\}$ \\
         Tree Depth             &  $\mathcal{U}\{5,\dots,10\}$ \\
         Memory                 &  $\mathcal{U}\{0,1\}$ \\
    \end{tabular}
    \label{tab:ex11}
\end{table}

The modeling task considers modeling individual users sampled from the population $\theta \sim p(\theta)$. Each user $\theta$ generates $n \sim \mathcal{U}\{1,\dots,8\}$ trajectories of length $10$, each corresponding to one episode in the environment. In each episode, a new initial state is sampled for the user's location as $x,y \sim \mathcal{U}\{1,\dots,10\}$. The resulting trajectories are divided into context and target data for NP training as follows. One target trajectory is randomly selected and truncated at length $l \sim \mathcal{U}\{1, \dots, 9\}$. The first half of this trajectory is concatenated with the other trajectories to construct the context dataset while the latter half is held out, acting as the modeling target. Based on the available context data at each task, we evaluate the model's ability to predict the held-out target data.

\subsubsection{Comparison with baselines and alternate models}\label{Exp1-comparison}

Results in other works \cite{diff_user_models_UAI} (Fig.~1), \cite{jha2022neural, nguyen2022transformer} have established that neural processes outperform alternative meta-learning approaches~\cite{MAML17} in settings requiring uncertainty-awareness. For our user modeling purposes, NPs fulfill two critical desiderata that other meta-learning paradigms lack.

First, interactive human-AI applications require instantaneous model adaptation to human feedback. NP models can demonstrably achieve this by reducing the task-specific adaptation into a simple forward pass \cite{diff_user_models_UAI,jha2022neural} while other popular paradigms, such as MAML \cite{MAML17}, require expensive gradient-based optimization which has been shown to be infeasible during online interaction \cite{diff_user_models_UAI}.

Second, calibrated uncertainty estimation is a critical part of many human-centric problems, ranging from Bayesian optimization to experimental design problems. Unlike paradigms such as MAML, NPs are uncertainty-aware meta-learners \cite{jha2022neural} that can demonstrably satisfy the requirements of such settings \cite{diff_user_models_UAI,huang2024amortized}.

In addition to these critical properties, previous literature on NPs has already shown that alternative meta-learning paradigms empirically underperform NPs, including in user modeling tasks with a setting similar to ours~\cite{diff_user_models_UAI}.

We conducted a comparison with a PATE-style teacher-student baseline \cite{PATE18}, and obtained favorable results as shown in Table~\ref{tab:privacy_comparison}. In this setup, we trained disjoint teacher Neural Processes (NPs) on non-overlapping user subsets. Their predictions were aggregated via averaging and perturbed using Gaussian noise with standard deviations 
$\sigma \in \{10, 50, 100\}$. A student NP was then trained on these noisy labels. We do not explicitly compute $\epsilon$ in this experiment for PATE. Experiments were conducted using training sets of 500, 2000, and 5000 users:
\begin{itemize}
    \item For the 500-user setting, the data were split into five disjoint subsets: four were used to train teacher models, and the fifth was used to train the student on the noisy teacher-averaged predictions.
    \item The 2000-user setting used four 500-user subsets: three for teachers and one for the student.
    \item The 5000-user setup followed the same protocol as the 500-user case (i.e., 4 teachers, 1 student).
\end{itemize}
Evaluation was performed using the same validation dataset as in Fig. \ref{fig:accuracy-users-seen-ex1} and \ref{fig:accuracy-task-numb}.
Our method shows comparable or slightly better performance than the PATE-style baseline in the 500- and 5000-user scenarios. In the 2000-user case, the PATE-style baseline performs slightly better, which we attribute in part to the smaller number of teacher models used in that configuration.

However, the PATE-style approach becomes increasingly complex as more teacher models are introduced, requiring additional computation and coordination. This makes it less scalable and more prone to underperformance in low-data regimes, where each teacher is trained on limited data. In contrast, our method remains simple, scalable, and end-to-end differentially private via user-level DP training. 
 
\begin{table}[h]
\centering
\begin{tabular}{c c c c}
\hline
Training data (Users) & non-dp NP & PATE ($\sigma$: 10, 50, 100) & dp-NP (Ours) ($\epsilon$: 10, 5, 3, 1) \\
\hline
500  & 0.83 & 0.42, 0.46, 0.46 & 0.59, 0.50, 0.48, 0.44 \\ 
2000 & 0.83 & 0.83, 0.84, 0.84 & 0.81, 0.79, 0.77, 0.51 \\  
5000 & 0.84 & 0.78, 0.80, 0.81 & 0.82, 0.82, 0.80, 0.76 \\
\hline
\end{tabular}
\caption{Performance comparison of non-dp NP, PATE \cite{PATE18}, and dp-NP.}
\label{tab:privacy_comparison}
\end{table}
\subsection{Experiment 2}\label{Exp2-details}
\subsubsection{Menu Search 
Environment}
The second experiment considers modeling simulated users following the Menu Search model of \citet{kangasraasio2017inferring}. The Menu Search model is a cognitive model describing human search behaviour in terms of eye movements when searching for a specific item in a computer dropdown menu. Motivated by \textit{computational rationality} \citep{gershman2015computational}, the model simulates human behaviours as a result of RL-based optimization constrained by human cognitive limitations. Similarly as in \citep{diff_user_models_UAI}, we implement the users as deep-Q learning agents.

The Menu Search environment is specified as a POMDP where the environment states capture information about the internal state of the user, including the current knowledge about the menu items, and the current gaze location of the user. Consistently with \citep{diff_user_models_UAI}, we consider a menu of eight elements, where each element is described in terms of its semantic relevance and length in comparison to the target item. 

At each time step, the user can fixate their gaze on a specific menu element, or alternatively to quit the scenario. Fixating on a specific menu item has a chance to reveal the information about the item while also having a chance to reveal the information about adjacent items via peripheral vision. When fixating on the target element, a large positive reward is emitted and the episode is ended.

Each modeling simulation considers a newly generated menu layout. The target item is not present in the menu in $10\%$ of the menus. If the user recognizes that the target element is not present and quits the menu, a large positive reward is emitted. If the user quits the menu when the target is present, a large negative reward is given. Otherwise, the user is given a small negative reward at each action based on the action duration; the action durations are controlled by the cognitive parameters specified in Table~\ref{tab:ex2}. In the first step of each episode, there is a small chance, $p_{rec}$, that the user recalls the menu layout, revealing the information about all menu elements.

\begin{table}[h]
    \centering
    \caption{Distributions for user cognitive properties used in the second experiment.}
    \begin{tabular}{r c|l}
         \textbf{User Parameter} & & \textbf{Distribution}  \\ \hline
         Menu recall probability & $p_{rec}$ & $\text{\textit{Beta}}(3.0, 1.35)$ \\
         Eye fixation duration   & $f_{dur}$ & $\mathcal{N}(3.0, 1.0)$\\
         Target item selection delay & $d_{sel}$ & $\mathcal{N}(0.3, 0.3)$
    \end{tabular}
    \label{tab:ex2}
\end{table}

Similarly as in the first experiment, each modeling task considers modeling individual users which have completed $n \sim \mathcal{U}\{1, \dots, 8\}$ search tasks in independently generated menu layouts with different target elements. We similarly truncate one of the resulting trajectories to form context and target datasets.
  
\subsubsection{Menu Search AI-assistant}
\label{Exp3-details}
In the third experiment, we simulate a more practical AI-assistant scenario by expanding the original menu search environment. The interface now features a two-tiered hierarchy: each complete menu includes a primary layer with items that function both as descriptors and as links to corresponding sub-menus. Within this setting, an AI assistant guided by our proposed user modeling approach is introduced. Its goal is to recommend sub-menus that align with the user’s preferences or intentions. A well-performing assistant is expected to steer users toward options that are likely to contain their target, effectively minimizing the time spent searching.

\textbf{Environment}. The hierarchical menu search environment introduces an $8 \times 8$ two-level menu setting. Importantly, the
environment behaves otherwise similarly to the original non-hierarchical version, with the exception of introducing a
main menu that allows a user to navigate between multiple menus. In addition, we introduce a simple mapping between
user observations (semantic relevancies and lengths w.r.t.\ the target element) and assistant observations (logical groups).
Specifically, each scenario introduces a set of 8 logical groups $\mathcal{S}_{\text{AI}} = \{1,\dots,8\}$ and 4 semantic relevance groups
$\mathcal{S}_{\text{user}} = \{\text{target}(1),\text{high}(2),\text{medium}(3),\text{low}(4)\}$ and an independently generated bidirectional mapping between $\mathcal{S}_{\text{AI}}$
and $\mathcal{S}_{\text{user}}$. The mapping initializes an ordered set of relevancies as $r = \{4,4,4,3,3,2,3,3\}$ and assigns a relevance for each
logical group with a randomized circular shift on $r$. The intuition of the mapping is simply to mask the semantic information
regarding the target element (via randomization) while allowing a soft prior heuristic for the assistant by conserving semantic
similarity between similar logical groups. We similarly mask the item lengths via randomization.
After the mapping between the observation spaces $\mathcal{S}_{\text{AI}}$ and $\mathcal{S}_{\text{user}}$ is constructed, we sample two logical groups for each
sub-menu (such that each group occurs exactly twice in the full menu) and determine a semantic label for the menus
summarizing the relevancies of their respective logical groups. The target element is then assigned randomly into one of the
sub-menus that includes a logical group with high relevance. The contents for each sub-menu are otherwise determined
by mapping the semantic labels of their logical groups into individual items according to the original menu search model
specifications. The main menu similarly follows the original specifications — however, we utilise the semantic labels of the
corresponding sub-menus as the relevancies for the main menu elements. At the main menu level, we also replace the item
length information with a binary variable denoting if the user has already opened the corresponding sub-menu. Finally, the
transition dynamics between the main menu and sub-menus are defined as follows: selecting an element at the main menu
level transitions the user to the corresponding sub-menu, while quitting a sub-menu transitions the environment state back
to the main menu. Otherwise, all the transition and reward dynamics follow the original environment specifications.

\textbf{Assistant.} The hierarchical menu search environment is designed to include a basic search assistant that supports the user only when necessary. Initially, the assistant remains passive, activating only if the user fails to locate the target within the first sub-menu they visit. Upon activation, the assistant selects and highlights a specific item from the main menu when the user returns to that level. This highlight is assumed to draw the user’s attention, subtly influencing their next choice.

We further assume the user places a degree of trust in the assistant’s recommendation, which in turn increases the perceived semantic relevance of the highlighted option. Importantly, this does not prevent the user from disregarding the suggestion if it seems unhelpful. The assistant itself is implemented as a straightforward rule-based system that dynamically updates its model of the user as their behavior unfolds. While the assistant can observe where the user is looking, it does not have access to the actual semantic relevance of menu items. Instead, it updates its internal beliefs about both viewed and unseen options using the observation space described previously. Once triggered, the assistant simulates a possible user action at the fully revealed main menu level, conditioned on the user’s ongoing search behavior: \(a\sim p_\phi(a\mid s,z)\), \(z\sim q_\psi(z\mid s,a)\). The
main menu element corresponding to the estimated most likely user action is then selected as the assistant’s suggestion.
\subsection{Implementation and training details.} \label{code-details}
\begin{table}[h]
    \centering
    \caption{Base-architecture of the NP model in experiments.}
    \begin{tabular}{l r|l r}
        \textbf{Encoder} &  & \textbf{Decoder} & \\ \hline 
         Number of layers    & $6$      & Number of layers    & $6$         \\
         Activations       & Leaky ReLU & Activations         & Leaky ReLU  \\
         Hidden dimensions   & $128$    & Hidden dimensions   & $128$       \\ 
         Latent distribution & Gaussian & Output distribution & Categorical \\ 
    \end{tabular}
    \label{tab:ex12}
\end{table}
The code used in the experiments is largely based on the code of~\citet{diff_user_models_UAI}.
The NP models utilised in this experiment are implemented with the Neural process PyTorch library, while all the MCTS agents are implemented using the \verb|POMDPs.jl| library. All NP models used in the experiments are trained on A100 and V100 GPUs. Further details of the NP architecture are summarized in Table~\ref{tab:ex12}. The code used to produce the results in our paper is available at \url{https://github.com/AI-Fundamentals/DiffPrivNPUserModeling}.
\end{document}
