\section{Motivating Experiments}
\label{app:motivatingExp}

% \subsection{Synthetic Query Templates and Parameter Options}
% We use the following intent-specific templates and parameter category-option mappings to generate our synthetic queries:

% \paragraph{Intent-specific query templates:}
% \begin{lstlisting}[language=json]
% {
%       "movie_rec": {
%         "value": "I am looking for a movie recommendation.\n
%         The genre should be \"{param_0}\".\n
%         It should have been released \"{param_1}\".\n
%         The intended audience includes \"{param_2}\".\n
%         The runtime should be \"{param_3}\".\n
%         Please provide movie recommendations that satisfy all of my requirements."
%       },

%       "gift_rec": {
%         "value": "I am looking for a gift recommendation.\n
%         The recipient likes \"{param_0}\".\n
%         The recipient is \"{param_1}\" years old.\n
%         The recipient prefers gifts to be \"{param_2}\" in nature.\n
%         My budget to purchase the recipient a gift is in the \"{param_3}\" range.\n
%         Please provide gift recommendations that satisfy all of my requirements."
%       },

%         "plant_rec": {
%         "value": "I am looking for a house plant recommendation.\n
%         I prefer a plant that \"{param_0}\".\n
%         I'm willing to expend a \"{param_1}\" amount of effort to care for the plant.\n
%         My house gets a \"{param_2}\" amount of natural light.\n
%         I live \"{param_3}\".\n
%         Please provide house plant recommendations that satisfy all of my requirements."
%       }
% }
% \end{lstlisting}

% \paragraph{Intent-specific query parameter category-option mappings}:
% \begin{lstlisting}[language=json]
% {
%       "movie_rec": {
%         "param_0": {"cat": "genre",
%                     "opts": ["Action", "Adventure", "Animation", "Biography", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film Noir", "History", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Sport", "Superhero", "Thriller", "War", "Western"],
%                     "max_sel_allowed": 2,
%                     "pref_constraint_type": "set_valued"},
%         "param_1": {"cat": "release date",
%                     "opts": ["in the 1980s", "in the 1990s", "in the 2000s", "in the past few years"],
%                     "max_sel_allowed":1,
%                     "pref_constraint_type": "numeric_range"},
%         "param_2": {"cat": "who will be watching",
%                     "opts": ["children", "adults and children", "teenagers younger than 17", "adults only"],
%                     "max_sel_allowed": 1,
%                     "pref_constraint_type": "set_valued"},
%         "param_3": {"cat":  "runtime",
%                     "opts": ["less than 90 minutes", "90-104 minutes", "105-119 minutes", "120 minutes or more"],
%                     "max_sel_allowed": 1,
%                     "pref_constraint_type": "set_valued"}
%       },

%       "gift_rec": {
%         "param_0": {"cat": "recipient interests",
%                     "opts":  ["outdoors", "crafts", "technology", "books", "active play/sports/fitness", "food/cooking", "music and arts", "apparel/fashion/style"],
%                     "max_sel_allowed":  2,
%                     "pref_constraint_type": "set_valued"},
%         "param_1": {"cat": "recipient age range",
%                     "opts":  ["3-5", "6-12", "13-17", "18-40", "41-60", "61+"],
%                     "max_sel_allowed":  1,
%                     "pref_constraint_type": "numeric_range"},
%         "param_2": {"cat": "recipient preferred gift type",
%                     "opts":  ["practical and everyday", "personalized and sentimental", "adventurous and experience-driven", "luxurious and pampering", "high-tech and innovative", "creative and artistic", "sustainable and eco-friendly"],
%                     "max_sel_allowed":  1,
%                     "pref_constraint_type": "set_valued"},
%         "param_3": {"cat": "giver budget",
%                     "opts":  ["less than $20", "$20-49", "$50-99", "$100-199", "$200+"],
%                     "max_sel_allowed":  1,
%                     "pref_constraint_type": "numeric_range"}
%       },

%       "plant_rec": {
%         "param_0": {"cat": "plant physical attributes",
%                     "opts": ["has light green leaves", "has dark green leaves", "has multi-colored leaves", "yields edible produce", "produces a single bloom", "produces multiple blooms", "is short in stature", "is tall in stature"],
%                     "max_sel_allowed":  2,
%                     "pref_constraint_type": "set_valued"},
%         "param_1": {"cat": "level of effort",
%                     "opts":  ["low", "moderate", "high"],
%                     "max_sel_allowed":  1,
%                     "pref_constraint_type": "set_valued"},
%         "param_2": {"cat": "amount of natural light",
%                     "opts":  ["low", "indirect", "moderate", "bright"],
%                     "max_sel_allowed":  1,
%                     "pref_constraint_type": "set_valued"},
%         "param_3": {"cat": "household members",
%                     "opts":  ["by myself", "with young children", "with pets", "with a roommate with seasonal allergies"],
%                     "max_sel_allowed": 1,
%                     "pref_constraint_type": "set_valued"}
%       }
% }
% \end{lstlisting}


\subsection{Synthetic Dataset Construction}
\label{app:synthetic_data}
%Our synthetic dataset is designed to satisfy the following desiderata: (1) to avoid confounding effects associated with dataset contamination, the LLMs we seek to evaluate should not have seen these queries before (i.e., during training); (2) we should be able to \emph{objectively} evaluate whether the items recommended in response to such queries satisfy their expressed preferences or constraints over the output distribution; and (3) we should be able to modify the degree of under-specification (i.e., map each query generated via full access to a synthetic latent preference space, $\Theta$, to many possible masked versions generated via access to some partially observable subset, $\Theta^r \subsetneq \Theta$) and control exogenous sources of linguistic variation for each query that we evaluate.

%We begin by defining a set of recommendation query intents: \textcolor{magenta}{$\mathcal{I}~\coloneq~\{\text{movie, gift, plant}\}$}. We choose these intents to examine how default and intervention-induced LLM behaviors vary over domains that admit increasingly restricted search spaces and relevant/candidate item subsets of those search spaces~\chComment{is this clear?}. 

%We assume for the sake of exposition that each intent $i \in \mathcal{I}$ can be \emph{sufficiently specified} by a finite set of attributes $\Theta_i$, and that each attribute $\theta \in \Theta_i$ can be mapped in turn to a finite set of possible numeric or categorical options---i.e., $X_{\theta} \coloneq \{x \ | \ x \text{ is an option for } \theta \}$. By ``sufficiently specified'', we mean that it is possible to provide a high-quality, tailored response to a query containing selected options for each $\theta \in \Theta_i$, and that the marginal change in response associated with the introduction of some logically consistent additional attribute, $\theta^\prime \notin \Theta_i$ would be small.

%For illustrative purposes, the specific $\Theta_i$ that we consider are carefully selected to (a) facilitate objective validation of preference and constraint satisfaction and (b) avoid introducing conditional dependencies among individual $\theta$. We view the extension of the approach we describe here to settings with larger and more causally complex attribute spaces as a valuable direction for future work. \chComment{idk quite how to say this, but we want to establish that this is a stylized setting that can still be informative, and to not have reviewers overindex on the specific $\Theta_i$s and/or how they are not \emph{truly} exhaustive.}

% We conjecture that the approach we describe here can extend to settings with larger, more complex attribute spaces, 

% % The assumption that any finite attribute set can 
%In an effort to control exogenous sources of linguistic variation, we construct parameterized templates for each intent. The first sentence of each template declares the user's intent---i.e., ``I am looking for a gift recommendation''. Each subsequent sentence features a single attribute, $\theta \in \Theta_i$, along with slot(s) to be filled in with selected option(s) for that attribute at query generation time. 

%Note that some attributes admit exactly one selection (e.g., a gift recipient's age), while others admit multiple selections (e.g., a gift recipient's interests). Recommended items are assumed to have features that can be objectively computed for the purpose of determining whether an item satisfies the preference or constraint represented by each query attribute. An item is assumed to satisfy an attribute that admits multiple selections if it satisfies \emph{at least one} selected option (i.e., via logical OR). 

We follow the synthetic query construction process outlined in Section~\ref{sec:synthCorpusConstruction} and formalized in Algorithm~\ref{alg:maskedQueryGen}, along with the parameterized templates shown in Appendix~\ref{app:parameterizedQueryTemplates}, to generate a corpus of sufficiently specified queries. Then, for each sufficiently specified query, we approximate a \emph{partially specified} version by drawing the number of attributes to omit, $n \in \{1, \dots, |\Theta_i|\}$, uniformly at random.
We keep the intent-declaring first sentence unchanged, shuffle the remaining attribute sentences, draw a subset of sentences to \emph{omit}---i.e., a subset with cardinality $n$---and concatenate the remaining sentences.

\begin{minipage}{.99\linewidth}
\begin{algorithm}[H]
\caption{\textsc{Generate Synthetic Query}}
\label{alg:maskedQueryGen}
\begin{algorithmic}[1] 
%\FUNCTION{GenQuery}{$\mathcal{I}, \Theta_{(\cdot)}$}
\STATE \textbf{function} GenQuery($\mathcal{I}, \Theta_{(\cdot)}$)
    \STATE $i \sim \mathcal{I}$ \COMMENT{Draw intent}
    \FOR{$\theta \in \Theta_i$} %\COMMENT{Loop over intent attributes}
      \STATE selected option(s) $\gets X^\prime \sim X_{\theta}$
      \STATE template$_i$ $\gets$ template$_i$ $\cup$ selected option(s)
    \ENDFOR
    \STATE{$q_{s} \gets$ template$_i$} \COMMENT{Sufficient query $\coloneq$ filled-in template}
    \STATE{$n \sim U(\{1, \dots, |\Theta_i|\})$} \COMMENT{Draw \# of attrs to mask}
    \STATE{$\Theta^m_i \sim U(S_n(\Theta_i))$} \COMMENT{Draw $n$ masked attrs}
    \STATE{$\Theta^r_i \coloneq \Theta_i \setminus \Theta^m_i$} \COMMENT{Determine revealed attrs}
    \STATE{$q_{m} \gets \text{concat}(\Theta^r_i)$} \COMMENT{Build masked version of $q_s$}
    \STATE \textbf{return} $q_s, q_m$ \COMMENT{Return sufficient \& masked queries}
%\ENDFUNCTION
\end{algorithmic}
\end{algorithm}
\end{minipage}

\subsection{Synthetic Query Templates and Parameter Options}
\label{app:parameterizedQueryTemplates}
We use the intent-specific templates and parameter category-option mappings shown below in conjunction with the query construction procedure outlined in Appendix~\ref{app:synthetic_data} to generate our synthetic queries:

\par \textbf{Intent-specific query templates:}

\begin{lstlisting}[language=json]
{
      "movie_rec": {
        "value": "I am looking for a movie recommendation.\n
        The genre should be \"{param_0}\".\n
        It should have been released \"{param_1}\".\n
        The intended audience includes \"{param_2}\".\n
        The runtime should be \"{param_3}\".\n
        Please provide movie recommendations that satisfy all of my requirements."
      },

      "gift_rec": {
        "value": "I am looking for a gift recommendation.\n
        The recipient likes \"{param_0}\".\n
        The recipient is \"{param_1}\" years old.\n
        The recipient prefers gifts to be \"{param_2}\" in nature.\n
        My budget to purchase the recipient a gift is in the \"{param_3}\" range.\n
        Please provide gift recommendations that satisfy all of my requirements."
      },

        "plant_rec": {
        "value": "I am looking for a house plant recommendation.\n
        I prefer a plant that \"{param_0}\".\n
        I'm willing to expend a \"{param_1}\" amount of effort to care for the plant.\n
        My house gets a \"{param_2}\" amount of natural light.\n
        I live \"{param_3}\".\n
        Please provide house plant recommendations that satisfy all of my requirements."
      }
}
\end{lstlisting}

\par \textbf{Intent-specific query parameter category-option mappings:}


\begin{lstlisting}[language=json]
{
      "movie_rec": {
        "param_0": {"cat": "genre",
                    "opts": ["Action", "Adventure", "Animation", "Biography", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film Noir", "History", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Sport", "Superhero", "Thriller", "War", "Western"],
                    "max_sel_allowed": 2,
                    "pref_constraint_type": "set_valued"},
        "param_1": {"cat": "release date",
                    "opts": ["in the 1980s", "in the 1990s", "in the 2000s", "in the past few years"],
                    "max_sel_allowed":1,
                    "pref_constraint_type": "numeric_range"},
        "param_2": {"cat": "who will be watching",
                    "opts": ["children", "adults and children", "teenagers younger than 17", "adults only"],
                    "max_sel_allowed": 1,
                    "pref_constraint_type": "set_valued"},
        "param_3": {"cat":  "runtime",
                    "opts": ["less than 90 minutes", "90-104 minutes", "105-119 minutes", "120 minutes or more"],
                    "max_sel_allowed": 1,
                    "pref_constraint_type": "set_valued"}
      },

      "gift_rec": {
        "param_0": {"cat": "recipient interests",
                    "opts":  ["outdoors", "crafts", "technology", "books", "active play/sports/fitness", "food/cooking", "music and arts", "apparel/fashion/style"],
                    "max_sel_allowed":  2,
                    "pref_constraint_type": "set_valued"},
        "param_1": {"cat": "recipient age range",
                    "opts":  ["3-5", "6-12", "13-17", "18-40", "41-60", "61+"],
                    "max_sel_allowed":  1,
                    "pref_constraint_type": "numeric_range"},
        "param_2": {"cat": "recipient preferred gift type",
                    "opts":  ["practical and everyday", "personalized and sentimental", "adventurous and experience-driven", "luxurious and pampering", "high-tech and innovative", "creative and artistic", "sustainable and eco-friendly"],
                    "max_sel_allowed":  1,
                    "pref_constraint_type": "set_valued"},
        "param_3": {"cat": "giver budget",
                    "opts":  ["less than $20", "$20-49", "$50-99", "$100-199", "$200+"],
                    "max_sel_allowed":  1,
                    "pref_constraint_type": "numeric_range"}
      },

      "plant_rec": {
        "param_0": {"cat": "plant physical attributes",
                    "opts": ["has light green leaves", "has dark green leaves", "has multi-colored leaves", "yields edible produce", "produces a single bloom", "produces multiple blooms", "is short in stature", "is tall in stature"],
                    "max_sel_allowed":  2,
                    "pref_constraint_type": "set_valued"},
        "param_1": {"cat": "level of effort",
                    "opts":  ["low", "moderate", "high"],
                    "max_sel_allowed":  1,
                    "pref_constraint_type": "set_valued"},
        "param_2": {"cat": "amount of natural light",
                    "opts":  ["low", "indirect", "moderate", "bright"],
                    "max_sel_allowed":  1,
                    "pref_constraint_type": "set_valued"},
        "param_3": {"cat": "household members",
                    "opts":  ["by myself", "with young children", "with pets", "with a roommate with seasonal allergies"],
                    "max_sel_allowed": 1,
                    "pref_constraint_type": "set_valued"}
      }
}
\end{lstlisting}

% \subsection{Copilot Response Strategies}
% \label{app:meRespStrat}
\pagebreak

\subsection{Stable Diffusion Example: Policy-Induced Responses}
\label{app:stableDiffusion}
%In Section~\ref{sec:subOptDefaultSingleStep}, 
Here, we present a query from the OpenAssistant dataset~\citep{köpf2023openassistant} as an example of a situation where \textsc{Clarify} may be preferred over \textsc{Direct response} and \textsc{Hedge}. 
%We replicate this query for convenience below; 
Table~\ref{tab:stableDiffusionResponses} presents the policy-induced responses generated by $\defaultPolicy{}$, \textsc{Hedge}, and \textsc{Clarify}.

\paragraph{Query text:} ``Please give me a prompt for stable diffusion to generate a good-looking image.''

\begin{table}[!htb]
\begin{tabular}{p{0.25\linewidth} | p{0.7\linewidth}}
\toprule
  \textbf{Response strategy} $\tau \in \mathcal{T}$ & \textbf{Response text} \\
\midrule
baseline & ``Create an image that visually represents the process of stable diffusion, showcasing the gradual and even spread of particles throughout a medium. Use a color gradient to highlight the movement and distribution of particles, and ensure the overall aesthetic is pleasing and balanced.'' \\
\hline
 hedge & The query is asking for a prompt to generate a good-looking image using stable diffusion. However, the important factors that are not specified include:

1. The type of image or subject matter desired (e.g., landscape, portrait, abstract, etc.)
2. The style or aesthetic the user prefers (e.g., realistic, surreal, impressionistic, etc.)
3. The color scheme or palette the user prefers (e.g., warm, cool, monochromatic, etc.)

Given these factors, here are some plausible responses:

1. If the user wants a landscape image in a realistic style with a warm color scheme, a possible prompt could be: ``Generate a sunset over a serene lake with autumn foliage using stable diffusion.''
2. If the user prefers a portrait in a surreal style with a cool color scheme, a possible prompt could be: ``Generate a portrait of a woman with flowing blue hair and eyes that mirror the ocean using stable diffusion.''
3. If the user is interested in an abstract image in a monochromatic color scheme, a possible prompt could be: ``Generate an abstract image using stable diffusion that represents the concept of tranquility in shades of blue.'' \\
\hline 
clarify & 1. What type of image are you looking to create? (e.g., abstract, realistic, geometric)
2. Are there any specific colors or color schemes you would like to incorporate into the image?
3. Do you have any specific themes or elements you want to be included in the image? (e.g., nature, space, patterns) \\
\bottomrule
\end{tabular}
\caption{Policy-induced responses to the stable diffusion example query.}
\label{tab:stableDiffusionResponses}
\end{table}


\subsection{Simulating User Responses to LLM Questions}
\label{app:simulatingUserResponseToQuestions}

In Section~\ref{sec:suboptDefaultPolicyMultiStep}, we discuss how we leverage a series of helper LLM calls to construct templatized responses to LLM questions based on the overlap (or lack thereof) between the questions and the user's true preferences, $\theta$. Here, we provide the system messages used in these helper calls. The first system message is used to extract recommended items and/or questions from LLM responses. The second system message helps us construct a mapping from the LLM's questions to the user's true preferences, $\theta$, so that we can determine which subset of previously masked attributes can be ``revealed'' in the templatized user response we construct.

\begin{lstlisting}[language=json]
{"extract_recs_and_questions": 

    "For each response in this list <list>{{input.""" + f'{field_to_use}' +"""}}</list>, read the response carefully and:\n
            
            1. Extract the titles of each and every movie recommendation that appears; they may show up as a list of titles.\n
                Do not extract any additional metadata but DO extract any mentioned titles; represent each title as a string.\n
            
                Format your answer for task 1 as shown below:\n

                ["rec" for rec in recommended movies] OR [], ONLY if NO movie recommendations appear.

            2. Extract any questions that appear; they may be prefaced with a request to specify preferences, and/or show up as a list of questions.\n
               DO extract ANY mentioned questions; represent each question as a string.
               
                Format your answer for task 2 as shown below:\n

                ["question" for question in questions] OR [], ONLY if NO questions appear.\n
            
           Return your results as a dict:\n

               {"recs": [response to task 1], "questions": [response to task 2]} \n

           DO NOT answer any questions contained in the response, or include ANY expository text.\n
           The result should be DIRECTLY parsable as a valid dict in Python.""""
}
    
\end{lstlisting}

\begin{lstlisting}[language=json]
{
    "map_questions_to_thetas": 
        
        "You will receive a list containing sets of questions.\n
        Each question is issued by an assistant to a user, in response to a movie recommendation request submitted by the user.\n
        For each set of questions in this list <list>{{input.questions}}</list>,\n
        You have the ability to ask the user about their preferences for each of the following movie attributes: 
            [genre, release date, who will be watching, runtime].\n 
        Note that 'who will be watching' is related to the user's preferences for the movie's rating.
        For each set of questions, map each question to one of these attributes IF asking about this specific attribute would allow you to answer the question.\n
        If none of the attributes would give you the information you need to answer a given question, map that question to "None".\n
        Format your response as a list of strings, as shown in the example below:\n
            ['genre', 'release date', 'None'] \n
        DO NOT answer any questions contained in the response, or include any expository text.\n
        The result should be DIRECTLY parsable as a list of strings in Python."
}
    
\end{lstlisting}

% \subsection{Single-turn Response Evaluation Functions}
% \label{app:meEvalFuncs}

% % this corresponds to Section 2.2.4, should we re-name?
% \subsection{Motivating Experiment 1}
% \label{app:meOneClassify}

% \subsubsection{Real-world query corpus}
% \label{app:meThreeReal}
% % TODO: provide information and prompts related to how we construct fully-specified versions of the under-specified/original queries contained in the StackExchange dataset

