%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\usepackage{multirow}%add
%\usepackage[colorlinks,linkcolor=red]{hyperref}
\usepackage{float}

\externaldocument{sigproc2022}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{An Effective Negotiating Agent Framework based on Deep Offline Reinforcement Learning\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:siqichen@tju.edu.cn?Subject=An Effective Negotiating Agent Framework based on Deep Offline Reinforcement Learning}{Siqi Chen}{}}
\author[1]{Jianing Zhao}
\author[2]{Gerhard Weiss}
\author[1]{Ran Su}
\author[3]{Kaiyou Lei\thanks{Corresponding author: Kaiyou Lei (\texttt{kylei2022@163.com}).}}
% Add affiliations after the authors
\affil[1]{%
    College of Intelligence and Computing\\
    Tianjin University\\
    Tianjin, China
}
\affil[2]{%
    Department of Advanced Computing Sciences\\ 
    Maastricht University\\
    Maastricht, the Netherlands
}
\affil[3]{%
    College of Computer and Information Science\\ 
    Southwest University\\
    Chongqing, China
  }

  
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 
%\href{https://github.com/vitchyr/rlkit}{(https://github.com/vitchyr/rlkit)}
\appendix


\section{Experimental Setup Details} 
\label{sec:exp}
Overall, all of our implementations are based on rlkit\footnote{See \url{https://github.com/vitchyr/rlkit}.}, a collection of reinforcement learning techniques.
Using the same library helps us reduce the impact of implementation differences.
We created several agents that are trained by a reinforcement learning algorithm. 
SAC is used to implement all agents, including baseline agents. In addition, we train a neural network called the density ratio estimator to estimate density ratios of samples. 
This network can calculate the priority of different trajectories in the reinforcement learning process, allowing the online fine-tuning process to safely utilize online samples by leveraging relevant, near-on-policy offline samples.
First, we present our DOREA agent and baseline agents' detailed implementation and corresponding hyper-parameters. Second, we present the model structure and hyper-parameters for the neural network-based density ratio estimator.
\subsection{IMPLEMENTATION DETAILS FOR ALL AGENTS}
Since all of the agents below are based on SAC, Table~\ref{tab:para2} shows the general hyper-parameters used in our experiments and training process.

\textbf{Offline RL.} 
For training offline DOREA agents, we train the ensemble DOREA w/o sft by training $N=5$ (the ensemble size) CQL-based Q-functions and policies with 5 random seeds separately and then combining them.
For every offline strategy, we use 2-layer MLPs for the value and policy networks.
Other parameter settings are identical to the setup of \cite{kumar2020conservative}.
See Table \ref{tab:para1} for specific parameters.
% For every offline strategies, we implement them based on the open source SAC version\href{https://github.com/vitchyr/rlkit}{(https://github.com/vitchyr/rlkit)}, other parameter settings are identical to the setup of \cite{kumar2020conservative}.See Section \ref{sec:parameter} for specific parameters.

\textbf{Online fine-tuning.} 
For the fine-tuning process, we initialize the parameters of the ensemble agent with the parameters of the offline DOREA w/o sft.
We also use ensemble size $N=5$, and train for 1000 steps every time 1000 additional samples are collected.
The model has the same network structure, and we employ the Adam optimizer \citep{kingma2014adam} with a policy learning rate of 3e-4 and a value learning rate of 3e-4.
See Table \ref{tab:para1} for specific parameters and architecture.

\textbf{Baseline.} For SAC-sft, we initialize its strategy with an offline-trained CQL agent (we do not use an ensemble here).
Additionally, both SAC and SAC-sft use the implementation from rlkit with the default hyper-parameters described in Table~\ref{tab:para2}.

Furthermore, it should be noted that all of the above agents use a trainable temperature factor $\alpha$ in our SAC version.
Besides this, two critic networks with the same structure are created, each with its own layers and weights.
The second critic network is commonly referred to as the target critic network.
The weights of the critic network are copied, with smoothing via the \emph{soft target update} coefficient $\tau$, to the target critic network every \emph{target update period} training steps.

\subsection{IMPLEMENTATION DETAILS FOR DENSITY RATIO ESTIMATOR}
\textbf{Density ratio estimator.}
In the balanced replay (BR) component, the density ratio estimation network $w_\psi(s,a)$ is designed as a 2-layer MLP; we train it with a batch size of 256 (i.e., 256 offline samples and 256 online samples) and a learning rate of 3e-4.
We apply self-normalization to the estimated density ratio over $\mathcal{B}^{\mathrm{off}}$ and calculate the priority values as
\begin{equation}
\widetilde{w}_{\psi}(x)=\frac{w_{\psi}(x)^{1/T}}{{E}_{x \sim P}\left[w_{\psi}(x)^{1/T}\right]},
\label{eq:x}
\end{equation}
where $x$ and $P$ denote $(s,a)$ and $\mathcal{B}^{\mathrm{off}}$, respectively, and $T$ is the temperature hyper-parameter, which we set to 5.
Before starting fine-tuning, we add the offline samples to the replay buffer with a priority of 1.0.
To ensure that new online samples can be used for updates at the initial stage, we set a high default priority for newly added samples.
Specifically, let $M$ denote the size of the offline buffer.
We set the default priority such that the initial 1000 online samples collected have probability $\rho$ of being seen, where $\rho$ is a hyper-parameter, i.e., we use the priority value $P_{0}:=\frac{M}{1000} \cdot \frac{\rho}{1-\rho}$.
We used $\rho=0.75$.
After online samples are used in an RL update, their priorities are updated accordingly, and the default priority value is then set to the maximum priority value seen during fine-tuning.
For the detailed algorithm, please refer to~\cite{lee22d}.



% \section{PERFORMANCE OF DOREA WITH CHANGES IN THE OPPONENT STRATEGIES WITH ANAC2022 WINNER}  
% need?

\section{HYPER-PARAMETERS} 
\label{sec:parameter}
\begin{table}[H]    
\centering    
\caption{Specific hyper-parameters for the different algorithms. CQL and BR stand for offline and online training, respectively. The meaning of the hyper-parameter names can be found in the original paper.}
\begin{tabular}{cll}  
\toprule 
\multicolumn{1}{l}{Algo} & Hyper-param name & Value \\ \hline     
\multirow{2}[0]{*}{CQL} & conservative weight & 10 \\           
& \# of actions sampled & 10 \\     \hline 
\multirow{5}[0]{*}{BR} & offline buffer size & 2e6 \\            
& online buffer size & 2e6 \\           
& density ratio estimation network arch. &  [$|\mathcal{S}|+|\mathcal{A}|$, 256, 256, 1] \\
& density ratio estimation network temp $T$ & 5 \\ 
& $\rho$ & 0.75 \\ 
\bottomrule 
\end{tabular} 
\label{tab:para1} 
\end{table}  
\begin{table}[H]     
\centering     
\caption{General hyper-parameters. CQL and BR stand for offline and online training, respectively.}
\begin{tabular}{l|cccc}   
\toprule  
& \multicolumn{1}{l|}{CQL} & \multicolumn{1}{l}{BR} & \multicolumn{1}{l}{SAC-sft} & \multicolumn{1}{l}{SAC(scratch)} \\ \hline
Phase & \multicolumn{1}{l|}{offline} & \multicolumn{3}{c}{online} \\ \hline         
\multicolumn{5}{c}{General hyper-params} \\    \hline    
$\pi$ Arch. & \multicolumn{4}{c}{[$|\mathcal{S}|+|\mathcal{A}|$,256,256,1]} \\
$Q$ Arch.     & \multicolumn{4}{c}{[$|\mathcal{S}|+|\mathcal{A}|$,256,256,1]} \\ \hline
\# $Q$ nets & \multicolumn{4}{c}{2} \\
\# target update period & \multicolumn{4}{c}{1} \\      
soft target update $\tau$ & \multicolumn{4}{c}{0.005} \\\hline
Activation & \multicolumn{4}{c}{ReLU} \\      
Optimizer & \multicolumn{4}{c}{Adam for all} \\       
Adam params & \multicolumn{4}{c}{betas = (0.9, 0.999), eps = 1e-8, weight decay = 0} \\   \hline     $\pi$ lr  & \multicolumn{1}{c|}{1e-4} & \multicolumn{3}{c}{3e-4} \\       
$Q$ lr  & \multicolumn{1}{c|}{3e-4} & \multicolumn{3}{c}{3e-4} \\       
$\alpha$ lr  & \multicolumn{1}{c|}{1e-4} & \multicolumn{3}{c}{3e-4} \\  \hline      
\# epochs &  \multicolumn{4}{c}{1000} \\      
\# step/epoch & \multicolumn{4}{c}{1000} \\       
\# batch/step & \multicolumn{4}{c}{1} \\        
Batch size & \multicolumn{4}{c}{256} \\ \hline        
\multicolumn{5}{c}{Hyper-params for the base SAC impl.} \\   \hline     
Entropy target H & \multicolumn{4}{c}{$-|\mathcal{A}|$} \\
Uni-modal Gaussian & \multicolumn{4}{c}{Yes} \\
Squashed Gaussian & \multicolumn{4}{c}{Yes} \\     
\bottomrule 
\end{tabular}   
\label{tab:para2}  
\end{table}


\bibliography{sigproc2022}

\end{document}
