%\documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
           
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Theorem support. amsthm is loaded BEFORE thmtools on purpose: thmtools only
% activates its amsthm backend when amsthm is already present at load time
% (and warns "amsthm loaded after thmtools" otherwise).
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{thmtools}
\usepackage{thm-restate} % restatable theorem environments
% Check mark / cross mark symbols. \ding comes from pifont, so load it here
% rather than relying on another file to have loaded it.
\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%
% Algorithms, floats and table helpers.
\usepackage{algorithmic,algorithm}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{multirow,multicol}
\input{defs}

\title{Online Policy Optimization for Robust Markov Decision Process}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Jing Dong \thanks{Authors are ordered alphabetically. Correspondence to: Jing Dong, Jingwei Li, Baoxiang Wang, Jingzhao Zhang.}}
\author[2]{Jingwei Li $^\ast$}
\author[1,3]{Baoxiang Wang $^\ast$}
\author[2,4]{Jingzhao Zhang $^\ast$}
% Add affiliations after the authors
\affil[1]{%
    The Chinese University of Hong Kong, Shenzhen
}
\affil[2]{%
    Institute for Interdisciplinary Information Sciences, Tsinghua University
}
\affil[3]{%
    Vector Institute 
}
\affil[4]{%
    Shanghai Qizhi Institute 
}

\begin{document}
\maketitle

\begin{abstract}
Reinforcement learning (RL) has exceeded human performance in many synthetic settings such as video games and Go. However, real-world deployment of end-to-end RL models is less common, as RL models can be very sensitive to perturbations in the environment. 
The robust Markov decision process (MDP) framework---in which the transition probabilities belong to an uncertainty set around a nominal model---provides one way to develop robust models. 
While previous analyses of robust MDPs show that RL algorithms are effective when given access to a generative model, it remains unclear whether RL can be efficient in the more realistic online setting, which requires a careful balance between exploration and exploitation. 
In this work, we consider the online robust MDP setting, in which the learner interacts with an unknown nominal system. We propose a robust optimistic policy optimization algorithm that is provably efficient. 
To address the additional uncertainty caused by an adversarial environment, our model features a new optimistic update rule derived via Fenchel conjugates. 
Our analysis establishes the first regret bound for online robust MDPs. 
\end{abstract}

\section{Introduction}
\input{intro}

\section{Related work}
\input{related}

\section{Robust MDP and uncertainty sets}
\input{formulation}

\section{Algorithm}
\input{algorithm}

\section{Theoretical results}


\input{theoretical_results}

\section{Empirical results} 
\input{exp}
%\section{Experiments}
%\begin{itemize}
%    \item Gridworld environment?
%    \item Range of rho?
%\end{itemize}
\section{Conclusion and future directions}
\input{conclusion}

\section*{Acknowledgement}

Jing Dong and Baoxiang Wang are partially supported by the National Natural Science Foundation of China (62106213, 72394361).


% References
\bibliography{uai2024-template}

% \newpage




\onecolumn
\appendix
\title{Online Policy Optimization for Robust Markov Decision Process\\(Supplementary Material)}
\maketitle


%\vspace{-4cm}
\input{appendix}

\end{document}
