\documentclass[11pt]{article}
\usepackage{fullpage,graphicx,psfrag,amsmath,amsfonts,verbatim,bm}
%\usepackage{xcolor}
\usepackage[table,xcdraw]{xcolor}

\input{ICLR/math_commands}
\usepackage[small,bf]{caption}
%\usepackage[table,xcdraw]{xcolor}
\usepackage{natbib}
\bibliographystyle{plainnat}
\usepackage{amsthm,apxproof}
\usepackage{thmtools,amsmath,amsfonts} \usepackage{amssymb}% http://ctan.org/pkg/amssymb
\usepackage{pifont}% http://ctan.org/pkg/pifont
% Check mark and cross symbols, drawn from the pifont dingbat table
% (slots 51 and 55 respectively).
\newcommand*{\cmark}{\ding{51}}
\newcommand*{\xmark}{\ding{55}}
\usepackage{thm-restate}
\usepackage{booktabs}
% \dong{<text>}: inline review note, typeset in bold blue as "[Dong: <text>]".
\newcommand{\dong}[1]{%
  \textcolor{blue}{%
    \textbf{[Dong: #1]}%
  }%
}
\usepackage{algorithmic,algorithm}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{mathtools}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{url}
\usepackage{booktabs}
% Theorem-like environments. Every environment below has its own counter.
% Theorems and lemmas are numbered consecutively through the whole document.
\newtheorem{thm}{Theorem}
\newtheorem{lem}{Lemma}
% The remaining environments are numbered within sections: the counter is
% reset at each \section and printed as <section>.<number>.
\newtheorem{cor}{Corollary}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{asmp}{Assumption}[section]
\newtheorem{defn}{Definition}[section]
\newtheorem{oracle}{Oracle}[section]
\newtheorem{claim}{Claim}[section]
\newtheorem{conj}{Conjecture}[section]
\newtheorem{rem}{Remark}[section]
\newtheorem{example}{Example}[section]
\newtheorem{condition}{Condition}[section]
% amsmath: allow page breaks inside multi-line display equations.
\allowdisplaybreaks
% Snapshot of the footnote-mark format as it is at this point in the preamble.
% \freefootnote no longer relies on it; it is kept so that any other file that
% refers to \svthefootnote keeps working.
\let\svthefootnote\thefootnote
% \freefootnote{<text>}: put <text> at the foot of the page as a footnote
% without a mark. Uses \footnotetext, so the footnote counter is not stepped.
%
% The override of \thefootnote is confined to a group, so whatever definition
% is active at the call site is restored afterwards (rather than being
% overwritten with the preamble-time snapshot).
\newcommand\freefootnote[1]{%
  \begingroup
    % \relax is unexpandable, so the stored mark typesets as nothing.
    \let\thefootnote\relax
    \footnotetext{#1}%
  \endgroup
}
% Title block. An affiliation is attached with \thanks to the first author
% listed from each institution; later authors from the same institution reuse
% that mark through \footnotemark[<n>]. The marks are written directly after
% the name (no space before \thanks / \footnotemark) so that no gap is typeset
% between a name and its superscript mark.
\title{Online Policy Optimization for Robust MDP}
\author{
  Jing Dong\thanks{The Chinese University of Hong Kong, Shenzhen}\\
  \texttt{jingdong@link.cuhk.edu.cn}
  \and
  Jingwei Li\thanks{Tsinghua University}\\
  \texttt{ljw22@mails.tsinghua.edu.cn}
  \and
  Baoxiang Wang\footnotemark[1]\\
  \texttt{bxiangwang@cuhk.edu.cn}
  \and
  Jingzhao Zhang\footnotemark[2]\\
  \texttt{jingzhaoz@mail.tsinghua.edu.cn}
}
\date{}
% The unmarked author-order footnote is issued first, ahead of the title block.
\begin{document} \freefootnote{Authors are listed in alphabetical order.}
\maketitle


\begin{abstract}
Reinforcement learning (RL) has exceeded human performance in many synthetic settings such as video games and Go. However, real-world deployment of end-to-end RL models is less common, as RL models can be very sensitive to slight perturbations of the environment.
The robust Markov decision process (MDP) framework---in which the transition probabilities belong to an uncertainty set around a nominal model---provides one way to develop robust models. 
While previous analyses show that RL algorithms are effective when given access to a generative model, it remains unclear whether RL can be efficient in the more realistic online setting, which requires a careful balance between exploration and exploitation.
In this work, we consider online robust MDPs, in which the learner interacts with an unknown nominal system. We propose a robust optimistic policy optimization algorithm that is provably efficient.
To address the additional uncertainty caused by an adversarial environment, our algorithm features a new optimistic update rule derived via Fenchel conjugates.
Our analysis establishes the first regret bound for online robust MDPs. 
\end{abstract}

% Main body: each section heading is declared here and its text is read from
% a separate file with \input.
\section{Introduction}
\input{intro}

\section{Related work}
\input{related}

\section{Robust MDP and uncertainty sets}
\input{formulation}

\section{Algorithm}
\input{algorithm}

\section{Theoretical results}


\input{theoretical_results}

\section{Empirical results} 
\input{exp}

\section{Conclusion and future directions}
\input{conclusion}


% Reference list built from ref.bib (style is set in the preamble).
\bibliography{ref}

% Start the appendix on a fresh page.
\clearpage

% From here on, sections are numbered as appendices.
\appendix
\input{appendix}



\end{document}
