
\documentclass{article} % For LaTeX2e
\usepackage{iclr2023_conference,times}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{ICLR/math_commands}
\usepackage[small,bf]{caption}
%\usepackage[table,xcdraw]{xcolor}
\usepackage{amsthm,apxproof}
\usepackage{thmtools,amsmath,amsfonts} 
\usepackage{amssymb}% http://ctan.org/pkg/amssymb
\usepackage{pifont}% http://ctan.org/pkg/pifont
\newcommand{\cmark}{\ding{51}}% check mark: pifont dingbat 51 (the trailing % suppresses the end-of-line space)
\newcommand{\xmark}{\ding{55}}% cross mark: pifont dingbat 55
\usepackage{thm-restate}
\usepackage{booktabs}
% Inline reviewer note: \dong{text} prints "[Dong: text]" in bold blue.
% \textcolor is provided by color/xcolor; the only explicit xcolor load in this
% preamble is commented out, so load it here. It is loaded without options so
% that it cannot cause an option clash if another package already loaded it.
\usepackage{xcolor}
\newcommand{\dong}[1]{\textcolor{blue}{\textbf{[Dong: #1]}}}
\usepackage{algorithmic,algorithm}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{wrapfig}
\usepackage{mathtools}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{url}
\usepackage{booktabs}
% Theorem-like environments.
% thm and lem are declared without a [section] argument, so each is numbered
% consecutively across the whole document (Theorem 1, Lemma 1, ...).
\newtheorem{thm}{Theorem}
\newtheorem{lem}{Lemma}
% The remaining environments are numbered within the current section
% (e.g. Corollary 3.1). None of them shares a counter with another
% environment, so each kind restarts at 1 in every section independently.
\newtheorem{cor}{Corollary}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{asmp}{Assumption}[section]
\newtheorem{defn}{Definition}[section]
\newtheorem{oracle}{Oracle}[section]
\newtheorem{claim}{Claim}[section]
\newtheorem{conj}{Conjecture}[section]
\newtheorem{rem}{Remark}[section]
\newtheorem{example}{Example}[section]
\newtheorem{condition}{Condition}[section]

\title{Online Policy Optimization for Robust MDP}

% Authors must not appear in the submitted version. They should be hidden
% as long as the \iclrfinalcopy macro remains commented out below.
% Non-anonymous submissions will be rejected without review.

% NOTE(review): the names, affiliations and e-mail addresses below look like
% the conference template's placeholder authors -- confirm and replace them
% with the real author list before enabling \iclrfinalcopy.
\author{Antiquus S.~Hippocampus, Natalia Cerebro \& Amelie P. Amygdale \thanks{ Use footnote for providing further information
about author (webpage, alternative address)---\emph{not} for acknowledging
funding agencies.  Funding acknowledgements go at the end of the paper.} \\
Department of Computer Science\\
Cranberry-Lemon University\\
Pittsburgh, PA 15213, USA \\
\texttt{\{hippo,brain,jen\}@cs.cranberry-lemon.edu} \\
\And
Ji Q. Ren \& Yevgeny LeNet \\
Department of Computational Neuroscience \\
University of the Witwatersrand \\
Joburg, South Africa \\
\texttt{\{robot,net\}@wits.ac.za} \\
\AND
Coauthor \\
Affiliation \\
Address \\
\texttt{email}
}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to \LaTeX{} to determine where to break
% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
% puts 3 of 4 authors names on the first line, and the last on the second
% line, try using \AND instead of \And before the third author name.

% Editing flags: \fix and \new each put a margin note ("FIX" / "NEW") next to
% the line where they are used.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

%\iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}


\maketitle

\begin{abstract}
Reinforcement learning (RL) has exceeded human performance in many synthetic settings such as video games and Go. However, real-world deployment of end-to-end RL models is less common, as RL models can be very sensitive to slight perturbations of the environment. 
The robust Markov decision process (MDP) framework---in which the transition probabilities belong to an uncertainty set around a nominal model---provides one way to develop robust models. 
While previous analysis shows RL algorithms are effective assuming access to a generative model, it remains unclear whether RL can be efficient under a more realistic online setting, which requires a careful balance between exploration and exploitation. 
In this work, we consider online robust MDPs, in which the agent learns by interacting with an unknown nominal system. We propose a robust optimistic policy optimization algorithm that is provably efficient. 
To address the additional uncertainty caused by an adversarial environment, our algorithm features a new optimistic update rule derived via Fenchel conjugates. 
Our analysis establishes the first regret bound for online robust MDPs. 
\end{abstract}

\section{Introduction}
\input{intro}

\section{Related work}
\input{related}

\section{Robust MDP and uncertainty sets}
\input{formulation}

\section{Algorithm}
\input{algorithm}

\section{Theoretical results}


\input{theoretical_results}

\section{Empirical results} 
\input{exp}
%\section{Experiments}
%\begin{itemize}
%    \item Gridworld environment?
%    \item Range of rho?
%\end{itemize}
\vspace{-0.5cm}
\section{Conclusion and future directions}
\input{conclusion}

\bibliography{iclr2023_conference}
\bibliographystyle{iclr2023_conference}

\clearpage

\appendix
\input{appendix}

\end{document}
