{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1753b0d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from src.rl_agent import RLAgent\n",
    "from src.state_representation import StateRepresentation\n",
    "\n",
    "from src.env_pendulum import EnvironmentPendulum\n",
    "from src.env_redpillbluepill import EnvironmentRedPillBluePill\n",
    "\n",
    "from src.rl_experiments import RLExperiments"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e773e23",
   "metadata": {},
   "source": [
    "## Red-Pill Blue-Pill (RED CVaR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "680d8232",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Red-Pill Blue-Pill with the CVaR agent, single quantile (var_quantile=0.25).\n",
    "# The result is stored in `df_rpbp_cvar`, which the performance-figure cell reads.\n",
    "\n",
    "# plain-data settings\n",
    "states = ['redworld', 'blueworld']\n",
    "actions = ['red_pill', 'blue_pill']\n",
    "policy = None\n",
    "step_sizes = {'value': 0.02, 'avg_reward': 0.01, 'var': 0.01}\n",
    "\n",
    "# environment\n",
    "env = EnvironmentRedPillBluePill()\n",
    "\n",
    "# tabular q_learning agent with average reward and CVaR switched on\n",
    "agent = RLAgent(\n",
    "    agent_type='q_learning',\n",
    "    states=states,\n",
    "    actions=actions,\n",
    "    policy=policy,\n",
    "    use_average_reward=True,\n",
    "    policy_type='tabular',\n",
    "    value_type='tabular',\n",
    "    use_cvar=True,\n",
    "    initial_avg_reward=0.0,\n",
    "    var_quantile=0.25,\n",
    "    initial_var_reward=0.0,\n",
    ")\n",
    "\n",
    "# experiment runner and the run itself\n",
    "rl_experiments = RLExperiments()\n",
    "df_rpbp_cvar = rl_experiments.redpillbluepill(\n",
    "    agent,\n",
    "    env,\n",
    "    num_runs=50,\n",
    "    max_steps=100000,\n",
    "    epsilon=0.1,\n",
    "    step_size=step_sizes,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7b043e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Red-Pill Blue-Pill with the CVaR agent, swept over several quantiles (tau).\n",
    "# One result per tau is stored in `tau_experiment_results`, which the\n",
    "# `get_tau_results_figure` cell near the end of the notebook reads.\n",
    "\n",
    "# define environment\n",
    "env = EnvironmentRedPillBluePill()\n",
    "\n",
    "# define agent inputs\n",
    "actions = ['red_pill', 'blue_pill']\n",
    "states = ['redworld', 'blueworld']\n",
    "policy = None\n",
    "\n",
    "# Sweep settings. They do not change between taus, so they are defined once\n",
    "# here instead of being rebuilt on every loop iteration.\n",
    "taus = [0.1, 0.25, 0.5, 0.75, 0.85, 0.9]\n",
    "step_sizes = {\n",
    "    'value': 0.02,\n",
    "    'avg_reward': 0.01,\n",
    "    # NOTE(review): the single-tau cell uses 0.01 for 'var' - confirm 0.1 is intended.\n",
    "    'var': 0.1,\n",
    "}\n",
    "\n",
    "tau_experiment_results = {}\n",
    "for tau in taus:\n",
    "    print(tau)  # progress marker: the quantile about to be run\n",
    "\n",
    "    # a new agent for every tau; only var_quantile differs between iterations\n",
    "    agent = RLAgent(agent_type='q_learning',\n",
    "                    states=states,\n",
    "                    actions=actions,\n",
    "                    policy=policy,\n",
    "                    use_average_reward=True,\n",
    "                    policy_type='tabular',\n",
    "                    value_type='tabular',\n",
    "                    use_cvar=True,\n",
    "                    initial_avg_reward=0.0,\n",
    "                    var_quantile=tau,\n",
    "                    initial_var_reward=0.0)\n",
    "\n",
    "    # run experiment (a new runner per tau, as before)\n",
    "    rl_experiments = RLExperiments()\n",
    "\n",
    "    # num_runs=10 here corresponds to n_runs=10 in the tau figure cell;\n",
    "    # presumably the two must stay in sync - confirm.\n",
    "    # dict(step_sizes) hands each run its own copy of the shared settings.\n",
    "    df_rpbp_tau = rl_experiments.redpillbluepill(agent, env,\n",
    "                                                 num_runs=10,\n",
    "                                                 max_steps=100000,\n",
    "                                                 epsilon=0.1,\n",
    "                                                 step_size=dict(step_sizes))\n",
    "\n",
    "    tau_experiment_results[tau] = df_rpbp_tau"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09d42f98",
   "metadata": {},
   "source": [
    "## Red-Pill Blue-Pill (Differential)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46ae2afc",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Red-Pill Blue-Pill with the differential agent (use_cvar=False).\n",
    "# The result is stored in `df_rpbp_reg`, which the performance-figure cell reads.\n",
    "\n",
    "# plain-data settings\n",
    "states = ['redworld', 'blueworld']\n",
    "actions = ['red_pill', 'blue_pill']\n",
    "policy = None\n",
    "step_sizes = {'value': 0.0002, 'avg_reward': 1, 'var': 1}\n",
    "\n",
    "# environment\n",
    "env = EnvironmentRedPillBluePill()\n",
    "\n",
    "# tabular q_learning agent with average reward; no CVaR arguments are passed\n",
    "agent = RLAgent(\n",
    "    agent_type='q_learning',\n",
    "    states=states,\n",
    "    actions=actions,\n",
    "    policy=policy,\n",
    "    use_average_reward=True,\n",
    "    policy_type='tabular',\n",
    "    value_type='tabular',\n",
    "    use_cvar=False,\n",
    "    initial_avg_reward=0.0,\n",
    ")\n",
    "\n",
    "# experiment runner and the run itself\n",
    "rl_experiments = RLExperiments()\n",
    "df_rpbp_reg = rl_experiments.redpillbluepill(\n",
    "    agent,\n",
    "    env,\n",
    "    num_runs=50,\n",
    "    max_steps=100000,\n",
    "    epsilon=0.1,\n",
    "    step_size=step_sizes,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "278ee01e",
   "metadata": {},
   "source": [
    "## Inverted Pendulum (RED CVaR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de3cde50",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Inverted pendulum with the CVaR agent (use_cvar=True, var_quantile=0.1).\n",
    "# The result is stored in `df_pendulum_cvar`, which the performance-figure cell reads.\n",
    "\n",
    "# tile-coding sizes and step sizes (plain data)\n",
    "num_tiles = 8\n",
    "num_tilings = 32\n",
    "iht_size = 4096\n",
    "step_sizes = {'value': 0.002, 'policy': 1, 'avg_reward': 0.01, 'var': 0.01}\n",
    "\n",
    "# environment comes first: its `min_max_velocity` bounds the state representation\n",
    "env = EnvironmentPendulum()\n",
    "\n",
    "# state representation: pose spans [-pi, pi], velocity spans +/- env.min_max_velocity\n",
    "state_representation = StateRepresentation(\n",
    "    num_tiles=num_tiles,\n",
    "    num_tilings=num_tilings,\n",
    "    iht_size=iht_size,\n",
    "    min_pose=-1 * np.pi,\n",
    "    max_pose=np.pi,\n",
    "    min_vel=-1 * env.min_max_velocity,\n",
    "    max_vel=env.min_max_velocity,\n",
    ")\n",
    "\n",
    "# agent with function approximation (linear policy and value, SGD updates)\n",
    "actions = ['accelerate_left', 'dont_accelerate', 'accelerate_right']\n",
    "states = np.zeros((iht_size, 1))  # zero column with iht_size rows\n",
    "policy = None\n",
    "agent = RLAgent(\n",
    "    agent_type='td',\n",
    "    states=states,\n",
    "    actions=actions,\n",
    "    policy=policy,\n",
    "    use_average_reward=True,\n",
    "    policy_type='linear',\n",
    "    policy_softmax_tau=1.0,\n",
    "    policy_update_type='stochastic_gradient_descent',\n",
    "    value_type='linear',\n",
    "    value_update_type='stochastic_gradient_descent',\n",
    "    use_cvar=True,\n",
    "    initial_avg_reward=0.0,\n",
    "    var_quantile=0.1,\n",
    "    initial_var_reward=0.0,\n",
    ")\n",
    "\n",
    "# experiment runner and the run itself\n",
    "rl_experiments = RLExperiments()\n",
    "df_pendulum_cvar = rl_experiments.pendulum(\n",
    "    agent,\n",
    "    env,\n",
    "    state_representation=state_representation.dynamicstate_tilecoding,\n",
    "    num_runs=50,\n",
    "    max_steps=100000,\n",
    "    step_size=step_sizes,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b0515d3",
   "metadata": {},
   "source": [
    "## Inverted Pendulum (Differential)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40cbfcff",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Inverted pendulum with the differential agent (use_cvar=False).\n",
    "# The result is stored in `df_pendulum_reg`, which the performance-figure cell reads.\n",
    "\n",
    "# tile-coding sizes and step sizes (plain data)\n",
    "num_tiles = 8\n",
    "num_tilings = 32\n",
    "iht_size = 4096\n",
    "step_sizes = {'value': 0.002, 'policy': 1, 'avg_reward': 0.001, 'var': 1}\n",
    "\n",
    "# environment comes first: its `min_max_velocity` bounds the state representation\n",
    "env = EnvironmentPendulum()\n",
    "\n",
    "# state representation: pose spans [-pi, pi], velocity spans +/- env.min_max_velocity\n",
    "state_representation = StateRepresentation(\n",
    "    num_tiles=num_tiles,\n",
    "    num_tilings=num_tilings,\n",
    "    iht_size=iht_size,\n",
    "    min_pose=-1 * np.pi,\n",
    "    max_pose=np.pi,\n",
    "    min_vel=-1 * env.min_max_velocity,\n",
    "    max_vel=env.min_max_velocity,\n",
    ")\n",
    "\n",
    "# agent with function approximation (linear policy and value, SGD updates)\n",
    "actions = ['accelerate_left', 'dont_accelerate', 'accelerate_right']\n",
    "states = np.zeros((iht_size, 1))  # zero column with iht_size rows\n",
    "policy = None\n",
    "agent = RLAgent(\n",
    "    agent_type='td',\n",
    "    states=states,\n",
    "    actions=actions,\n",
    "    policy=policy,\n",
    "    use_average_reward=True,\n",
    "    policy_type='linear',\n",
    "    policy_softmax_tau=1.0,\n",
    "    policy_update_type='stochastic_gradient_descent',\n",
    "    value_type='linear',\n",
    "    value_update_type='stochastic_gradient_descent',\n",
    "    use_cvar=False,\n",
    "    initial_avg_reward=0.0,\n",
    ")\n",
    "\n",
    "# experiment runner and the run itself\n",
    "rl_experiments = RLExperiments()\n",
    "df_pendulum_reg = rl_experiments.pendulum(\n",
    "    agent,\n",
    "    env,\n",
    "    state_representation=state_representation.dynamicstate_tilecoding,\n",
    "    num_runs=50,\n",
    "    max_steps=100000,\n",
    "    step_size=step_sizes,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "97931e69",
   "metadata": {},
   "source": [
    "## Figures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f82aef4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Performance figure for Red-Pill Blue-Pill from the two result sets produced\n",
    "# above: `df_rpbp_cvar` (CVaR run) and `df_rpbp_reg` (differential run).\n",
    "# quantile=0.25 is the same value as var_quantile in the single-tau CVaR cell.\n",
    "rl_experiments = RLExperiments()\n",
    "rl_experiments.get_performance_figure(\n",
    "    experiment='redpillbluepill',\n",
    "    df_cvar=df_rpbp_cvar,\n",
    "    df_reg=df_rpbp_reg,\n",
    "    quantile=0.25,\n",
    "    rolling_average_amount=1000,\n",
    "    x_max=50000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a17fdc37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Performance figure for the inverted pendulum from the two result sets produced\n",
    "# above: `df_pendulum_cvar` (CVaR run) and `df_pendulum_reg` (differential run).\n",
    "# quantile=0.1 is the same value as var_quantile in the pendulum CVaR cell.\n",
    "rl_experiments = RLExperiments()\n",
    "rl_experiments.get_performance_figure(\n",
    "    experiment='pendulum',\n",
    "    df_cvar=df_pendulum_cvar,\n",
    "    df_reg=df_pendulum_reg,\n",
    "    quantile=0.1,\n",
    "    rolling_average_amount=1000,\n",
    "    x_max=15000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fc37bdc",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# 3D plot for Red-Pill Blue-Pill with the CVaR agent (var_quantile=0.25),\n",
    "# requested for four labelled cases of initial values.\n",
    "\n",
    "# plain-data settings\n",
    "states = ['redworld', 'blueworld']\n",
    "actions = ['red_pill', 'blue_pill']\n",
    "policy = None\n",
    "step_sizes = {'value': 0.02, 'avg_reward': 0.01, 'var': 0.01}\n",
    "\n",
    "# One row per case: (label, init_avg_reward, init_var_reward, color).\n",
    "case_specs = [\n",
    "    ('case a)', 0.0, -1.25, '#4F1379'),\n",
    "    ('case b)', 1.0, -1.0, '#9A2E7F'),\n",
    "    ('case c)', 0.0, 0.0, '#D7486C'),\n",
    "    ('case d)', -1.0, 0.0, '#FEB780'),\n",
    "]\n",
    "\n",
    "# Expand the rows into the nested dict handed to get_3d_plot;\n",
    "# every case uses the same z_max.\n",
    "plots = {\n",
    "    label: {\n",
    "        'init_avg_reward': init_avg,\n",
    "        'init_var_reward': init_var,\n",
    "        'z_max': 100000,\n",
    "        'color': color,\n",
    "    }\n",
    "    for label, init_avg, init_var, color in case_specs\n",
    "}\n",
    "\n",
    "# environment\n",
    "env = EnvironmentRedPillBluePill()\n",
    "\n",
    "# agent\n",
    "# NOTE(review): the agent is built with initial_var_reward=-1.0 while `plots`\n",
    "# carries its own initial values per case - presumably get_3d_plot applies\n",
    "# those; confirm against RLExperiments.\n",
    "agent = RLAgent(\n",
    "    agent_type='q_learning',\n",
    "    states=states,\n",
    "    actions=actions,\n",
    "    policy=policy,\n",
    "    use_average_reward=True,\n",
    "    policy_type='tabular',\n",
    "    value_type='tabular',\n",
    "    use_cvar=True,\n",
    "    initial_avg_reward=0.0,\n",
    "    var_quantile=0.25,\n",
    "    initial_var_reward=-1.0,\n",
    ")\n",
    "\n",
    "# experiment runner and the plot call\n",
    "rl_experiments = RLExperiments()\n",
    "rl_experiments.get_3d_plot(\n",
    "    agent,\n",
    "    env,\n",
    "    num_runs=1,\n",
    "    max_steps=100000,\n",
    "    epsilon=0.1,\n",
    "    step_size=step_sizes,\n",
    "    plots=plots,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bccea16d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CVaR comparison on Red-Pill Blue-Pill: one run with the CVaR agent\n",
    "# (var_quantile=0.25) through `cvar_redpillbluepill_comparison`.\n",
    "\n",
    "# plain-data settings\n",
    "states = ['redworld', 'blueworld']\n",
    "actions = ['red_pill', 'blue_pill']\n",
    "policy = None\n",
    "step_sizes = {'value': 0.02, 'avg_reward': 0.01, 'var': 0.01}\n",
    "\n",
    "# environment\n",
    "env = EnvironmentRedPillBluePill()\n",
    "\n",
    "# tabular q_learning agent with average reward and CVaR switched on\n",
    "agent = RLAgent(\n",
    "    agent_type='q_learning',\n",
    "    states=states,\n",
    "    actions=actions,\n",
    "    policy=policy,\n",
    "    use_average_reward=True,\n",
    "    policy_type='tabular',\n",
    "    value_type='tabular',\n",
    "    use_cvar=True,\n",
    "    initial_avg_reward=0.0,\n",
    "    var_quantile=0.25,\n",
    "    initial_var_reward=0.0,\n",
    ")\n",
    "\n",
    "# experiment runner and the comparison call\n",
    "rl_experiments = RLExperiments()\n",
    "rl_experiments.cvar_redpillbluepill_comparison(\n",
    "    agent=agent,\n",
    "    env=env,\n",
    "    # alternative value left by the author: 'cvar_rpbp_estimates_hardcoded'\n",
    "    experiment='cvar_rpbp_estimates_reg',\n",
    "    num_runs=1,\n",
    "    max_steps=100000,\n",
    "    epsilon=0.1,\n",
    "    step_size=step_sizes,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98937267",
   "metadata": {},
   "outputs": [],
   "source": [
    "# CVaR comparison on the inverted pendulum: one run of 500000 max steps with\n",
    "# the CVaR agent (var_quantile=0.1) through `cvar_pendulum_comparison`.\n",
    "\n",
    "# tile-coding sizes and step sizes (plain data)\n",
    "num_tiles = 8\n",
    "num_tilings = 32\n",
    "iht_size = 4096\n",
    "step_sizes = {'value': 0.002, 'policy': 1, 'avg_reward': 0.01, 'var': 0.01}\n",
    "\n",
    "# environment comes first: its `min_max_velocity` bounds the state representation\n",
    "env = EnvironmentPendulum()\n",
    "\n",
    "# state representation: pose spans [-pi, pi], velocity spans +/- env.min_max_velocity\n",
    "state_representation = StateRepresentation(\n",
    "    num_tiles=num_tiles,\n",
    "    num_tilings=num_tilings,\n",
    "    iht_size=iht_size,\n",
    "    min_pose=-1 * np.pi,\n",
    "    max_pose=np.pi,\n",
    "    min_vel=-1 * env.min_max_velocity,\n",
    "    max_vel=env.min_max_velocity,\n",
    ")\n",
    "\n",
    "# agent with function approximation (linear policy and value, SGD updates)\n",
    "actions = ['accelerate_left', 'dont_accelerate', 'accelerate_right']\n",
    "states = np.zeros((iht_size, 1))  # zero column with iht_size rows\n",
    "policy = None\n",
    "agent = RLAgent(\n",
    "    agent_type='td',\n",
    "    states=states,\n",
    "    actions=actions,\n",
    "    policy=policy,\n",
    "    use_average_reward=True,\n",
    "    policy_type='linear',\n",
    "    policy_softmax_tau=1.0,\n",
    "    policy_update_type='stochastic_gradient_descent',\n",
    "    value_type='linear',\n",
    "    value_update_type='stochastic_gradient_descent',\n",
    "    use_cvar=True,\n",
    "    initial_avg_reward=0.0,\n",
    "    var_quantile=0.1,\n",
    "    initial_var_reward=0.0,\n",
    ")\n",
    "\n",
    "# experiment runner and the comparison call\n",
    "rl_experiments = RLExperiments()\n",
    "rl_experiments.cvar_pendulum_comparison(\n",
    "    agent=agent,\n",
    "    env=env,\n",
    "    # alternative value left by the author: 'cvar_pendulum_estimates_hardcoded'\n",
    "    experiment='cvar_pendulum_estimates_reg',\n",
    "    state_representation=state_representation.dynamicstate_tilecoding,\n",
    "    num_runs=1,\n",
    "    max_steps=500000,\n",
    "    step_size=step_sizes,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "938fce3f",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Compare CVaR values of the red and blue policies (estimated using Monte Carlo).\n",
    "# NOTE(review): the keyword below is spelled `epsillon` (double l), unlike the\n",
    "# `epsilon` keyword used by the other experiment calls. Presumably it matches the\n",
    "# parameter name of RLExperiments.get_cvar_by_tau_plot - confirm before renaming.\n",
    "rl_experiments = RLExperiments()\n",
    "rl_experiments.get_cvar_by_tau_plot(n_samples=100000, epsillon=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0de99d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Figure for the tau experiment, drawn from `tau_experiment_results`\n",
    "# (filled by the multiple-taus cell in the RED CVaR section).\n",
    "# n_runs=10 is the same number as num_runs in that cell.\n",
    "rl_experiments = RLExperiments()\n",
    "rl_experiments.get_tau_results_figure(\n",
    "    experiment='rpbp_by_tau',\n",
    "    results_dict=tau_experiment_results,\n",
    "    n_runs=10,\n",
    "    rolling_average_amount=1000,\n",
    "    x_max=50000,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22554a81",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
