{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gym\n",
    "import numpy as np\n",
    "import torch\n",
    "import torchkit.pytorch_utils as ptu\n",
    "\n",
    "# import environments\n",
    "import envs.pomdp\n",
    "\n",
    "# import recurrent model-free RL (separate architecture)\n",
    "from policies.models.policy_rnn import ModelFreeOffPolicy_Separate_RNN as Policy_RNN\n",
    "\n",
    "# import the replay buffer\n",
    "from buffers.seq_replay_buffer_vanilla import SeqReplayBuffer\n",
    "from utils import helpers as utl"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build a POMDP environment: Pendulum-V (only observe the velocity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<TimeLimit<POMDPWrapper<TimeLimit<PendulumEnv<Pendulum-V-v0>>>>> 1 1 200\n"
     ]
    }
   ],
   "source": [
    "# Device selection: set cuda_id to -1 to run on cpu.\n",
    "cuda_id = 0\n",
    "ptu.set_gpu_mode(\n",
    "    torch.cuda.is_available() and cuda_id >= 0,\n",
    "    cuda_id,\n",
    ")\n",
    "\n",
    "# Environment handle plus the sizes the agent and the rollout loop are built from.\n",
    "env_name = \"Pendulum-V-v0\"\n",
    "env = gym.make(env_name)\n",
    "max_trajectory_len = env._max_episode_steps\n",
    "obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]\n",
    "print(env, obs_dim, act_dim, max_trajectory_len)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build a recurrent model-free RL agent: separate architecture, `lstm` encoder, `oar` policy input space, `td3` RL algorithm (context length set later)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "source": [
    "# Recurrent TD3 agent with an LSTM encoder, moved to the device chosen by ptu.\n",
    "agent = Policy_RNN(\n",
    "    obs_dim=obs_dim,\n",
    "    action_dim=act_dim,\n",
    "    encoder=\"lstm\",\n",
    "    algo=\"td3\",\n",
    "    action_embedding_size=8,\n",
    "    observ_embedding_size=32,\n",
    "    reward_embedding_size=8,\n",
    "    rnn_hidden_size=128,\n",
    "    dqn_layers=[128, 128],\n",
    "    policy_layers=[128, 128],\n",
    "    lr=0.0003,\n",
    "    gamma=0.9,\n",
    "    tau=0.005,\n",
    ").to(ptu.device)\n"
   ],
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Critic_RNN(\n",
      "  (observ_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=1, out_features=32, bias=True)\n",
      "  )\n",
      "  (action_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=1, out_features=8, bias=True)\n",
      "  )\n",
      "  (reward_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=1, out_features=8, bias=True)\n",
      "  )\n",
      "  (rnn): LSTM(48, 128)\n",
      "  (current_observ_action_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=2, out_features=48, bias=True)\n",
      "  )\n",
      "  (qf1): FlattenMlp(\n",
      "    (fc0): Linear(in_features=176, out_features=128, bias=True)\n",
      "    (fc1): Linear(in_features=128, out_features=128, bias=True)\n",
      "    (last_fc): Linear(in_features=128, out_features=1, bias=True)\n",
      "  )\n",
      "  (qf2): FlattenMlp(\n",
      "    (fc0): Linear(in_features=176, out_features=128, bias=True)\n",
      "    (fc1): Linear(in_features=128, out_features=128, bias=True)\n",
      "    (last_fc): Linear(in_features=128, out_features=1, bias=True)\n",
      "  )\n",
      ") \n",
      "Actor_RNN(\n",
      "  (observ_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=1, out_features=32, bias=True)\n",
      "  )\n",
      "  (action_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=1, out_features=8, bias=True)\n",
      "  )\n",
      "  (reward_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=1, out_features=8, bias=True)\n",
      "  )\n",
      "  (rnn): LSTM(48, 128)\n",
      "  (current_observ_embedder): FeatureExtractor(\n",
      "    (fc): Linear(in_features=1, out_features=32, bias=True)\n",
      "  )\n",
      "  (policy): DeterministicPolicy(\n",
      "    (fc0): Linear(in_features=160, out_features=128, bias=True)\n",
      "    (fc1): Linear(in_features=128, out_features=128, bias=True)\n",
      "    (last_fc): Linear(in_features=128, out_features=1, bias=True)\n",
      "  )\n",
      ") \n"
     ]
    }
   ],
   "source": [
    "# Recurrent TD3 agent with an LSTM encoder, moved to the device chosen by ptu.\n",
    "agent = Policy_RNN(\n",
    "    obs_dim=obs_dim,\n",
    "    action_dim=act_dim,\n",
    "    encoder=\"lstm\",\n",
    "    algo=\"td3\",\n",
    "    action_embedding_size=8,\n",
    "    # NOTE(review): keyword follows the `observ_embedder` naming shown in the\n",
    "    # printed model; confirm against the Policy_RNN constructor signature.\n",
    "    observ_embedding_size=32,\n",
    "    reward_embedding_size=8,\n",
    "    rnn_hidden_size=128,\n",
    "    dqn_layers=[128, 128],\n",
    "    policy_layers=[128, 128],\n",
    "    lr=0.0003,\n",
    "    gamma=0.9,\n",
    "    tau=0.005,\n",
    ").to(ptu.device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define other training parameters such as context length and training frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total env episodes 155 total env steps 31000\n"
     ]
    }
   ],
   "source": [
    "# Optimisation settings\n",
    "num_updates_per_iter = 1.0  # training frequency\n",
    "sampled_seq_len = 64  # context length\n",
    "buffer_size = 1e6\n",
    "batch_size = 32\n",
    "\n",
    "# Data-collection schedule: an initial pool of rollouts, then rollouts every iteration\n",
    "num_init_rollouts_pool = 5\n",
    "num_iters = 150\n",
    "num_rollouts_per_iter = 1\n",
    "total_rollouts = num_iters * num_rollouts_per_iter + num_init_rollouts_pool\n",
    "n_env_steps_total = total_rollouts * max_trajectory_len\n",
    "# presumably the running count of env steps taken so far -- confirm in the training loop\n",
    "_n_env_steps_total = 0\n",
    "print(\"total env episodes\", total_rollouts, \"total env steps\", n_env_steps_total)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define key functions: collect rollouts and policy update"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "@torch.no_grad()\n",
    "def collect_rollouts(\n",
    "    num_rollouts, random_actions=False, deterministic=False, train_mode=True\n",
    "):\n",
    "    \"\"\"Run `num_rollouts` episodes in the global `env` and report statistics.\n",
    "\n",
    "    In train mode every finished episode is added to the global `policy_storage`\n",
    "    buffer; in test mode nothing is stored.\n",
    "\n",
    "    :param num_rollouts: number of episodes to run\n",
    "    :param random_actions: if True, draw actions with `env.action_space.sample()`\n",
    "        instead of querying the agent\n",
    "    :param deterministic: forwarded to `agent.act` as its `deterministic` flag\n",
    "    :param train_mode: if True, store episodes in the buffer; if False, evaluate\n",
    "        only (requires random_actions=False and deterministic=True)\n",
    "    :return: total number of env steps taken (train mode), or the mean episode\n",
    "        return over the rollouts (test mode)\n",
    "    \"\"\"\n",
    "    if not train_mode:\n",
    "        assert (\n",
    "            not random_actions and deterministic\n",
    "        ), \"test mode requires random_actions=False and deterministic=True\"\n",
    "\n",
    "    total_steps = 0\n",
    "    total_rewards = 0.0\n",
    "\n",
    "    for _ in range(num_rollouts):\n",
    "        steps = 0\n",
    "        rewards = 0.0\n",
    "        obs = ptu.from_numpy(env.reset())\n",
    "        obs = obs.reshape(1, obs.shape[-1])\n",
    "        done_rollout = False\n",
    "\n",
    "        # get hidden state at timestep=0, None for mlp\n",
    "        action, reward, internal_state = agent.get_initial_info()\n",
    "\n",
    "        if train_mode:\n",
    "            # per-episode temporary storage, flushed to the buffer when the episode ends\n",
    "            obs_list, act_list, rew_list = [], [], []\n",
    "            next_obs_list, term_list = [], []\n",
    "\n",
    "        while not done_rollout:\n",
    "            if random_actions:\n",
    "                action = ptu.FloatTensor([env.action_space.sample()])  # (1, A)\n",
    "            else:\n",
    "                # policy takes hidden state as input for rnn, while takes obs for mlp\n",
    "                (action, _, _, _), internal_state = agent.act(\n",
    "                    prev_internal_state=internal_state,\n",
    "                    prev_action=action,\n",
    "                    reward=reward,\n",
    "                    obs=obs,\n",
    "                    deterministic=deterministic,\n",
    "                )\n",
    "            # observe reward and next obs (B=1, dim)\n",
    "            next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))\n",
    "            # any non-zero done flag ends the episode\n",
    "            done_rollout = bool(ptu.get_numpy(done[0][0]) != 0.0)\n",
    "\n",
    "            # update statistics\n",
    "            steps += 1\n",
    "            rewards += reward.item()\n",
    "\n",
    "            # term ignores timeouts: an episode cut off by the time limit is not\n",
    "            # stored as a terminal transition\n",
    "            timed_out = \"TimeLimit.truncated\" in info or steps >= max_trajectory_len\n",
    "            term = False if timed_out else done_rollout\n",
    "\n",
    "            if train_mode:\n",
    "                # append tensors to temporary storage\n",
    "                obs_list.append(obs)  # (1, dim)\n",
    "                act_list.append(action)  # (1, dim)\n",
    "                rew_list.append(reward)  # (1, dim)\n",
    "                term_list.append(term)  # bool\n",
    "                next_obs_list.append(next_obs)  # (1, dim)\n",
    "\n",
    "            # set: obs <- next_obs\n",
    "            obs = next_obs.clone()\n",
    "\n",
    "        if train_mode:\n",
    "            # add collected sequence to buffer\n",
    "            policy_storage.add_episode(\n",
    "                observations=ptu.get_numpy(torch.cat(obs_list, dim=0)),  # (L, dim)\n",
    "                actions=ptu.get_numpy(torch.cat(act_list, dim=0)),  # (L, dim)\n",
    "                rewards=ptu.get_numpy(torch.cat(rew_list, dim=0)),  # (L, dim)\n",
    "                terminals=np.array(term_list).reshape(-1, 1),  # (L, 1)\n",
    "                next_observations=ptu.get_numpy(\n",
    "                    torch.cat(next_obs_list, dim=0)\n",
    "                ),  # (L, dim)\n",
    "            )\n",
    "        print(\n",
    "            \"Mode:\",\n",
    "            \"Train\" if train_mode else \"Test\",\n",
    "            \"env_steps\",\n",
    "            steps,\n",
    "            \"total rewards\",\n",
    "            rewards,\n",
    "        )\n",
    "        total_steps += steps\n",
    "        total_rewards += rewards\n",
    "\n",
    "    if train_mode:\n",
    "        return total_steps\n",
    "    return total_rewards / num_rollouts\n",
    "\n",
    "\n",
    "def update(num_updates):\n",
    "    \"\"\"Run `num_updates` agent updates and return the mean of every logged value.\n",
    "\n",
    "    Each step calls `policy_storage.random_episodes(batch_size)`, converts the\n",
    "    result with `ptu.np_to_pytorch_batch` and passes it to `agent.update`.\n",
    "\n",
    "    :param num_updates: number of update steps to run\n",
    "    :return: dict mapping each key returned by `agent.update` to its mean over\n",
    "        the steps in which it was reported (empty dict if num_updates is 0)\n",
    "    \"\"\"\n",
    "    rl_losses_agg = {}\n",
    "    # the loop variable is intentionally not called `update`, which would shadow\n",
    "    # this function's own name inside its body\n",
    "    for _ in range(num_updates):\n",
    "        # sample a random batch of episodes from the replay buffer\n",
    "        batch = ptu.np_to_pytorch_batch(policy_storage.random_episodes(batch_size))\n",
    "        # RL update\n",
    "        rl_losses = agent.update(batch)\n",
    "\n",
    "        # setdefault also covers keys that first show up after the first step\n",
    "        for k, v in rl_losses.items():\n",
    "            rl_losses_agg.setdefault(k, []).append(v)\n",
    "    # statistics: average each logged value over the update steps\n",
    "    return {k: np.mean(vals) for k, vals in rl_losses_agg.items()}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train and Evaluate the agent: takes < 20 min"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mode: Train env_steps 200 total rewards -1215.5405168533325\n",
      "Mode: Train env_steps 200 total rewards -1309.3240714073181\n",
      "Mode: Train env_steps 200 total rewards -1070.255422860384\n",
      "Mode: Train env_steps 200 total rewards -1716.9817371368408\n",
      "Mode: Train env_steps 200 total rewards -1348.119238615036\n",
      "Mode: Train env_steps 200 total rewards -1794.5983276367188\n",
      "Mode: Train env_steps 200 total rewards -1641.6694905161858\n",
      "Mode: Train env_steps 200 total rewards -1590.8518767878413\n",
      "Mode: Train env_steps 200 total rewards -1717.778513431549\n",
      "Mode: Train env_steps 200 total rewards -1716.919951915741\n",
      "Mode: Test env_steps 200 total rewards -1690.6299517154694\n",
      "Mode: Test env_steps 200 total rewards -1667.401160120964\n",
      "Mode: Test env_steps 200 total rewards -1683.2179251909256\n",
      "Mode: Test env_steps 200 total rewards -1629.752505838871\n",
      "Mode: Test env_steps 200 total rewards -1730.7712788581848\n",
      "Mode: Test env_steps 200 total rewards -1709.7121629714966\n",
      "Mode: Test env_steps 200 total rewards -1737.636411190033\n",
      "Mode: Test env_steps 200 total rewards -1724.8275074958801\n",
      "Mode: Test env_steps 200 total rewards -1644.5090357661247\n",
      "Mode: Test env_steps 200 total rewards -1670.3785852193832\n",
      "2000 -1688.8836524367332\n",
      "Mode: Train env_steps 200 total rewards -1675.8528361320496\n",
      "Mode: Train env_steps 200 total rewards -1658.8392679691315\n",
      "Mode: Train env_steps 200 total rewards -1519.6182126998901\n",
      "Mode: Train env_steps 200 total rewards -1543.8249187469482\n",
      "Mode: Train env_steps 200 total rewards -1378.7394891306758\n",
      "Mode: Test env_steps 200 total rewards -1243.581422328949\n",
      "Mode: Test env_steps 200 total rewards -1279.0839395523071\n",
      "Mode: Test env_steps 200 total rewards -1115.5180749297142\n",
      "Mode: Test env_steps 200 total rewards -1240.0015530586243\n",
      "Mode: Test env_steps 200 total rewards -1131.4246773123741\n",
      "Mode: Test env_steps 200 total rewards -1271.0484585762024\n",
      "Mode: Test env_steps 200 total rewards -1296.8658256530762\n",
      "Mode: Test env_steps 200 total rewards -1268.0181958675385\n",
      "Mode: Test env_steps 200 total rewards -1105.4287464022636\n",
      "Mode: Test env_steps 200 total rewards -1221.9913232326508\n",
      "3000 -1217.29622169137\n",
      "Mode: Train env_steps 200 total rewards -1086.907365836203\n",
      "Mode: Train env_steps 200 total rewards -809.5890567302704\n",
      "Mode: Train env_steps 200 total rewards -1509.1656613349915\n",
      "Mode: Train env_steps 200 total rewards -875.1950886547565\n",
      "Mode: Train env_steps 200 total rewards -883.6977178305387\n",
      "Mode: Test env_steps 200 total rewards -932.8838503956795\n",
      "Mode: Test env_steps 200 total rewards -916.5262511968613\n",
      "Mode: Test env_steps 200 total rewards -853.4724770113826\n",
      "Mode: Test env_steps 200 total rewards -972.6363238096237\n",
      "Mode: Test env_steps 200 total rewards -916.7851620316505\n",
      "Mode: Test env_steps 200 total rewards -892.7446937561035\n",
      "Mode: Test env_steps 200 total rewards -911.9960522651672\n",
      "Mode: Test env_steps 200 total rewards -862.5102658420801\n",
      "Mode: Test env_steps 200 total rewards -909.3836004137993\n",
      "Mode: Test env_steps 200 total rewards -902.3712181299925\n",
      "4000 -907.1309894852341\n",
      "Mode: Train env_steps 200 total rewards -896.5191862247884\n",
      "Mode: Train env_steps 200 total rewards -1148.8554611206055\n",
      "Mode: Train env_steps 200 total rewards -919.8976370096207\n",
      "Mode: Train env_steps 200 total rewards -894.6185926496983\n",
      "Mode: Train env_steps 200 total rewards -777.0896812826395\n",
      "Mode: Test env_steps 200 total rewards -800.0095049291849\n",
      "Mode: Test env_steps 200 total rewards -729.1357635855675\n",
      "Mode: Test env_steps 200 total rewards -790.4656649529934\n",
      "Mode: Test env_steps 200 total rewards -658.2100356258452\n",
      "Mode: Test env_steps 200 total rewards -678.3389454782009\n",
      "Mode: Test env_steps 200 total rewards -764.867270976305\n",
      "Mode: Test env_steps 200 total rewards -711.1784103494138\n",
      "Mode: Test env_steps 200 total rewards -704.299937158823\n",
      "Mode: Test env_steps 200 total rewards -703.3847205489874\n",
      "Mode: Test env_steps 200 total rewards -769.4560797959566\n",
      "5000 -730.9346333401278\n",
      "Mode: Train env_steps 200 total rewards -774.3973034918308\n",
      "Mode: Train env_steps 200 total rewards -863.303290605545\n",
      "Mode: Train env_steps 200 total rewards -754.3786760801449\n",
      "Mode: Train env_steps 200 total rewards -787.7701032310724\n",
      "Mode: Train env_steps 200 total rewards -814.8449696339667\n",
      "Mode: Test env_steps 200 total rewards -641.1826608031988\n",
      "Mode: Test env_steps 200 total rewards -673.1848703697324\n",
      "Mode: Test env_steps 200 total rewards -636.2317231073976\n",
      "Mode: Test env_steps 200 total rewards -636.3841380421072\n",
      "Mode: Test env_steps 200 total rewards -634.7440396994352\n",
      "Mode: Test env_steps 200 total rewards -1434.365993976593\n",
      "Mode: Test env_steps 200 total rewards -639.5609966111369\n",
      "Mode: Test env_steps 200 total rewards -638.4026339892298\n",
      "Mode: Test env_steps 200 total rewards -629.0861927568913\n",
      "Mode: Test env_steps 200 total rewards -635.3440890386701\n",
      "6000 -719.8487338394392\n",
      "Mode: Train env_steps 200 total rewards -624.8576611503959\n",
      "Mode: Train env_steps 200 total rewards -731.2055732905865\n",
      "Mode: Train env_steps 200 total rewards -643.7517330273986\n",
      "Mode: Train env_steps 200 total rewards -512.888639099896\n",
      "Mode: Train env_steps 200 total rewards -678.9873680695891\n",
      "Mode: Test env_steps 200 total rewards -649.3965282291174\n",
      "Mode: Test env_steps 200 total rewards -541.0664244294167\n",
      "Mode: Test env_steps 200 total rewards -656.5433887466788\n",
      "Mode: Test env_steps 200 total rewards -701.5938144102693\n",
      "Mode: Test env_steps 200 total rewards -570.9794048666954\n",
      "Mode: Test env_steps 200 total rewards -526.0970221487805\n",
      "Mode: Test env_steps 200 total rewards -528.7169065512717\n",
      "Mode: Test env_steps 200 total rewards -791.1858232319355\n",
      "Mode: Test env_steps 200 total rewards -760.1559834107757\n",
      "Mode: Test env_steps 200 total rewards -796.3674455285072\n",
      "7000 -652.2102741553448\n",
      "Mode: Train env_steps 200 total rewards -575.0728849545121\n",
      "Mode: Train env_steps 200 total rewards -538.9270869866014\n",
      "Mode: Train env_steps 200 total rewards -703.1943583320826\n",
      "Mode: Train env_steps 200 total rewards -522.5574248465709\n",
      "Mode: Train env_steps 200 total rewards -526.6231522634625\n",
      "Mode: Test env_steps 200 total rewards -471.21681063994765\n",
      "Mode: Test env_steps 200 total rewards -407.10355828516185\n",
      "Mode: Test env_steps 200 total rewards -429.82667701132596\n",
      "Mode: Test env_steps 200 total rewards -396.4019733443856\n",
      "Mode: Test env_steps 200 total rewards -1491.0763459205627\n",
      "Mode: Test env_steps 200 total rewards -326.2651424361393\n",
      "Mode: Test env_steps 200 total rewards -464.98171285912395\n",
      "Mode: Test env_steps 200 total rewards -392.0769012141973\n",
      "Mode: Test env_steps 200 total rewards -269.7005622461438\n",
      "Mode: Test env_steps 200 total rewards -509.407666021958\n",
      "8000 -515.8057349978947\n",
      "Mode: Train env_steps 200 total rewards -639.5204429877922\n",
      "Mode: Train env_steps 200 total rewards -396.447283314541\n",
      "Mode: Train env_steps 200 total rewards -519.2145761235151\n",
      "Mode: Train env_steps 200 total rewards -386.9386151973158\n",
      "Mode: Train env_steps 200 total rewards -393.6131444051862\n",
      "Mode: Test env_steps 200 total rewards -136.34055368886766\n",
      "Mode: Test env_steps 200 total rewards -130.04246410355336\n",
      "Mode: Test env_steps 200 total rewards -137.05444939476\n",
      "Mode: Test env_steps 200 total rewards -134.1194399067317\n",
      "Mode: Test env_steps 200 total rewards -131.07375583963585\n",
      "Mode: Test env_steps 200 total rewards -130.39294535505906\n",
      "Mode: Test env_steps 200 total rewards -256.4807607967232\n",
      "Mode: Test env_steps 200 total rewards -133.45546923366783\n",
      "Mode: Test env_steps 200 total rewards -137.30824294477497\n",
      "Mode: Test env_steps 200 total rewards -397.2588393399783\n",
      "9000 -172.3526920603752\n",
      "Mode: Train env_steps 200 total rewards -260.3047589848429\n",
      "Mode: Train env_steps 200 total rewards -260.44967386405915\n",
      "Mode: Train env_steps 200 total rewards -9.588460055063479\n",
      "Mode: Train env_steps 200 total rewards -503.4001742233813\n",
      "Mode: Train env_steps 200 total rewards -132.90466969866975\n",
      "Mode: Test env_steps 200 total rewards -245.46063787024468\n",
      "Mode: Test env_steps 200 total rewards -258.87249805172905\n",
      "Mode: Test env_steps 200 total rewards -253.1965181294363\n",
      "Mode: Test env_steps 200 total rewards -256.33532144408673\n",
      "Mode: Test env_steps 200 total rewards -122.02367229596712\n",
      "Mode: Test env_steps 200 total rewards -378.40153571846895\n",
      "Mode: Test env_steps 200 total rewards -129.97556851245463\n",
      "Mode: Test env_steps 200 total rewards -256.6560115632601\n",
      "Mode: Test env_steps 200 total rewards -128.58447807095945\n",
      "Mode: Test env_steps 200 total rewards -468.4694554193411\n",
      "10000 -249.79756970759482\n",
      "Mode: Train env_steps 200 total rewards -253.84205745416693\n",
      "Mode: Train env_steps 200 total rewards -258.597339340964\n",
      "Mode: Train env_steps 200 total rewards -249.67442950383338\n",
      "Mode: Train env_steps 200 total rewards -264.99233946722234\n",
      "Mode: Train env_steps 200 total rewards -123.49480776841665\n",
      "Mode: Test env_steps 200 total rewards -386.33284205210657\n",
      "Mode: Test env_steps 200 total rewards -374.89824844955365\n",
      "Mode: Test env_steps 200 total rewards -127.82263034246353\n",
      "Mode: Test env_steps 200 total rewards -3.396543635226408\n",
      "Mode: Test env_steps 200 total rewards -0.3892205822030519\n",
      "Mode: Test env_steps 200 total rewards -127.58443048472691\n",
      "Mode: Test env_steps 200 total rewards -123.29965032166001\n",
      "Mode: Test env_steps 200 total rewards -405.617472100781\n",
      "Mode: Test env_steps 200 total rewards -131.20015325089298\n",
      "Mode: Test env_steps 200 total rewards -270.9554879873649\n",
      "11000 -195.1496679206979\n",
      "Mode: Train env_steps 200 total rewards -128.46735045554306\n",
      "Mode: Train env_steps 200 total rewards -385.3559364905559\n",
      "Mode: Train env_steps 200 total rewards -133.3203926575943\n",
      "Mode: Train env_steps 200 total rewards -130.180486971527\n",
      "Mode: Train env_steps 200 total rewards -129.11331324546154\n",
      "Mode: Test env_steps 200 total rewards -259.27573602375924\n",
      "Mode: Test env_steps 200 total rewards -127.15911891811993\n",
      "Mode: Test env_steps 200 total rewards -131.78587026067544\n",
      "Mode: Test env_steps 200 total rewards -124.41451870201854\n",
      "Mode: Test env_steps 200 total rewards -120.47274359833682\n",
      "Mode: Test env_steps 200 total rewards -124.89280595941818\n",
      "Mode: Test env_steps 200 total rewards -121.65913894737605\n",
      "Mode: Test env_steps 200 total rewards -249.62018572923262\n",
      "Mode: Test env_steps 200 total rewards -1.0191547659342177\n",
      "Mode: Test env_steps 200 total rewards -130.19940298219444\n",
      "12000 -139.04986758870655\n",
      "Mode: Train env_steps 200 total rewards -130.7861404924015\n",
      "Mode: Train env_steps 200 total rewards -128.20895186233065\n",
      "Mode: Train env_steps 200 total rewards -240.80124919944137\n",
      "Mode: Train env_steps 200 total rewards -127.05305419189972\n",
      "Mode: Train env_steps 200 total rewards -389.74735507116566\n",
      "Mode: Test env_steps 200 total rewards -125.799274083809\n",
      "Mode: Test env_steps 200 total rewards -126.80654663550376\n",
      "Mode: Test env_steps 200 total rewards -128.47082148335176\n",
      "Mode: Test env_steps 200 total rewards -125.38395279903489\n",
      "Mode: Test env_steps 200 total rewards -265.4943495452462\n",
      "Mode: Test env_steps 200 total rewards -391.3820340028615\n",
      "Mode: Test env_steps 200 total rewards -124.5938728672918\n",
      "Mode: Test env_steps 200 total rewards -115.8693172446583\n",
      "Mode: Test env_steps 200 total rewards -121.6324416497664\n",
      "Mode: Test env_steps 200 total rewards -403.91459427748487\n",
      "13000 -192.93472045890084\n",
      "Mode: Train env_steps 200 total rewards -120.75656462824372\n",
      "Mode: Train env_steps 200 total rewards -244.2110134603572\n",
      "Mode: Train env_steps 200 total rewards -271.4861283576247\n",
      "Mode: Train env_steps 200 total rewards -299.46712611912517\n",
      "Mode: Train env_steps 200 total rewards -276.9068454174121\n",
      "Mode: Test env_steps 200 total rewards -130.26577123824973\n",
      "Mode: Test env_steps 200 total rewards -122.85300587835081\n",
      "Mode: Test env_steps 200 total rewards -125.84164321703429\n",
      "Mode: Test env_steps 200 total rewards -127.25999846162449\n",
      "Mode: Test env_steps 200 total rewards -245.0846909333195\n",
      "Mode: Test env_steps 200 total rewards -251.7522211139776\n",
      "Mode: Test env_steps 200 total rewards -117.7094244834152\n",
      "Mode: Test env_steps 200 total rewards -249.07677362083632\n",
      "Mode: Test env_steps 200 total rewards -259.21219713821483\n",
      "Mode: Test env_steps 200 total rewards -118.03599187266809\n",
      "14000 -174.7091717957691\n",
      "Mode: Train env_steps 200 total rewards -242.31402633567632\n",
      "Mode: Train env_steps 200 total rewards -127.27280326851178\n",
      "Mode: Train env_steps 200 total rewards -243.62500214390457\n",
      "Mode: Train env_steps 200 total rewards -126.50611761247274\n",
      "Mode: Train env_steps 200 total rewards -123.3945286332164\n",
      "Mode: Test env_steps 200 total rewards -257.4191315458156\n",
      "Mode: Test env_steps 200 total rewards -119.91926783090457\n",
      "Mode: Test env_steps 200 total rewards -4.727449198719114\n",
      "Mode: Test env_steps 200 total rewards -378.35922101838514\n",
      "Mode: Test env_steps 200 total rewards -123.7072509995196\n",
      "Mode: Test env_steps 200 total rewards -280.62047006061766\n",
      "Mode: Test env_steps 200 total rewards -248.55686107743531\n",
      "Mode: Test env_steps 200 total rewards -125.25552876619622\n",
      "Mode: Test env_steps 200 total rewards -245.17300941608846\n",
      "Mode: Test env_steps 200 total rewards -263.7774709605146\n",
      "15000 -204.75156608741963\n",
      "Mode: Train env_steps 200 total rewards -369.5970004310366\n",
      "Mode: Train env_steps 200 total rewards -117.8776598579716\n",
      "Mode: Train env_steps 200 total rewards -266.6137974287849\n",
      "Mode: Train env_steps 200 total rewards -247.84643931523897\n",
      "Mode: Train env_steps 200 total rewards -133.65093973837793\n",
      "Mode: Test env_steps 200 total rewards -132.58213516324759\n",
      "Mode: Test env_steps 200 total rewards -317.6314685828984\n",
      "Mode: Test env_steps 200 total rewards -120.63207617402077\n",
      "Mode: Test env_steps 200 total rewards -134.50522946193814\n",
      "Mode: Test env_steps 200 total rewards -249.93733799178153\n",
      "Mode: Test env_steps 200 total rewards -126.03254494443536\n",
      "Mode: Test env_steps 200 total rewards -127.51484705973417\n",
      "Mode: Test env_steps 200 total rewards -133.02907354477793\n",
      "Mode: Test env_steps 200 total rewards -131.04472528398037\n",
      "Mode: Test env_steps 200 total rewards -133.04624734260142\n",
      "16000 -160.59556855494156\n",
      "Mode: Train env_steps 200 total rewards -131.4692294076085\n",
      "Mode: Train env_steps 200 total rewards -257.0220946841873\n",
      "Mode: Train env_steps 200 total rewards -132.60133136808872\n",
      "Mode: Train env_steps 200 total rewards -252.69747569982428\n",
      "Mode: Train env_steps 200 total rewards -122.5156181063503\n",
      "Mode: Test env_steps 200 total rewards -120.0488967075944\n",
      "Mode: Test env_steps 200 total rewards -125.59240189334378\n",
      "Mode: Test env_steps 200 total rewards -122.92463257256895\n",
      "Mode: Test env_steps 200 total rewards -266.6653274325654\n",
      "Mode: Test env_steps 200 total rewards -129.52725801430643\n",
      "Mode: Test env_steps 200 total rewards -386.4986750278622\n",
      "Mode: Test env_steps 200 total rewards -127.47746223770082\n",
      "Mode: Test env_steps 200 total rewards -131.84532477753237\n",
      "Mode: Test env_steps 200 total rewards -123.68566208239645\n",
      "Mode: Test env_steps 200 total rewards -133.80112480558455\n",
      "17000 -166.80667655514554\n",
      "Mode: Train env_steps 200 total rewards -130.1032104054466\n",
      "Mode: Train env_steps 200 total rewards -5.792526931327302\n",
      "Mode: Train env_steps 200 total rewards -129.94445695829927\n",
      "Mode: Train env_steps 200 total rewards -1.8074299860745668\n",
      "Mode: Train env_steps 200 total rewards -371.67741363390815\n",
      "Mode: Test env_steps 200 total rewards -129.01796465553343\n",
      "Mode: Test env_steps 200 total rewards -255.2657772154198\n",
      "Mode: Test env_steps 200 total rewards -124.8317355401814\n",
      "Mode: Test env_steps 200 total rewards -127.61366206099046\n",
      "Mode: Test env_steps 200 total rewards -130.1721339863725\n",
      "Mode: Test env_steps 200 total rewards -128.43343426752836\n",
      "Mode: Test env_steps 200 total rewards -264.26960422779666\n",
      "Mode: Test env_steps 200 total rewards -3.667812744155526\n",
      "Mode: Test env_steps 200 total rewards -251.8668613290938\n",
      "Mode: Test env_steps 200 total rewards -251.72904552519321\n",
      "18000 -166.68680315522653\n",
      "Mode: Train env_steps 200 total rewards -129.41188386362046\n",
      "Mode: Train env_steps 200 total rewards -122.25436197966337\n",
      "Mode: Train env_steps 200 total rewards -132.0075741810724\n",
      "Mode: Train env_steps 200 total rewards -125.08316496918269\n",
      "Mode: Train env_steps 200 total rewards -120.87805001712695\n",
      "Mode: Test env_steps 200 total rewards -130.77035507211986\n",
      "Mode: Test env_steps 200 total rewards -130.97795120121737\n",
      "Mode: Test env_steps 200 total rewards -285.9067427550326\n",
      "Mode: Test env_steps 200 total rewards -130.19821366295218\n",
      "Mode: Test env_steps 200 total rewards -248.72471698420122\n",
      "Mode: Test env_steps 200 total rewards -131.5111675742737\n",
      "Mode: Test env_steps 200 total rewards -252.134106502519\n",
      "Mode: Test env_steps 200 total rewards -249.68509305920452\n",
      "Mode: Test env_steps 200 total rewards -259.2564549049275\n",
      "Mode: Test env_steps 200 total rewards -131.86590750053256\n",
      "19000 -195.10307092169805\n",
      "Mode: Train env_steps 200 total rewards -336.72006702711224\n",
      "Mode: Train env_steps 200 total rewards -3.6598976548411883\n",
      "Mode: Train env_steps 200 total rewards -128.5459162555635\n",
      "Mode: Train env_steps 200 total rewards -389.0736679392867\n",
      "Mode: Train env_steps 200 total rewards -132.46394797693938\n",
      "Mode: Test env_steps 200 total rewards -127.63480124925263\n",
      "Mode: Test env_steps 200 total rewards -132.9844055683352\n",
      "Mode: Test env_steps 200 total rewards -350.4678683485836\n",
      "Mode: Test env_steps 200 total rewards -1491.0205211639404\n",
      "Mode: Test env_steps 200 total rewards -123.56267284578644\n",
      "Mode: Test env_steps 200 total rewards -253.39906679093838\n",
      "Mode: Test env_steps 200 total rewards -131.26202398515306\n",
      "Mode: Test env_steps 200 total rewards -375.1163965202868\n",
      "Mode: Test env_steps 200 total rewards -132.37188876396976\n",
      "Mode: Test env_steps 200 total rewards -254.79661067272536\n",
      "20000 -337.26162559089715\n",
      "Mode: Train env_steps 200 total rewards -127.21233860775828\n",
      "Mode: Train env_steps 200 total rewards -397.9239173475653\n",
      "Mode: Train env_steps 200 total rewards -261.70106873475015\n",
      "Mode: Train env_steps 200 total rewards -136.95836029946804\n",
      "Mode: Train env_steps 200 total rewards -130.52756336517632\n",
      "Mode: Test env_steps 200 total rewards -127.33369559422135\n",
      "Mode: Test env_steps 200 total rewards -283.45684512890875\n",
      "Mode: Test env_steps 200 total rewards -136.14634452015162\n",
      "Mode: Test env_steps 200 total rewards -137.2795043103397\n",
      "Mode: Test env_steps 200 total rewards -248.97463169554248\n",
      "Mode: Test env_steps 200 total rewards -8.958229891955853\n",
      "Mode: Test env_steps 200 total rewards -10.105981927365065\n",
      "Mode: Test env_steps 200 total rewards -132.38649014476687\n",
      "Mode: Test env_steps 200 total rewards -133.52735120104626\n",
      "Mode: Test env_steps 200 total rewards -132.87370552495122\n",
      "21000 -135.1042779939249\n",
      "Mode: Train env_steps 200 total rewards -135.44952426105738\n",
      "Mode: Train env_steps 200 total rewards -136.6360167451203\n",
      "Mode: Train env_steps 200 total rewards -126.07958034798503\n",
      "Mode: Train env_steps 200 total rewards -129.10063152387738\n",
      "Mode: Train env_steps 200 total rewards -254.23420189972967\n",
      "Mode: Test env_steps 200 total rewards -9.132988084107637\n",
      "Mode: Test env_steps 200 total rewards -122.19331623334438\n",
      "Mode: Test env_steps 200 total rewards -253.2292528897524\n",
      "Mode: Test env_steps 200 total rewards -291.03938596788794\n",
      "Mode: Test env_steps 200 total rewards -127.90111041348428\n",
      "Mode: Test env_steps 200 total rewards -7.189530588919297\n",
      "Mode: Test env_steps 200 total rewards -122.86703424248844\n",
      "Mode: Test env_steps 200 total rewards -252.5274507328868\n",
      "Mode: Test env_steps 200 total rewards -126.35793518205173\n",
      "Mode: Test env_steps 200 total rewards -252.72059313277714\n",
      "22000 -156.51585974677\n",
      "Mode: Train env_steps 200 total rewards -132.3777971100062\n",
      "Mode: Train env_steps 200 total rewards -263.93837735801935\n",
      "Mode: Train env_steps 200 total rewards -380.18561655655503\n",
      "Mode: Train env_steps 200 total rewards -408.3316973443143\n",
      "Mode: Train env_steps 200 total rewards -134.41268048726488\n",
      "Mode: Test env_steps 200 total rewards -252.1836907789111\n",
      "Mode: Test env_steps 200 total rewards -136.87916581658646\n",
      "Mode: Test env_steps 200 total rewards -130.30568698607385\n",
      "Mode: Test env_steps 200 total rewards -295.1264161616564\n",
      "Mode: Test env_steps 200 total rewards -285.27469485998154\n",
      "Mode: Test env_steps 200 total rewards -257.36417460720986\n",
      "Mode: Test env_steps 200 total rewards -122.39938643248752\n",
      "Mode: Test env_steps 200 total rewards -136.13417248800397\n",
      "Mode: Test env_steps 200 total rewards -251.1970808338374\n",
      "Mode: Test env_steps 200 total rewards -135.31905758287758\n",
      "23000 -200.21835265476255\n",
      "Mode: Train env_steps 200 total rewards -265.19849015702493\n",
      "Mode: Train env_steps 200 total rewards -268.84571858868003\n",
      "Mode: Train env_steps 200 total rewards -137.15437516197562\n",
      "Mode: Train env_steps 200 total rewards -131.01147694559768\n",
      "Mode: Train env_steps 200 total rewards -389.00455401837826\n",
      "Mode: Test env_steps 200 total rewards -123.15574537939392\n",
      "Mode: Test env_steps 200 total rewards -264.8135799880838\n",
      "Mode: Test env_steps 200 total rewards -359.71586162620224\n",
      "Mode: Test env_steps 200 total rewards -121.86481238342822\n",
      "Mode: Test env_steps 200 total rewards -134.40076231583953\n",
      "Mode: Test env_steps 200 total rewards -127.8359218480764\n",
      "Mode: Test env_steps 200 total rewards -252.95195665210485\n",
      "Mode: Test env_steps 200 total rewards -133.68351730890572\n",
      "Mode: Test env_steps 200 total rewards -249.9511700947769\n",
      "Mode: Test env_steps 200 total rewards -416.6168870218098\n",
      "24000 -218.49902146186213\n",
      "Mode: Train env_steps 200 total rewards -133.75552151724696\n",
      "Mode: Train env_steps 200 total rewards -249.84270376106724\n",
      "Mode: Train env_steps 200 total rewards -119.0928434144007\n",
      "Mode: Train env_steps 200 total rewards -252.1334647499025\n",
      "Mode: Train env_steps 200 total rewards -4.308382875751704\n",
      "Mode: Test env_steps 200 total rewards -250.32012339681387\n",
      "Mode: Test env_steps 200 total rewards -130.86303978820797\n",
      "Mode: Test env_steps 200 total rewards -268.61977915861644\n",
      "Mode: Test env_steps 200 total rewards -256.51407427561935\n",
      "Mode: Test env_steps 200 total rewards -268.53248357982375\n",
      "Mode: Test env_steps 200 total rewards -131.89295327838045\n",
      "Mode: Test env_steps 200 total rewards -247.8418615491828\n",
      "Mode: Test env_steps 200 total rewards -132.06573122669943\n",
      "Mode: Test env_steps 200 total rewards -246.07906676083803\n",
      "Mode: Test env_steps 200 total rewards -128.755500536412\n",
      "25000 -206.1484613550594\n",
      "Mode: Train env_steps 200 total rewards -268.73735208273865\n",
      "Mode: Train env_steps 200 total rewards -249.699738193769\n",
      "Mode: Train env_steps 200 total rewards -257.7146478953655\n",
      "Mode: Train env_steps 200 total rewards -132.48573947069235\n",
      "Mode: Train env_steps 200 total rewards -117.73745695047546\n",
      "Mode: Test env_steps 200 total rewards -117.13273281010333\n",
      "Mode: Test env_steps 200 total rewards -125.37805172341177\n",
      "Mode: Test env_steps 200 total rewards -246.70760537590832\n",
      "Mode: Test env_steps 200 total rewards -126.25057095201919\n",
      "Mode: Test env_steps 200 total rewards -356.92420602519996\n",
      "Mode: Test env_steps 200 total rewards -247.3438758761622\n",
      "Mode: Test env_steps 200 total rewards -123.14953158609569\n",
      "Mode: Test env_steps 200 total rewards -127.49349682836328\n",
      "Mode: Test env_steps 200 total rewards -130.86493495781906\n",
      "Mode: Test env_steps 200 total rewards -131.28574351139832\n",
      "26000 -173.2530749646481\n",
      "Mode: Train env_steps 200 total rewards -129.1364300606656\n",
      "Mode: Train env_steps 200 total rewards -131.16975290200207\n",
      "Mode: Train env_steps 200 total rewards -121.95525176647061\n",
      "Mode: Train env_steps 200 total rewards -347.63898885797244\n",
      "Mode: Train env_steps 200 total rewards -1516.262550830841\n",
      "Mode: Test env_steps 200 total rewards -125.29759021170321\n",
      "Mode: Test env_steps 200 total rewards -116.29971585396561\n",
      "Mode: Test env_steps 200 total rewards -132.65588944178307\n",
      "Mode: Test env_steps 200 total rewards -242.80255369469523\n",
      "Mode: Test env_steps 200 total rewards -120.76851275190711\n",
      "Mode: Test env_steps 200 total rewards -129.98449951899238\n",
      "Mode: Test env_steps 200 total rewards -263.6801114343107\n",
      "Mode: Test env_steps 200 total rewards -133.65415045432746\n",
      "Mode: Test env_steps 200 total rewards -247.21006692014635\n",
      "Mode: Test env_steps 200 total rewards -117.64420653533307\n",
      "27000 -162.99972968171642\n",
      "Mode: Train env_steps 200 total rewards -130.20218588324497\n",
      "Mode: Train env_steps 200 total rewards -118.29003828013083\n",
      "Mode: Train env_steps 200 total rewards -247.1906664679991\n",
      "Mode: Train env_steps 200 total rewards -251.76994302743697\n",
      "Mode: Train env_steps 200 total rewards -380.8231740617193\n",
      "Mode: Test env_steps 200 total rewards -128.14449329604395\n",
      "Mode: Test env_steps 200 total rewards -133.00257929693907\n",
      "Mode: Test env_steps 200 total rewards -121.33280960656703\n",
      "Mode: Test env_steps 200 total rewards -117.21745651622768\n",
      "Mode: Test env_steps 200 total rewards -260.304541438818\n",
      "Mode: Test env_steps 200 total rewards -129.4903052574955\n",
      "Mode: Test env_steps 200 total rewards -123.66184103582054\n",
      "Mode: Test env_steps 200 total rewards -4.47467941895593\n",
      "Mode: Test env_steps 200 total rewards -136.82730377465487\n",
      "Mode: Test env_steps 200 total rewards -128.40459588193335\n",
      "28000 -128.2860605523456\n",
      "Mode: Train env_steps 200 total rewards -359.02930258901324\n",
      "Mode: Train env_steps 200 total rewards -126.99004180729389\n",
      "Mode: Train env_steps 200 total rewards -130.01239318959415\n",
      "Mode: Train env_steps 200 total rewards -132.86401597573422\n",
      "Mode: Train env_steps 200 total rewards -131.5378251487855\n",
      "Mode: Test env_steps 200 total rewards -377.7228271923959\n",
      "Mode: Test env_steps 200 total rewards -388.79292901046574\n",
      "Mode: Test env_steps 200 total rewards -134.40097275190055\n",
      "Mode: Test env_steps 200 total rewards -121.09551488608122\n",
      "Mode: Test env_steps 200 total rewards -238.15228960616514\n",
      "Mode: Test env_steps 200 total rewards -131.88327238895\n",
      "Mode: Test env_steps 200 total rewards -246.09436088893563\n",
      "Mode: Test env_steps 200 total rewards -5.141647985205054\n",
      "Mode: Test env_steps 200 total rewards -130.17426304146647\n",
      "Mode: Test env_steps 200 total rewards -125.60784388473257\n",
      "29000 -189.90659216362982\n",
      "Mode: Train env_steps 200 total rewards -376.1674876296893\n",
      "Mode: Train env_steps 200 total rewards -375.34097828599624\n",
      "Mode: Train env_steps 200 total rewards -127.59093644656241\n",
      "Mode: Train env_steps 200 total rewards -136.18268738826737\n",
      "Mode: Train env_steps 200 total rewards -129.42341559915803\n",
      "Mode: Test env_steps 200 total rewards -391.77343064476736\n",
      "Mode: Test env_steps 200 total rewards -254.3057643007487\n",
      "Mode: Test env_steps 200 total rewards -134.01842796755955\n",
      "Mode: Test env_steps 200 total rewards -391.50856303423643\n",
      "Mode: Test env_steps 200 total rewards -265.35276218969375\n",
      "Mode: Test env_steps 200 total rewards -136.64729456044734\n",
      "Mode: Test env_steps 200 total rewards -133.1267894115299\n",
      "Mode: Test env_steps 200 total rewards -5.491715028416365\n",
      "Mode: Test env_steps 200 total rewards -133.11291719414294\n",
      "Mode: Test env_steps 200 total rewards -127.73738552071154\n",
      "30000 -197.3075049852254\n",
      "Mode: Train env_steps 200 total rewards -121.78846242744476\n",
      "Mode: Train env_steps 200 total rewards -131.7180840705987\n",
      "Mode: Train env_steps 200 total rewards -3.245894107458298\n",
      "Mode: Train env_steps 200 total rewards -129.29797964007594\n",
      "Mode: Train env_steps 200 total rewards -379.41606050374685\n",
      "Mode: Test env_steps 200 total rewards -121.7213050108403\n",
      "Mode: Test env_steps 200 total rewards -131.86788710579276\n",
      "Mode: Test env_steps 200 total rewards -264.3296286612749\n",
      "Mode: Test env_steps 200 total rewards -126.13307171873748\n",
      "Mode: Test env_steps 200 total rewards -269.3273641727865\n",
      "Mode: Test env_steps 200 total rewards -126.06584425829351\n",
      "Mode: Test env_steps 200 total rewards -138.2838618159294\n",
      "Mode: Test env_steps 200 total rewards -128.50390940532088\n",
      "Mode: Test env_steps 200 total rewards -255.43328048475087\n",
      "Mode: Test env_steps 200 total rewards -273.4956193007529\n",
      "31000 -183.51617719344796\n"
     ]
    }
   ],
   "source": [
    "policy_storage = SeqReplayBuffer(\n",
    "    max_replay_buffer_size=int(buffer_size),\n",
    "    observation_dim=obs_dim,\n",
    "    action_dim=act_dim,\n",
    "    sampled_seq_len=sampled_seq_len,\n",
    "    sample_weight_baseline=0.0,\n",
    ")\n",
    "\n",
    "env_steps = collect_rollouts(\n",
    "    num_rollouts=num_init_rollouts_pool, random_actions=True, train_mode=True\n",
    ")\n",
    "_n_env_steps_total += env_steps\n",
    "\n",
    "# evaluation parameters\n",
    "last_eval_num_iters = 0\n",
    "log_interval = 5\n",
    "eval_num_rollouts = 10\n",
    "learning_curve = {\n",
    "    \"x\": [],\n",
    "    \"y\": [],\n",
    "}\n",
    "\n",
    "while _n_env_steps_total < n_env_steps_total:\n",
    "\n",
    "    env_steps = collect_rollouts(num_rollouts=num_rollouts_per_iter, train_mode=True)\n",
    "    _n_env_steps_total += env_steps\n",
    "\n",
    "    train_stats = update(int(num_updates_per_iter * env_steps))\n",
    "\n",
    "    current_num_iters = _n_env_steps_total // (\n",
    "        num_rollouts_per_iter * max_trajectory_len\n",
    "    )\n",
    "    if (\n",
    "        current_num_iters != last_eval_num_iters\n",
    "        and current_num_iters % log_interval == 0\n",
    "    ):\n",
    "        last_eval_num_iters = current_num_iters\n",
    "        average_returns = collect_rollouts(\n",
    "            num_rollouts=eval_num_rollouts,\n",
    "            train_mode=False,\n",
    "            random_actions=False,\n",
    "            deterministic=True,\n",
    "        )\n",
    "        learning_curve[\"x\"].append(_n_env_steps_total)\n",
    "        learning_curve[\"y\"].append(average_returns)\n",
    "        print(_n_env_steps_total, average_returns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Draw the learning curve"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZQAAAEGCAYAAABCa2PoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAuUklEQVR4nO3deXxV1bn/8c8DIYEkDAkhTCGEGYEiQ0Cc6gAqHaxDtdqr1d72V2693rb39rYOPzu3ttV66692sLW211prq7ZSZ61DrRURDJOMgWAChDkkhCRkPs/vj7PRgIcQkpOcc5Lv+/U6r+ystfc5z85JzpO11t5rmbsjIiLSUb1iHYCIiHQPSigiIhIVSigiIhIVSigiIhIVSigiIhIVSbEOIFaysrI8Ly8v1mGIiCSUFStWlLn7kEh1PTah5OXlUVBQEOswREQSipltO16durxERCQqlFBERCQqlFBERCQqlFBERCQqlFBERCQqlFBERCQqlFBERCQqlFBERLpIc8j5+6Z9PPP2brrj0iE99sZG6b6aQ051fRMD+/WJdSg9Sijk3P3SZtJSkjhj3GCmjhhI714W67DiQkVNA48W7OChZdvYUV4LwPzJ2dx15alkpCXHOLroUUKRbqVoXzX//dgaNu46xJcWTGDRB8fSp7ca4l1hydYyfvpK0bvf9++bxLyxgzlj3GDOHJ/FhOx0zHpWgllbWsmDS0t4cs0u6ptCzM3L5KaLJnOgup7vP7uJD9/zT+755Ezm5GXGOtSoUEKRbqE55PzvkmJ+9EIh/ZJ7c8b4wfzohUKeeXs3d14xnWkjB8Y6xG7voTe3MTgtmSe/cBYrtlWwdGsZb2w9wIsb9gKQlZ7M6eOyOGNcOMnkZqZ2ywRT39TMs2t38+DSbazafpB+fXrz8dk5XHf6aCYPG/DufrNHZ/Iff1zJ1fe9yZcvmMgN54yjV4K36Kw79uO1RX5+vmsur+5h24EavvLYGt4qqWDBKUP5/uXTyO7fl+fX7eZrf11PxeEG/u2DY/ni/An07dM71uG+q7ishuXFBxjQtw+DUpPJTEsmIzW8nZyUWK2qPZV1nHnHK3zu7LHc8qHJR9WVVhzmja0HWLr1AEuKythXVQ/AuCFpfHH+BC6ePiLhP0jdnR3ltTxSsJ0/Ld/BgZoGxmal8anTR/Px2TkM6Bu5+7WqrpFbH1/L02/v5uwJWdx91Qyy0lO6OPqTY2Yr3D0/Yp0SiiSqUMj5w7JtfP/ZTST1Nr558VQ+PmvkUf/1Vh5u5LvPbODPK0oZOySNOz8+nfwYdy/sPVTHT17ewiNv7aA5FPnvLy25N4NSk8lI60NGajKDUpPp0+JDt+VRx/4ND0pNZuG0YczJy+yyMYy7X9zMPa9s4bWvnseozNTj7ufuvFNWwxtFZfxh2XY27anilOEDuOmiSZw7aUjctlhCIWdfVT2lFYfZebCW0opaSisOU1pRy86KWkoP1tLQFKKXwfxThnLd6aM5c1xWmxKlu/PH5Tv41lPrGdivDz+5egZnjMvqgrNqHyWUCJRQOk8o5PzrA28xe3QGXzh/fKd8SOw8WMvNf36b14vKOHtCFndeMZ3hA/sdd//XNu/n1sfXsquyluvmjeamhZNJSzlxj29VXSNrdlSyansFVfVNLJw2jJmjBrXrnCoPN3LvP7bywBvFNIecf5mby6dOz6OhKcTBww1UHG6k/HADB2vC2+Gy8HZlbSNNodBRz2e8F0PLcPYeqqOuMUR2/xQ+Mn04F586ot0xt0Vjc4iz7niFU4YP4IF/ndvm40Ih58k1u/jxi5vZXn6YOXkZ3LRwclyMJ7g7a3dW8sfl21m69QC7DtbR0Hz0z39wWjI5Gf3IyUhlZEY/cjL6cf7kbHIyjp9QW7Nx9yFufHglxWU1fPH8CXxx/oS4vKghoRKKmf0IuBhoALYC/+ruB4O6W4HPAs3AF939haB8NvAA0A94FviSn+DElFA6
z6Y9h1j4//4JwPWnj+abF0+NWpeGu/NYQSnffXoDze7c9pFT+Je5uW36sKypb+JHLxTyu6UljBjYjx9c/gE+OPG9ZR1CIWfr/mpWbq9g1faDrNp+kM37qjjym9Snt9HY7OQNTuXSmSO5bOZIRg9OO+Hr1jY088AbJdz7ahFV9U1cOmMk/7VgIrmD2/fBcyKHG5p4eeM+nlqzi1c376ehKcTIQf346KnDuXj6CKaOGBDV5PL8ut18/qGV3H9dPgumDD3p4xuaQjxSsIN7Xt7C/qp6zps0hK9cNImpI7p+3OtQXSNPrN7FH5dtZ8PuQ/Tt04tzJ2aTl5VGTkY/Rmb0Y1RGP0YM6kdqcvSHoGvqm/j6X9fx+KqdzBubyU+unsnQAX07/LwNTSFKDtSwZW81m/dWseCUoXwgp30/30RLKBcCr7h7k5ndAeDuN5vZFOCPwFxgBPASMNHdm81sOfAl4E3CCeUed3+utddRQuk8Dy4t4RtPrOfymSN5fNVOLps5kjuvmN7hq632HarjlsfX8sqmfZw2JpMfXXFquz6UC0rKuekvb/PO/hounzWSnIxUVm2vYPX2g1TVNwEwsF8fZuYOYuaoDGaNHsT0nEGYwfPr9rB45U7eLD6AO8wencFlM0fy0enDGZR69OWfjc0hHiso5Scvb2bvoXrOn5zNVy+axCnDB0QKq1McqmvkxfV7eertXby+pYymkDM2K42PnjqCi6cPZ8LQ/h1+jWvvX8Y7+6v5583nd+g/6iOJ95f/2EplbSMXnzqCL18wkTFZJ07aHeHurNpxkD8t385Ta3ZT29jMlOED+ORpuVwyY8Rxxz8602MFO/jGE+tJTe7NOROHkNU/haz0ZAanpby7PSQ9hcy0ZJJa/F0dmzi27Ktiy95qistqaAq6V83gu5dM49p5o9sVW0IllJbM7DLgCne/Jmid4O4/COpeAL4FlAB/d/fJQfkngXPd/d9ae24llM5z48MrWbWtgiW3nM8vXt3Kj14oZMEpQ/nZv8xs16C4u7N41U6+/dQG6hqbuXnhZD59Rl6HWj11jc3c8/IWfvXaO7g7k4cNCCeQ3Axm5g5ibFZaq//F7zpYy19X72Txyp1s2VdNn97GeZOyuXzWSM6dlM3LG/fxP38r5J2yGmaPzuDmhZOZOya2XTkVNQ08v34PT63ZxdJ3wgnxmxdP4V/PHNPu5ywuq+G8u17lvy+YyBfmT4hKnJW1jdz32lZ++3oJDc0hPpE/istmjmR6zsCoXlRRWdvIX1ft5I/Lw2M5qcm9uWTGCD45N5cPjBwY8/GcLXur+PZTGyguq2F/dT0NTaH37WMGGanJZKUnE3IoOSZxjM5MZXx2fyYOTWfi0P6Mz05nfHZ6h36OiZxQngIecfeHzOxnwJvu/lBQ9xvgOcIJ5YfuviAoPxu42d0/GuH5FgGLAHJzc2dv23bchcekndydud9/mTPGDeYnV88E4PdLS/j6E+s5fexgfn19PultGLs4Yk9lHf93cbhVMnt0BndeMZ1xQ9KjFm9ZdT39+vRu03hKJO7O+l2HWLxqJ0+s3kVZdT3JvXvR0Bxi4tB0brpoMvNPyY75h9Ox9lXV8d+PrmHltgr+/pVzyW5nt8r3nt7AA2+U8Mat55Pdv+NdM8fG+PNXinh4+XYam52kXsYpwwcwK0j8s3IzGJXZr00/27rGZorLat59bNh9iJc27KW+KcT0nIFcPSeXj80YcVK/m13JPXyzbll1A2XV9ZRV1VNWXc/+6gYOVIe3Qw4TsqOXOI4n7hKKmb0EDItQdZu7PxHscxuQD1zu7m5mPweWHpNQngW2Az84JqHc5O4XtxaDWiido6SshnPvepXbL5vGNae916RevKqUrzz2NtNGhAduT3R3sLvzaMEOvvf0RhpDIW66aDLXn5EXl4OURzQ1h1iy9QAvbdjLjFGDuHTmyLiOt6Sshgvvfo2LTx3B/3zi1JM+vq6xmdO+/zJnjc/i59fM6oQIw8pr
Gli5rYJVOypYue0ga0oPcrihGQjf2zJjVLhVOSs3g2ED+1JyoIbi/TVHJZCdB2uPes5hA/oy/5RsPjk3V/conaTWEkpM0vGRD//jMbPrgY8C81sMrpcCo1rslgPsCspzIpRLDCwvLgfgtGO6dy6bmUN6Sh9ufHgln/jVUn7/2dMYNjDyf7SlFYe59fG1/HNLGaeNyeTOK6a3afA71pJ69+KciUM4p8VAfzzLy0rjs2eP4d5Xt3LtvFxm5mac1PFPv72bytpGrpmX20kRhmWmJbNgytB3B/ybQ07hnqp3E8yqHRW8tHHv+47r3zeJsUPSmTsmkzFZaYwdksaYrDTyBqe1u0UqrYu7Li8zWwj8GDjH3fe3KJ8KPMx7g/IvAxOCQfm3gC8Aywi3Wn7q7s+29jpqoXSO/350DX8v3MeKry2I2BXxxtYyPve7AjLTk/nDZ+cdNageCjl/WL6dHz67EQdu/dBkrjltdMLf9BbPquubOP+uVxk+qB+LbzjjpH7Wl/1iCYdqG3npy+fEvEvv4OEGVu04yP6qesZkhRPH4LTkmMfVHbXWQonH23F/BvQHXjSz1Wb2SwB3Xw88CmwAngdudPfm4JgbgPuBIsKXGrd6hZd0nuUlB5iTl3HcP+QzxmXx8OfmUVXXxBW/fIPCPVUAbD9wmGvuX8bX/7qOmbkZvPCfH+RTp3ds4F1OLD0liVs+NJk1Ow7yl5WlbT5u/a5KVm0/yDWnjY6LD+1BqcmcNymbT+SPYk5eJlnpKXERV08Td+0+dx/fSt3twO0RyguAaZ0Zl5zY7spadpTX8ukzWr9q6NRRg3j0307n2vuX8YlfLeXaebn89vUSknoZP7z8A1w1Z5Q+DLrQpTNG8vs3t3HH84UsnDaM/m24TPahN7fTt08vPj4754T7Ss8Rjy0USVDHGz+JZOLQ/vz582cwsF8ffv73rZw2NpMX/uuDXN3GmxQlenr1Mr518VQO1NTzsxazBR9PVV0jT6zeycdOHaElAuQocddCkcS1vLic9JSkNt+4lzs4lcX/fgYbdh/irPFZSiQxdOqoQVw5O4ffLinmqjmjGNvKpdmLV+3kcENzu2+Mk+5LLRSJmuXF5eTnZZzUpbKD01M4e0L8TgrYk3z1osn0TerN957ZeNx93J2H3tzG9JyBTM8Z1HXBSUJQQpGoKK9pYMu+6riY2E/aZ0j/FL44fwKvbNrH3zfti7jPWyUVbN5bzbWnqXUi76eEIlHxVknbx08kfl1/Rh5jh6Tx3ac3RJzq46E3t9G/bxIXnzoiBtFJvFNCkahYXlxOSlKvds9gKvEhOakX3/joFN4pq+GBN4qPqiurrue5dbv5+Kwc+iXHz0JlEj+UUCQqlheXMzN3EClJ+qBJdOdOymb+5GzuebmIfVV175Y/WrCDxmbn2k6+M14SlxKKdFh1fRPrd1UyV+Mn3cbXPjqF+qZmfvR8IRCe7uThZduZNzaT8dkdn/JeuiclFOmwFdsqCDnMHTM41qFIlIzJSuMzZ43hsRWlrN5xkNc276e0olaXCkurdB+KdNjy4gMk9TJmjR4U61Akir5w/gQeX7mTbz25nsy0ZLLSU7hwSqRJwkXC1EKRDlteXM60kQM7ZUlUiZ30lCRuWTiZ1TsO8sqmfVw9ZxTJSfrIkOPTb4d0SF1jM2t2VMZ8NULpHJfNHMmMUYPoZfDJ0zQYL63Tv5TSIWt2HKShOaQB+W6qVy/jZ/8yky17qxk5qF+sw5E4p4QiHbK8uBwzdId8N5aTkUpORuqJd5QeT11e0iHLS8qZNLQ/A1M166xIT6eEIu3W1BxixbYKTbciIoASinTA+l2HONzQzBwlFBEhjhOKmX3FzNzMslqU3WpmRWZWaGYXtSifbWZrg7p7THOhd4kjC2ppQF5EIE4TipmNAi4AtrcomwJcDUwFFgK/MLMjE0fdCywCJgSPhV0acA+1rLicMVlpZA/oG+tQRCQOxGVCAe4GbgK8RdklwJ/cvd7d
i4EiYK6ZDQcGuPtSd3fgQeDSrg64pwmFnLdKytU6EZF3xV1CMbOPATvdfc0xVSOBHS2+Lw3KRgbbx5ZHeu5FZlZgZgX79++PYtQ9z5Z91VTWNmr8RETeFZP7UMzsJSDSpEC3Af8XuDDSYRHKvJXy9xe63wfcB5Cfnx9xH2mb5cUHAC2oJSLviUlCcfcFkcrN7APAGGBNMK6eA6w0s7mEWx6jWuyeA+wKynMilEsnWlZczvCBfcnJ0N3TIhIWV11e7r7W3bPdPc/d8wgni1nuvgd4ErjazFLMbAzhwffl7r4bqDKzecHVXdcBT8TqHHoCd2d5cTlzx2SiC+pE5IiEmXrF3deb2aPABqAJuNHdm4PqG4AHgH7Ac8FDOsn28sPsq6rXdCsicpS4TihBK6Xl97cDt0fYrwCY1kVh9XjLgvtPNH4iIi3FVZeXJIblxeVkpiUzPjs91qGISBxRQpGTtry4nDl5GRo/EZGjKKHISdlTWcf28sNaP15E3kcJRU7K8hLN3yUikSmhyElZXnyA9JQkThneP9ahiEicUUKRk7K8uJzZozNI6q1fHRE5mj4VpM0qahrYvLeaubpcWEQiUEKRNnvryPiJEoqIRKCEIm3i7jy0bDv9U5KYnjMw1uGISBxSQpE2eW7dHl7bvJ8vXziRlKTeJz5ARHocJRQ5oer6Jr7z1AamDB/Ap+aNjnU4IhKn4nouL4kP97y8hT2H6vj5NbN0dZeIHJc+HaRVhXuq+O3rxVyVP4rZozNiHY6IxDElFDkud+frT6wjvW8SN39ocqzDEZE4p4Qix7V41U6WF5dz88LJZKYlxzocEYlzSigSUeXhRr7/7EZmjBrEVfmjTnyAiPR4cZlQzOwLZlZoZuvN7M4W5beaWVFQd1GL8tlmtjaou8c0r3qH3fW3QsprGvjepdPo1Us/ThE5sbi7ysvMzgMuAaa7e72ZZQflU4CrganACOAlM5sYLAN8L7AIeBN4FliIlgFut7WllTy0bBvXn57HtJG6iVFE2iYeWyg3AD9093oAd98XlF8C/Mnd6929GCgC5prZcGCAuy91dwceBC6NQdzdQnPI+dpf1zI4LYUvXzgx1uGISAKJx4QyETjbzJaZ2T/MbE5QPhLY0WK/0qBsZLB9bPn7mNkiMysws4L9+/d3QuiJ709vbWdNaSVf+8gpDOjbJ9bhiEgCiUmXl5m9BAyLUHUb4ZgygHnAHOBRMxsLROrI91bK31/ofh9wH0B+fn7EfXqyA9X13Pl8IfPGZnLJjBGxDkdEEkxMEoq7LzhenZndADwedF8tN7MQkEW45dHycqMcYFdQnhOhXE7SD5/bRE19E9+9ZJrWixeRkxaPXV5/Bc4HMLOJQDJQBjwJXG1mKWY2BpgALHf33UCVmc0Lru66DngiJpEnsIKSch5bUcr/OXssE4ZqNUYROXlxd5UX8Fvgt2a2DmgArg9aK+vN7FFgA9AE3Bhc4QXhgfwHgH6Er+7SFV4noak5xNf+uo4RA/vyxfnjYx2OiCSouEso7t4AXHucutuB2yOUFwDTOjm0buuBN0rYtKeKX147m9TkuPuVEJEEEY9dXtKFnn57F3f9rZBzJw3hoqlDYx2OiCQw/TvaQ4VCzo9f3MzP/l7E7NEZ3HXlqRqIF5EOUULpgarqGvmvR9bw0sa9XJU/iu9cOlWrMIpIhymh9DDbDtTwf35XwDtlNXzr4ilcf0aeWiYiEhVKKD3I61vKuPHhlZjBg5+Zy5njs2Idkoh0I0ooPYC7879LSrj92Y2MG5LG/dfNIXdwaqzDEpFuRgmlm6tvauZri9fx2IpSLpgylLuvmkF6it52EYk+fbJ0Y/uq6vj871ewcvtBvjh/Av85f4LWNhGRTqOE0k24O+U1DWwvPxx+HDjMH5Ztp7K2kV9cM4sPf2B4rEMUkW5OCSXBlNc0sHZnJdvLD7Oj/DDbDtSwvbyWHeWHqa5vOmrfCdnp/ObT+UwdoUWyRKTzKaEkkMrD
jSz48T8or2kAICWpF6MyUxmdmcppYzLJzUwNPwanMiojlX7JurdERLqOEkoCeXBpCeU1Dfzy2lnMzM1gSHqKxkREJG4ooSSI2oZm/veNEs6fnM3CaRoPEZH4o8khE8Qjb22nvKaBG84dF+tQREQiUkJJAI3NIX79z2Lm5GUwJy8z1uGIiESkhJIAnly9i50Ha9U6EZG4FncJxcxmmNmbZrbazArMbG6LulvNrMjMCs3sohbls81sbVB3j3Wj2Q5DIeeX/9jK5GH9OW9SdqzDERE5rrhLKMCdwLfdfQbwjeB7zGwKcDUwFVgI/MLMjlwXey+wiPA68xOC+m7hpY172bKvmhvOHadZgUUkrsVjQnFgQLA9ENgVbF8C/Mnd6929GCgC5prZcGCAuy8N1p5/ELi0i2PuFO7OL17dyqjMfnxEd7qLSJyLx8uG/xN4wczuIpzwzgjKRwJvttivNChrDLaPLX8fM1tEuCVDbm5uVIPuDMuKy1m94yDfvXQaSb3jMfeLiLwnJgnFzF4ChkWoug2YD/yXu//FzD4B/AZYAETq7/FWyt9f6H4fcB9Afn5+xH3iyS9e3UpWejJXzs6JdSgiIifUpoRiZhOBrwKjWx7j7ue350XdfUErr/Ug8KXg28eA+4PtUmBUi11zCHeHlQbbx5YntHU7K3lt836+etEk+vbRFCoiEv/a2kJ5DPgl8GugufPCAcLJ4BzgVeB8YEtQ/iTwsJn9GBhBePB9ubs3m1mVmc0DlgHXAT/t5Bg73S//sZX+KUl86vTRsQ5FRKRN2ppQmtz93k6N5D2fA35iZklAHcGYh7uvN7NHgQ1AE3Cjux9JbjcADwD9gOeCR8IqKavh2bW7WfTBcQzo2yfW4YiItElbE8pTZvbvwGKg/kihu5dHOyB3fx2YfZy624HbI5QXANOiHUus/Oq1d0jq3YvPnJUX61BERNqsrQnl+uDrV1uUOTA2uuHI3kN1/GVFKVfm55Ddv2+swxERabMTJhQz6wXc4u6PdEE8Pd5vXy+mKRRi0QeVq0UksZzw5gZ3DwE3dkEsPV7l4UYeenMbH50+gtGD02IdjojISWnr3XIvmtlXzGyUmWUeeXRqZD3Q798soaahmc+fo0kgRSTxtHUM5TPB15YtFY2hRFFtQzP/u6SE8yYNYcqIASc+QEQkzrQpobj7mM4OpKd7tGAHB2oauOHc8bEORUSkXdp6p/x1kcrd/cHohtMzNTaHuO+1d5g9OoM5eRmxDkdEpF3a2uU1p8V2X8Lzba0kPLOvdNCza3ez82At37lkqqaoF5GE1dYury+0/N7MBgK/75SIeqAXN+wlu3+KFtASkYTW3jnRDxOeS0s6KBRy3th6gLPGZ9Grl1onIpK42jqG8hTvTQnfC5hCeMJI6aANuw9RXtPAWROyYh2KiEiHtHUM5a4W203ANncvPd7O0nZLisoAOHO8EoqIJLa2dnl92N3/ETyWuHupmd3RqZH1EK8XlTFxaDpDB2jeLhFJbG1NKBdEKPtQNAPpieoam1leXK7WiYh0C612eZnZDcC/A2PN7O0WVf2BJZ0ZWE+wYlsF9U0hzlJCEZFu4ERjKA8TXqzqB8AtLcqrOmMtlJ7m9aIyknoZp40dHOtQREQ6rNUuL3evdPcSd/8k4fXcz3f3bUAvM2v3dCxmdqWZrTezkJnlH1N3q5kVmVmhmV3Uony2ma0N6u6x4A5AM0sxs0eC8mVmltfeuLrakqIyZuYOIj2lrddGiIjErzaNoZjZN4GbgVuDomTgoQ687jrgcuC1Y15nCnA1MBVYCPzCzHoH1fcSXg54QvBYGJR/Fqhw9/HA3UBCXCxQUdPA2p2VnDV+SKxDERGJirYOyl8GfAyoAXD3XYTHUdrF3Te6e2GEqkuAP7l7vbsXA0XAXDMbDgxw96Xu7oSnfLm0xTG/C7b/DMy3BJi/ZOk7B3CHsyaou0tEuoe2JpSG4IPcAcyss1Z/GgnsaPF9aVA2Mtg+tvyo
Y9y9CagE4v5T+p9bykhPSWJ6zqBYhyIiEhVtWQLYgKfN7FfAIDP7HOH1UX59guNeAoZFqLrN3Z843mERyryV8taOiRTTIsLdZuTm5h4nhK6xpKiMeWMH06d3e2e/ERGJLydMKO7uZnYp4TGUQ8Ak4Bvu/uIJjlvQjnhKCQ/+H5ED7ArKcyKUtzym1MySgIFAxCvQ3P0+4D6A/Pz8iEmnK2w/cJjt5Yf5zJl5sQpBRCTq2vrv8VLgoLt/1d2/cqJk0gFPAlcHV26NITz4vtzddwNVZjYvaDFdBzzR4pjrg+0rgFeC7rm49Xow3Yrm7xKR7qSt16ueB/ybmW0jGJgHcPfp7XlRM7sM+CkwBHjGzFa7+0Xuvt7MHgU2EJ4z7EZ3bw4OuwF4AOhH+N6Y54Ly3wC/N7Miwi2Tq9sTU1daUlTGsAF9GTckPdahiIhETVsTSlSnWXH3xcDi49TdDtweobwAmBahvA64MprxdaZQyFmytYz5k4dqMS0R6VbausDWts4OpKdYv+sQBw83cra6u0Skm9ElRl3syPjJGePj/spmEZGTooTSxV4v2s+kof3J7q/p6kWke1FC6UJ1jc28VVKhq7tEpFtSQulCBSUVNGi6ehHpppRQutA/i/bTp7cxd0xmrEMREYk6JZQuFJ6uPoM0TVcvIt2QEkoXKa9pYP2uQ+ruEpFuSwmli7yxtSyYrl4JRUS6JyWULrKkqIz+KUlMHzkw1qGIiHQKJZQu4O78c0sZ88YNJknT1YtIN6VPty6wvfwwpRW1mm5FRLo1JZQu8M8t4elWztSAvIh0Y0ooXWBJURkjBvZlbFZnrZwsIhJ7SiidrDnkvLH1AGeOz9J09SLSrSmhdLJ1OyuprG3U5cIi0u0poXSyd6erH6eEIiLdW0wSipldaWbrzSxkZvktyi8wsxVmtjb4en6LutlBeZGZ3ROsLU+w/vwjQfkyM8uLwSkd1+tbypg8rD9D+qfEOhQRkU4VqxbKOuBy4LVjysuAi939A8D1wO9b1N0LLAImBI+FQflngQp3Hw/cDdzRiXGflNqGZlZsq9B0KyLSI8Qkobj7RncvjFC+yt13Bd+uB/oGLZDhwAB3X+ruDjwIXBrsdwnwu2D7z8B8i5PR77dKymloDmn8RER6hHgeQ/k4sMrd64GRQGmLutKgjODrDgB3bwIqgYjr65rZIjMrMLOC/fv3d1rgR7xeVEZy716arl5EeoROm0fdzF4ChkWous3dnzjBsVMJd11deKQowm7ehrqjC93vA+4DyM/Pj7hPNL2+pYxZoweRmqzp6kWk++u0Tzp3X9Ce48wsB1gMXOfuW4PiUiCnxW45wK4WdaOAUjNLAgYC5e0KOooONzSxcc8hvnD+hFiHIiLSJeKqy8vMBgHPALe6+5Ij5e6+G6gys3nB+Mh1wJFWzpOEB/ABrgBeCcZZYmrz3mrcYcrw/rEORUSkS8TqsuHLzKwUOB14xsxeCKr+AxgPfN3MVgeP7KDuBuB+oAjYCjwXlP8GGGxmRcCXgVu66jxaU7jnEACThg2IcSQiIl0jJp377r6YcLfWseXfA753nGMKgGkRyuuAK6MdY0dt2lNF3z69yM1MjXUoIiJdIq66vLqTzXurmDi0P717xcUVzCIinU4JpZMU7qli0lCNn4hIz6GE0gnKquspq25g0jAlFBHpOZRQOkHhnioAJmtAXkR6ECWUTrApSChqoYhIT6KE0gkK9xxicFqyZhgWkR5FCaUTFO6pUutERHocJZQoC4WczXurlVBEpMdRQomy7eWHqW1sZrISioj0MEooUfbegLyu8BKRnkUJJcoK91RhBhOHpsc6FBGRLqWEEmWFew+Rm5mqNVBEpMdRQomyTZpyRUR6KCWUKKprbKakrEYD8iLSIymhRFHRvmpCrgF5EemZlFCiSFOuiEhPFqsVG680s/VmFjKz/Aj1uWZWbWZfaVE228zWmlmRmd0TLAWMmaWY2SNB+TIzy+vCUzlK4Z5D
JCf1Im+wFtUSkZ4nVi2UdcDlwGvHqb+b95b4PeJeYBEwIXgsDMo/C1S4+/jguDuiHm0bbdpTxYTsdJJ6q+EnIj1PTD753H2juxdGqjOzS4F3gPUtyoYDA9x9qbs78CBwaVB9CfC7YPvPwPwjrZeupjm8RKQni6t/pc0sDbgZ+PYxVSOB0hbflwZlR+p2ALh7E1AJDD7O8y8yswIzK9i/f380Q6eipoF9VfW6wktEeqxOSyhm9pKZrYvwuKSVw74N3O3u1cc+XYR9vQ11Rxe63+fu+e6eP2TIkBOfxEnQlCsi0tN12u3c7r6gHYedBlxhZncCg4CQmdUBfwFyWuyXA+wKtkuBUUCpmSUBA4Hy9sbdXoV7DgGohSIiPVZczQ/i7mcf2TazbwHV7v6z4PsqM5sHLAOuA34a7PokcD2wFLgCeCUYZ+lShXurGJTah2wtqiUiPVSsLhu+zMxKgdOBZ8zshTYcdgNwP1AEbOW9q8B+Aww2syLgy8AtnRDyCR2ZciVG1wOIiMRcTFoo7r4YWHyCfb51zPcFwLQI+9UBV0YzvpMVCjmb91RxxeycE+8sItJNxdVVXolq58FaahqaNSAvIj2aEkoUaMoVEREllKg4coWXEoqI9GRKKFGwaU8VORn9SE+Jq4vmRES6lBJKFBTuqdL9JyLS4ymhdFB9UzPvlNWou0tEejwllA7auq+G5pDrCi8R6fGUUDqocK+mXBERASWUDtu0p4o+vY0xWWmxDkVEJKaUUDqocE8V44ak00eLaolID6dPwQ7SoloiImFKKB1QWdvI7so6JRQREZRQOmTz3vCUKxqQFxFRQukQrdIoIvIeJZQOKNxziP59kxgxsG+sQxERiTkllA4o1KJaIiLvitWKjVea2XozC5lZ/jF1081saVC/1sz6BuWzg++LzOweCz7FzSzFzB4JypeZWV5XnIO7h1dp1PiJiAgQuxbKOuBy4LWWhWaWBDwEfN7dpwLnAo1B9b3AImBC8FgYlH8WqHD38cDdwB2dHTzA7so6quqaNCAvIhKISUJx943uXhih6kLgbXdfE+x3wN2bzWw4MMDdl7q7Aw8ClwbHXAL8Ltj+MzDfuqAPqlAD8iIiR4m3MZSJgJvZC2a20sxuCspHAqUt9isNyo7U7QBw9yagEhjc2YG+e4XXULVQREQAOm1FKDN7CRgWoeo2d3+ilXjOAuYAh4GXzWwFcCjCvn7kpVqpOzamRYS7zcjNzT1+8G1QuOcQwwf2ZWBqnw49j4hId9FpCcXdF7TjsFLgH+5eBmBmzwKzCI+r5LTYLwfY1eKYUUBpMAYzECg/Tkz3AfcB5OfnR0w6baUBeRGRo8Vbl9cLwHQzSw2SwznABnffDVSZ2bxgfOQ64Egr50ng+mD7CuCVYJyl0zQ2h9i6v1oJRUSkhVhdNnyZmZUCpwPPmNkLAO5eAfwYeAtYDax092eCw24A7geKgK3Ac0H5b4DBZlYEfBm4pbPjLy6robHZdYWXiEgLndbl1Rp3XwwsPk7dQ4S7uI4tLwCmRSivA66MdoyteW9AXld4iYgcEW9dXgmhcM8hevcyxmVrUS0RkSOUUNqhcE8VY7PSSEnqHetQRETihhJKO+gKLxGR91NCOUnV9U2UVtRqQF5E5BhKKCdJU66IiESmhHKSjiQUtVBERI6mhHKSstKTuWDKUEYO6hfrUERE4kpM7kNJZBdOHcaFUyNNUSYi0rOphSIiIlGhhCIiIlGhhCIiIlGhhCIiIlGhhCIiIlGhhCIiIlGhhCIiIlGhhCIiIlFhnbxabtwys/3AtljH0U5ZQFmsg4ii7nY+0P3OqbudD3S/c+qq8xnt7kMiVfTYhJLIzKzA3fNjHUe0dLfzge53Tt3tfKD7nVM8nI+6vEREJCqUUEREJCqUUBLTfbEOIMq62/lA9zun7nY+0P3OKebnozEUERGJCrVQREQkKpRQREQkKpRQ4oSZlZjZWjNbbWYFQVmmmb1oZluCrxkt9r/V
zIrMrNDMLmpRPjt4niIzu8fMrIvi/62Z7TOzdS3Koha/maWY2SNB+TIzy4vROX3LzHYG79NqM/twopyTmY0ys7+b2UYzW29mXwrKE/J9auV8Evk96mtmy81sTXBO3w7KE+M9cnc94uABlABZx5TdCdwSbN8C3BFsTwHWACnAGGAr0DuoWw6cDhjwHPChLor/g8AsYF1nxA/8O/DLYPtq4JEYndO3gK9E2DfuzwkYDswKtvsDm4O4E/J9auV8Evk9MiA92O4DLAPmJcp71Kl/kHqc1C9SCe9PKIXA8GB7OFAYbN8K3NpivxeCX5zhwKYW5Z8EftWF55DH0R++UYv/yD7BdhLhO4ItBud0vA+rhDmnFrE8AVzQHd6nY86nW7xHQCqwEjgtUd4jdXnFDwf+ZmYrzGxRUDbU3XcDBF+zg/KRwI4Wx5YGZSOD7WPLYyWa8b97jLs3AZXA4E6LvHX/YWZvB11iR7oeEuqcgm6OmYT/A0749+mY84EEfo/MrLeZrQb2AS+6e8K8R0oo8eNMd58FfAi40cw+2Mq+kcZFvJXyeNOe+OPl3O4FxgEzgN3A/wTlCXNOZpYO/AX4T3c/1NquEcri7pwinE9Cv0fu3uzuM4AcYK6ZTWtl97g6JyWUOOHuu4Kv+4DFwFxgr5kNBwi+7gt2LwVGtTg8B9gVlOdEKI+VaMb/7jFmlgQMBMo7LfLjcPe9wR98CPg14ffpqPgCcXlOZtaH8IfvH9z98aA4Yd+nSOeT6O/REe5+EHgVWEiCvEdKKHHAzNLMrP+RbeBCYB3wJHB9sNv1hPuICcqvDq7WGANMAJYHTeEqM5sXXNFxXYtjYiGa8bd8riuAVzzoBO5KR/6oA5cRfp8gAc4peP3fABvd/cctqhLyfTre+ST4ezTEzAYF2/2ABcAmEuU96orBJT1OOPg2lvCVGmuA9cBtQflg4GVgS/A1s8UxtxG+oqOQFldyAfmE/4C2Aj+j6wYQ/0i4e6GR8H9An41m/EBf4DGgiPDVK2NjdE6/B9YCbxP+wxyeKOcEnEW4a+NtYHXw+HCivk+tnE8iv0fTgVVB7OuAbwTlCfEeaeoVERGJCnV5iYhIVCihiIhIVCihiIhIVCihiIhIVCihiIhIVCihiMQxM5vRcrZckXimhCIS32YQvrdCJO4poYi0g5ldG6xbsdrMfmVmvYPyajO7PVjP4k0zG2pmAy283k2vYJ9UM9sRTBvS8jmvNLN1wbGvmVky8B3gquB1rgpmVfitmb1lZqvM7JLg2E+b2RNm9nywLsY3g/I0M3smeM51ZnZV1/6kpCdRQhE5SWZ2CnAV4Qk9ZwDNwDVBdRrwprufCrwGfM7dKwnPgnBOsM/FwAvu3njMU38DuCg49mPu3hCUPeLuM9z9EcJ3Rb/i7nOA84AfBdP1QHjOqmsIt2quNLN8wvNA7XL3U919GvB8NH8WIi0poYicvPnAbOCtYJrx+YSnzwFoAJ4OtlcQXk8F4BHCSQiCRY0iPO8S4AEz+xzQ+zivfSFwS/C6rxKeRiM3qHvR3Q+4ey3wOOGpSdYCC8zsDjM7O0huIp0iKdYBiCQgA37n7rdGqGv09+Yzaua9v7EngR+YWSbhZPTKsQe6++fN7DTgI8BqM5txnNf+uLsXHlUYPu7YeZTc3Teb2WzC4zA/MLO/uft32nSWIidJLRSRk/cycIWZZcO7632Pbu0Ad68mPBHfT4Cn3b352H3MbJy7L3P3bxBeRW8UUEV4edsjXgC+EMwgi5nNbFF3QRBLP+BSYImZjQAOu/tDwF2ElzQW6RRqoYicJHffYGZfI7zCZi/CsxHfCGw7waGPEJ7l9dzj1P/IzCYQboW8THjcZTvvdXH9APgu8P+At4OkUgJ8NDj+dcIz7Y4HHnb3AjO7KHjeUBDnDSd7viJtpdmGRboBM/s0kO/u/xHrWKTnUpeXiIhEhVooIiISFWqhiIhIVCihiIhIVCihiIhIVCihiIhIVCihiIhIVPx/
HMDGd8wzKDgAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.plot(learning_curve[\"x\"], learning_curve[\"y\"])\n",
    "plt.xlabel(\"env steps\")\n",
    "plt.ylabel(\"return\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "b5f49fb63f78fde0f27b95a7c8e14eeaa9af6d816174ff450f7bbbcd21c7c97c"
  },
  "kernelspec": {
   "display_name": "Python 3.8.5 64-bit ('dyn': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
