{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "import gym\n",
    "from torch import nn\n",
    "from torch.nn import functional as F\n",
    "import matplotlib.pyplot as plt\n",
    "from torch.utils import tensorboard\n",
    "\n",
    "import sys\n",
    "sys.path.insert(0, \"Code/src/\")\n",
    "\n",
    "from environment import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# helper function to convert numpy arrays to tensors\n",
    "def t(x): return torch.from_numpy(x).float()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mish(input):\n",
    "    return input * torch.tanh(F.softplus(input))\n",
    "\n",
    "class Mish(nn.Module):\n",
    "    def __init__(self): super().__init__()\n",
    "    def forward(self, input): return mish(input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Actor(nn.Module):\n",
    "    def __init__(self, state_dim, n_actions, activation=nn.Tanh):\n",
    "        super().__init__()\n",
    "        self.model = nn.Sequential(\n",
    "            nn.Linear(state_dim, 64),\n",
    "            activation(),\n",
    "            nn.Linear(64, 32),\n",
    "            activation(),\n",
    "            nn.Linear(32, n_actions),\n",
    "            nn.Softmax()\n",
    "        )\n",
    "    \n",
    "    def forward(self, X):\n",
    "        return self.model(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Critic module\n",
    "class Critic(nn.Module):\n",
    "    def __init__(self, state_dim, activation=nn.Tanh):\n",
    "        super().__init__()\n",
    "        self.model = nn.Sequential(\n",
    "            nn.Linear(state_dim, 64),\n",
    "            activation(),\n",
    "            nn.Linear(64, 32),\n",
    "            activation(),\n",
    "            nn.Linear(32, 1)\n",
    "        )\n",
    "    \n",
    "    def forward(self, X):\n",
    "        return self.model(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "env = gym.make(\"CartPole-v1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torch._C.Generator at 0x7fa947896910>"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# config\n",
    "state_dim = env.observation_space.shape[0]\n",
    "n_actions = env.action_space.n\n",
    "actor = Actor(state_dim, n_actions, activation=Mish)\n",
    "critic = Critic(state_dim, activation=Mish)\n",
    "adam_actor = torch.optim.Adam(actor.parameters(), lr=3e-4)\n",
    "adam_critic = torch.optim.Adam(critic.parameters(), lr=1e-3)\n",
    "\n",
    "torch.manual_seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clip_grad_norm_(module, max_grad_norm):\n",
    "    nn.utils.clip_grad_norm_([p for g in module.param_groups for p in g[\"params\"]], max_grad_norm)\n",
    "\n",
    "def policy_loss(old_log_prob, log_prob, advantage, eps):\n",
    "    ratio = (log_prob - old_log_prob).exp()\n",
    "    clipped = torch.clamp(ratio, 1-eps, 1+eps)*advantage\n",
    "    \n",
    "    m = torch.min(ratio*advantage, clipped)\n",
    "    return -m\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.0142331  -0.04852999 -0.0067896   0.02096501] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0152037  -0.24355392 -0.0063703   0.31149803] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02007478 -0.04834179 -0.00014034  0.01681292] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-2.10416161e-02 -2.43461729e-01  1.95920342e-04  3.09451570e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02591085 -0.43858647  0.00638495  0.60219628] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03468258 -0.63379715  0.01842888  0.89688349] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04735852 -0.43892981  0.03636655  0.61004991] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05613712 -0.24433457  0.04856755  0.32903945] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06102381 -0.04993646  0.05514833  0.05205936] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06202254  0.14435314  0.05618952 -0.22272658] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05913548 -0.05152503  0.05173499  0.087138  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06016598  0.14281864  0.05347775 -0.18878417] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0573096  -0.05302599  0.04970207  0.12027778] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05837012  0.14134993  0.05210762 -0.15631966] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05554313 -0.05447786  0.04898123  0.15233604] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05663268  0.13990976  0.05202795 -0.12450097] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05383449 -0.05591746  0.04953793  0.18413171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05495284  0.13846197  0.05322056 -0.09252156] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0521836  -0.05738081  0.05137013  0.21646613] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05333121 -0.25319806  0.05569946  0.52490072] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05839518 -0.05890237  0.06619747  0.25027633] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05957322  0.13521488  0.071203   -0.02081394] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05686893 -0.06085214  0.07078672  0.29345737] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05808597 -0.25690818  0.07665587  0.60759887] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06322413 -0.06293694  0.08880784  0.3400112 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06448287 -0.25920287  0.09560807  0.65932512] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06966693 -0.0655324   0.10879457  0.3982121 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07097758 -0.26201604  0.11675881  0.72311761] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0762179  -0.06868595  0.13122116  0.46934608] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07759162 -0.26539374  0.14060809  0.80033947] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08289949 -0.46213542  0.15661487  1.13374221] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0921422  -0.26937056  0.17928972  0.89399058] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09752961 -0.46641148  0.19716953  1.23724352] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.03327337 -0.03584403 -0.03325659  0.02892758] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03399025  0.15973867 -0.03267803 -0.27405988] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03079548  0.35531127 -0.03815923 -0.5768678 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02368925  0.16074438 -0.04969659 -0.29644601] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02047436  0.35653827 -0.05562551 -0.60437903] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0133436   0.55239222 -0.06771309 -0.91405103] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00229576  0.74836151 -0.08599411 -1.22722377] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01267148  0.94447862 -0.11053858 -1.5455635 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03156105  0.7508444  -0.14144985 -1.28931705] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04657794  0.55777623 -0.1672362  -1.04405488] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05773346  0.36522159 -0.18811729 -0.80819111] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06503789  0.56235459 -0.20428112 -1.15365476] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.03291399  0.04187462  0.02491247 -0.03322739] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03375148 -0.15359557  0.02424792  0.26721041] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03067957 -0.34905503  0.02959213  0.56744162] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02369847 -0.15436041  0.04094096  0.28422635] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.02061126 0.04015443 0.04662549 0.00473187] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02141435  0.23457777  0.04672012 -0.27288327] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.02610591 0.03882139 0.04126246 0.0341616 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02688233  0.23332808  0.04194569 -0.24522237] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03154889  0.42782661  0.03704124 -0.52438513] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04010543  0.62240823  0.02655354 -0.80516971] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05255359  0.81715629  0.01045015 -1.089383  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06889672  1.01213892 -0.01133751 -1.37876869] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0891395   0.81716036 -0.03891289 -1.08965281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1054827   0.6225724  -0.06070594 -0.80942932] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11793415  0.42833246 -0.07689453 -0.53644269] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1265008   0.6244466  -0.08762338 -0.85232937] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13898973  0.82064681 -0.10466997 -1.17122893] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.15540267  0.62702986 -0.12809455 -0.91310979] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.16794327  0.43385169 -0.14635675 -0.66327347] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1766203   0.24103633 -0.15962222 -0.42001795] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.18144103  0.43801732 -0.16802258 -0.75846313] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.19020137  0.24555994 -0.18319184 -0.52300905] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.19511257  0.44272338 -0.19365202 -0.86736324] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.03313228 -0.03366487 -0.01037629  0.04011846] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03380558  0.16160433 -0.00957392 -0.25582014] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03057349  0.35686166 -0.01469032 -0.55150739] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02343626  0.16194907 -0.02572047 -0.26348886] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02019727 -0.03279649 -0.03099024  0.02097188] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0208532  -0.22746062 -0.03057081  0.30371819] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02540242 -0.42213385 -0.02449644  0.58660519] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03384509 -0.22667753 -0.01276434  0.28630743] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03837864 -0.03137589 -0.00703819 -0.01037378] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03900616  0.16384629 -0.00724567 -0.305269  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03572924  0.35907074 -0.01335105 -0.6002282 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02854782  0.16413809 -0.02535561 -0.31178038] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02526506 -0.03061362 -0.03159122 -0.02720045] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.02587733 -0.22526863 -0.03213523  0.25535009] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0303827  -0.41991737 -0.02702823  0.53772633] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03878105 -0.6146491  -0.0162737   0.82177191] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-5.10740334e-02 -4.19308310e-01  1.61739722e-04  5.24015264e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0594602  -0.61443254  0.01064204  0.81674915] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07174885 -0.80969856  0.02697703  1.1127603 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08794282 -1.00516422  0.04923223  1.41378239] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10804611 -1.20086045  0.07750788  1.72143949] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13206331 -1.00670694  0.11193667  1.45384808] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.15219745 -1.20301128  0.14101363  1.77930247] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17625768 -1.00973023  0.17659968  1.53357938] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.19645228 -0.81712052  0.20727127  1.30080924] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[0.03229582 0.01486164 0.04160087 0.02084488] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03259305 -0.18083146  0.04201777  0.32635769] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02897642 -0.37652567  0.04854492  0.6319895 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02144591 -0.18211342  0.06118471  0.35498128] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01780364 -0.37804954  0.06828434  0.66631259] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01024265 -0.57405142  0.08161059  0.97969025] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00123838 -0.38011267  0.1012044   0.71371724] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00884063 -0.18652662  0.11547874  0.4545273 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01257117 -0.38307586  0.12456929  0.78126291] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02023268 -0.18986625  0.14019455  0.53022291] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02403001 -0.38665307  0.150799    0.86358939] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03176307 -0.58347074  0.16807079  1.19963254] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04343249 -0.39087352  0.19206344  0.96398795] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.00965156 -0.03446654  0.04261495 -0.03213152] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01034089  0.16001921  0.04197232 -0.31107028] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00714051  0.35451883  0.03575091 -0.59022652] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-5.01329096e-05  1.58915024e-01  2.39463808e-02 -2.86499776e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00312817 -0.03654011  0.01821639  0.01363843] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00239737 -0.2319185   0.01848915  0.31201272] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.002241   -0.42729891  0.02472941  0.61046874] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01078698 -0.62275764  0.03693878  0.91083689] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02324214 -0.42815449  0.05515552  0.62998883] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03180523 -0.23384381  0.0677553   0.35517354] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0364821  -0.42986043  0.07485877  0.66842864] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04507931 -0.23585485  0.08822734  0.40022321] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04979641 -0.04208796  0.09623181  0.13660856] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05063817 -0.23844712  0.09896398  0.45803306] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05540711 -0.43481862  0.10812464  0.780197  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06410348 -0.24133606  0.12372858  0.52339474] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0689302  -0.04815281  0.13419647  0.27211855] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06989326 -0.24490894  0.13963884  0.60393467] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07479144 -0.05198762  0.15171754  0.35828998] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07583119  0.14068861  0.15888334  0.11702959] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07301742 -0.05631086  0.16122393  0.45532228] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07414363  0.13620823  0.17033037  0.2174834 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07141947 -0.06088733  0.17468004  0.55868378] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07263722  0.13140804  0.18585372  0.32572535] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07000906  0.32346481  0.19236822  0.09692832] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06353976  0.12618096  0.19430679  0.44360359] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06101614  0.3180994   0.20317886  0.21791069] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05465415  0.50982515  0.20753708 -0.00443781] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04445765  0.70146012  0.20744832 -0.2251431 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03042845  0.50407056  0.20294546  0.12514639] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02034704  0.30670708  0.20544839  0.47437315] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.03314665 -0.02280485 -0.03587551 -0.02376779] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03269055 -0.21739444 -0.03635086  0.25738364] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02834266 -0.02177287 -0.03120319 -0.04653965] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0279072  -0.21643382 -0.03213398  0.23613738] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02357853 -0.41108228 -0.02741123  0.51851358] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01535688 -0.6058078  -0.01704096  0.80243424] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 3.24072476e-03 -8.00691971e-01 -9.92277777e-04  1.08970825e+00] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01277311 -0.60555695  0.02080189  0.79671414] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02488425 -0.80095808  0.03673617  1.09586773] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04090342 -0.99654407  0.05865352  1.39984678] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0608343  -0.80219817  0.08665046  1.12606347] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07687826 -0.99834205  0.10917173  1.44461883] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0968451  -0.80471972  0.13806411  1.18794715] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1129395  -0.61163087  0.16182305  0.94153195] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12517211 -0.41901591  0.18065369  0.70375283] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13355243 -0.61612029  0.19472874  1.04742179] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.02417583 -0.04783762 -0.04559717 -0.02518026] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02513258  0.14790757 -0.04610078 -0.33189379] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02217443  0.34365436 -0.05273865 -0.63875085] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01530134  0.14930589 -0.06551367 -0.36313145] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01231522 -0.04482672 -0.0727763  -0.09180446] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01321176 -0.23883418 -0.07461239  0.17705907] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01798844 -0.43281346 -0.07107121  0.44530275] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02664471 -0.23676175 -0.06216515  0.13109023] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03137994 -0.43094066 -0.05954335  0.40353119] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03999876 -0.235027   -0.05147272  0.0926861 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0446993  -0.42937485 -0.049619    0.36869547] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0532868  -0.23358428 -0.04224509  0.06078907] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05795848 -0.03788288 -0.04102931 -0.24491748] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05871614 -0.23239553 -0.04592766  0.03454671] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06336405 -0.03664608 -0.04523672 -0.27226583] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06409697 -0.23109433 -0.05068204  0.00581311] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06871886 -0.42545417 -0.05056578  0.28208437] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07722794 -0.2296488  -0.04492409 -0.02610828] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08182092 -0.42409867 -0.04544626  0.25206901] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09030289 -0.22835823 -0.04040488 -0.0545951 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09487005 -0.03268092 -0.04149678 -0.3597473 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09552367 -0.22718918 -0.04869172 -0.08043264] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10006746 -0.03140426 -0.05030038 -0.38807166] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10069554 -0.22577748 -0.05806181 -0.11166267] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10521109 -0.42002146 -0.06029506  0.16215173] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11361152 -0.22409048 -0.05705203 -0.14892667] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11809333 -0.41835102 -0.06003056  0.12522583] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12646035 -0.22242272 -0.05752605 -0.18577561] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.1309088  -0.0265269  -0.06124156 -0.49603661] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13143934 -0.22073422 -0.07116229 -0.22326552] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13585403 -0.02467112 -0.0756276  -0.53751988] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13634745  0.1714281  -0.086378   -0.85304124] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13291889 -0.0224169  -0.10343882 -0.58872136] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13336723  0.17398982 -0.11521325 -0.91211356] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12988743  0.37046622 -0.13345552 -1.23867209] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1224781   0.17728666 -0.15822896 -0.99059893] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11893237 -0.01540453 -0.17804094 -0.75149826] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11924046  0.18166731 -0.19307091 -1.09450115] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.04702966 -0.02248714 -0.04405846 -0.04998271] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0474794   0.17323797 -0.04505811 -0.35623446] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04401464  0.36897061 -0.0521828  -0.66277804] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03663523  0.17461202 -0.06543836 -0.38697178] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03314299 -0.01952294 -0.0731778  -0.11561776] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03353345  0.1765671  -0.07549015 -0.43046069] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03000211 -0.01740923 -0.08409937 -0.16249792] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03035029  0.17880968 -0.08734933 -0.48048212] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0267741  -0.01497765 -0.09695897 -0.21655741] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02707365  0.18138706 -0.10129012 -0.53818232] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02344591  0.37777624 -0.11205376 -0.86098462] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01589038  0.18434399 -0.12927346 -0.6055308 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01220351 -0.00875589 -0.14138407 -0.35619851] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01237862  0.1880634  -0.14850804 -0.68991019] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00861735 -0.00471979 -0.16230625 -0.44741942] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00871175 -0.19721845 -0.17125463 -0.20997518] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-1.26561196e-02 -1.14387092e-04 -1.75454138e-01 -5.51409460e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01265841  0.19698131 -0.18648233 -0.89383336] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00871878  0.00481105 -0.20435899 -0.66508419] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.02953789  0.011853   -0.01966453 -0.03653862] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02930083  0.20725134 -0.0203953  -0.33536046] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0251558   0.0124255  -0.02710251 -0.04917828] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02490729 -0.18229755 -0.02808607  0.23483187] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02855324  0.01321419 -0.02338944 -0.0665763 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02828896  0.20866353 -0.02472096 -0.366546  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02411569  0.01390144 -0.03205188 -0.08175934] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02383766  0.20946784 -0.03368707 -0.38437992] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0196483   0.40505146 -0.04137467 -0.68749102] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01154727  0.21052749 -0.05512449 -0.40811546] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00733672  0.40638594 -0.0632868  -0.71765528] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 7.90996605e-04  6.02323976e-01 -7.76399029e-02 -1.02956784e+00] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01283748  0.40831621 -0.09823126 -0.76223607] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0210038   0.21467469 -0.11347598 -0.50200761] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02529729  0.02131977 -0.12351613 -0.24713097] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02572369  0.21796942 -0.12845875 -0.57607967] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03008308  0.02485996 -0.13998035 -0.32646526] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03058028 -0.16802036 -0.14650965 -0.08099466] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02721987 -0.36077144 -0.14812954  0.16211455] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02000444 -0.55349655 -0.14488725  0.40464538] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00893451 -0.74629855 -0.13679435  0.64837019] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00599146 -0.54956277 -0.12382694  0.31593109] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01698272 -0.35291459 -0.11750832 -0.01309481] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02404101 -0.54617243 -0.11777022  0.24032485] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03496446 -0.73943235 -0.11296372  0.49366223] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0497531  -0.932795   -0.10309048  0.74871684] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.068409   -1.12635523 -0.08811614  1.00726079] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09093611 -0.93017438 -0.06797092  0.68825855] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1095396  -1.12429039 -0.05420575  0.95879215] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1320254  -1.31864332 -0.03502991  1.23396467] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.15839827 -1.12308898 -0.01035062  0.93051642] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.18086005 -0.92782887  0.00825971  0.63459892] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.19941663 -1.12306505  0.02095169  0.92987153] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.22187793 -0.92823208  0.03954912  0.64384556] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.24044257 -1.12388226  0.05242603  0.94871575] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.26292021 -1.31966934  0.07140035  1.25739908] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.2893136  -1.12553016  0.09654833  0.98790615] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.3118242  -1.32180292  0.11630645  1.30928532] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.33826026 -1.51819029  0.14249216  1.63599325] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.36862407 -1.7146675   0.17521202  1.96947323] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01946625  0.03130566 -0.03261381 -0.01932938] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01884014 -0.16333376 -0.0330004   0.26288773] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02210681  0.03224332 -0.02774265 -0.04001852] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02146195 -0.16247005 -0.02854302  0.24378385] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02471135 -0.35717292 -0.02366734  0.52732869] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03185481 -0.55195402 -0.01312077  0.81246096] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04289389 -0.74689381  0.00312845  1.10098809] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05783176 -0.55181317  0.02514822  0.80928832] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06886803 -0.74727052  0.04133398  1.10977443] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08381344 -0.55271534  0.06352947  0.83033966] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09486774 -0.74864546  0.08013626  1.14230706] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10984065 -0.94471792  0.10298241  1.45900676] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12873501 -0.75099877  0.13216254  1.20019086] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.14375499 -0.55781051  0.15616636  0.95167837] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.1549112  -0.36509566  0.17519992  0.71185038] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.16221311 -0.56215488  0.18943693  1.05415993] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.00345975  0.04054036 -0.04521321 -0.02591354] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00264894 -0.15390502 -0.04573148  0.2521683 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00572704 -0.34834513 -0.04068812  0.53008316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01269394 -0.5428718  -0.03008645  0.80967266] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02355138 -0.73756887 -0.013893    1.09274202] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03830276 -0.93250504  0.00796184  1.38103363] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05695286 -0.73748335  0.03558251  1.09085114] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07170252 -0.542848    0.05739953  0.80954196] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08255948 -0.34855754  0.07359037  0.53545244] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08953064 -0.54463296  0.08429942  0.85038558] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1004233  -0.35075544  0.10130713  0.5853567 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1074384  -0.15718751  0.11301427  0.32622688] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11058215  0.0361592   0.11953881  0.07121245] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10985897 -0.16045564  0.12096305  0.39909084] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11306808 -0.35706728  0.12894487  0.72732978] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12020943 -0.55371378  0.14349147  1.05765555] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1312837  -0.7504151   0.16464458  1.39171708] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.14629201 -0.55768108  0.19247892  1.15471277] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.0321395  -0.02891694  0.00143799 -0.00817167] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03156116 -0.22405949  0.00127456  0.28496462] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02707997 -0.02895574  0.00697385 -0.00731605] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02650086  0.16606551  0.00682753 -0.29779051] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02982217 -0.0291531   0.00087172 -0.00296214] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02923911 -0.22428754  0.00081248  0.2899957 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02475336 -0.02917719  0.00661239 -0.00243087] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02416981 -0.22439334  0.00656378  0.29233099] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01968195 -0.02936559  0.0124104   0.0017254 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01909463 -0.2246633   0.0124449   0.29829797] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01460137 -0.41996042  0.01841086  0.59487967] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00620216 -0.61533515  0.03030846  0.89330459] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00610454 -0.42063709  0.04817455  0.61030095] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01451729 -0.61639815  0.06038057  0.9177597 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02684525 -0.81228217  0.07873576  1.22879137] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04309089 -1.00832403  0.10331159  1.54506755] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06325737 -1.20452424  0.13421294  1.86811828] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08734786 -1.40083518  0.1715753   2.1992768 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01991451 -0.02066806  0.01202627 -0.00692112] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01950115  0.17427937  0.01188785 -0.29578549] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02298673  0.36922985  0.00597214 -0.58469559] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03037133  0.56426763 -0.00572177 -0.87549126] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04165668  0.36922393 -0.0232316  -0.58461268] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04904116  0.17443498 -0.03492385 -0.29933757] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05252986 -0.02017221 -0.0409106  -0.01787031] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05212642 -0.21468431 -0.04126801  0.26162934] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04783273 -0.40919362 -0.03603542  0.54101554] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03964886 -0.21358418 -0.02521511  0.23719975] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03537718 -0.40833699 -0.02047112  0.52182364] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02721044 -0.6031649  -0.01003464  0.80798619] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01514714 -0.7981479   0.00612508  1.09749583] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00081582 -0.60310713  0.028075    0.80674092] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01287796 -0.40838101  0.04420981  0.52301981] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02104558 -0.21390826  0.05467021  0.24458917] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02532375 -0.40976672  0.05956199  0.55400297] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03351908 -0.2155295   0.07064205  0.28066466] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03782967 -0.4115843   0.07625535  0.59476488] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04606136 -0.60768606  0.08815064  0.91046006] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05821508 -0.4138605   0.10635985  0.64673333] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06649229 -0.22036867  0.11929451  0.38934654] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07089966 -0.02712406  0.12708144  0.13652922] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07144214 -0.2238155   0.12981203  0.46644867] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07591845 -0.42050966  0.139141    0.79706514] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08432865 -0.22754317  0.1550823   0.55118838] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08887951 -0.42446419  0.16610607  0.88843664] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09736879 -0.23193866  0.1838748   0.65223413] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10200757 -0.03978884  0.19691949  0.42262321] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10280334 -0.2370758   0.20537195  0.77035536] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.0397274  -0.00954915  0.02782262  0.0088786 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03953642 -0.20505884  0.02800019  0.31020843] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03543524 -0.01034678  0.03420436  0.02648581] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03522831 -0.20594213  0.03473408  0.32976134] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03110947 -0.40154087  0.0413293   0.63319226] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02307865 -0.2070191   0.05399315  0.35380613] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01893827 -0.40286557  0.06106927  0.66301399] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01088096 -0.59878159  0.07432955  0.97428341] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00109468 -0.40473124  0.09381522  0.70584437] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0091893  -0.60101919  0.10793211  1.02652278] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02120969 -0.79739957  0.12846256  1.3510497 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03715768 -0.99387957  0.15548355  1.68100541] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05703527 -0.80086348  0.18910366  1.44050208] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.02172276  0.00291678 -0.02351433 -0.01077212] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0217811   0.19836793 -0.02372977 -0.31078037] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02574846  0.00359196 -0.02994538 -0.02567457] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0258203   0.19913027 -0.03045887 -0.32765316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0298029   0.39467231 -0.03701193 -0.6297837 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03769635  0.59029067 -0.0496076  -0.9338893 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04950216  0.78604548 -0.06828539 -1.24173895] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06522307  0.59186331 -0.09312017 -0.97120482] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07706034  0.39810615 -0.11254427 -0.70916605] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08502246  0.59459211 -0.12672759 -1.03504863] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0969143   0.40136198 -0.14742856 -0.78468602] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10494154  0.20854007 -0.16312228 -0.54177788] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10911234  0.01604122 -0.17395784 -0.30460828] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10943317 -0.17623015 -0.18005    -0.07144006] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10590857  0.02095494 -0.1814788  -0.41508554] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10632766 -0.17119346 -0.18978052 -0.18465855] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1029038   0.02606543 -0.19347369 -0.53069555] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.1034251  -0.16588418 -0.2040876  -0.30467022] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.03493919  0.01624757 -0.02911727  0.01456261] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03526415 -0.17844497 -0.02882602  0.29791835] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03169525 -0.3731444  -0.02286765  0.5813725 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02423236 -0.17770963 -0.0112402   0.28157447] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02067817  0.01757084 -0.00560871 -0.01463227] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02102958 -0.17747023 -0.00590136  0.27627578] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01748018  0.01773542 -0.00037584 -0.01826258] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01783489 -0.17738114 -0.0007411   0.27430174] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01428726 -0.37249251  0.00474494  0.56675083] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00683741 -0.17743744  0.01607995  0.27556653] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00328866  0.01745144  0.02159129 -0.01200171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00363769 -0.1779734   0.02135125  0.28741457] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 7.82256600e-05 -3.73393224e-01  2.70995428e-02  5.86754290e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00738964 -0.56888403  0.03883463  0.88784912] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01876732 -0.76451093  0.05659161  1.19248274] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03405754 -0.96031845  0.08044127  1.50235288] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05326391 -1.15631953  0.11048832  1.81902804] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0763903  -0.96258568  0.14686888  1.56261675] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09564201 -1.15912719  0.17812122  1.89727949] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01695397  0.0057248   0.01836302 -0.00474438] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01706846  0.20057865  0.01826814 -0.29157745] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02108004  0.39543542  0.01243659 -0.57844331] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02898874  0.59038089  0.00086772 -0.86718263] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04079636  0.39524714 -0.01647593 -0.57422701] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0487013   0.20036    -0.02796047 -0.28677969] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0527085   0.00564773 -0.03369607 -0.00304475] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05282146 -0.18897517 -0.03375696  0.27881912] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04904196 -0.38359971 -0.02818058  0.56066694] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04136996 -0.18809382 -0.01696724  0.25924055] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03760809  0.0072662  -0.01178243 -0.03874542] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03775341 -0.18768483 -0.01255734  0.25019687] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03399971  0.00761417 -0.0075534  -0.0464203 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.034152    0.20284361 -0.00848181 -0.34147679] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03820887  0.00784336 -0.01531134 -0.05148055] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03836574  0.20318147 -0.01634095 -0.34895474] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04242937  0.0082957  -0.02332005 -0.06146913] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04259528 -0.18648426 -0.02454943  0.2237659 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03886559  0.0089798  -0.02007411 -0.07655866] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03904519 -0.1858487  -0.02160529  0.20972383] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03532822  0.0095754  -0.01741081 -0.08969538] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03551972 -0.18529272 -0.01920472  0.19744398] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03181387  0.01009859 -0.01525584 -0.10123477] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03201584 -0.18480144 -0.01728053  0.18659621] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02831981  0.01056343 -0.01354861 -0.11148757] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02853108 -0.18436178 -0.01577836  0.17689024] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02484385  0.01098238 -0.01224056 -0.12072817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02506349  0.20627754 -0.01465512 -0.4172476 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02918904  0.01136631 -0.02300007 -0.1292206 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02941637 -0.18341874 -0.02558448  0.15611827] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.025748   -0.37816521 -0.02246212  0.44062136] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01818469 -0.1827327  -0.01364969  0.14094324] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01453004  0.01258206 -0.01083083 -0.15601449] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01478168  0.20785739 -0.01395112 -0.45209451] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01893883  0.01293549 -0.02299301 -0.16384161] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01919754 -0.18184989 -0.02626984  0.12149989] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01556054 -0.37658582 -0.02383984  0.40578054] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00802882 -0.57136173 -0.01572423  0.69085315] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00339841 -0.37602516 -0.00190717  0.39326178] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01091892 -0.1808762   0.00595807  0.09997817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01453644  0.01415986  0.00795763 -0.19081907] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01425324  0.20916707  0.00414125 -0.4809811 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0100699   0.40423032 -0.00547837 -0.77235594] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0019853   0.59942722 -0.02092549 -1.06675753] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01000325  0.40458829 -0.04226064 -0.78071478] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01809501  0.21007198 -0.05787494 -0.50162175] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02229645  0.01581155 -0.06790737 -0.2277247 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02261269 -0.17827755 -0.07246186  0.0427885 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01904713  0.01780465 -0.07160609 -0.27184809] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01940323  0.21387144 -0.07704306 -0.58622882] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02368066  0.0199083  -0.08876763 -0.31877541] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02407882  0.21617498 -0.09514314 -0.63807972] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02840232  0.41248583 -0.10790474 -0.95914412] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03665204  0.60887998 -0.12708762 -1.28368445] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04882964  0.41558453 -0.15276131 -1.03334286] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05714133  0.22278761 -0.17342816 -0.7922526 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06159708  0.03041677 -0.18927322 -0.55875765] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06220542 -0.16161502 -0.20044837 -0.33116882] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05897312 -0.35340359 -0.20707175 -0.10778408] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05190504 -0.54504959 -0.20922743  0.1130988 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04100405 -0.73665548 -0.20696545  0.33317468] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02627094 -0.92832333 -0.20030196  0.55412651] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00770448 -1.12015283 -0.18921943  0.77762196] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01469858 -0.92300283 -0.17366699  0.43187611] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03315864 -1.11529525 -0.16502947  0.66517416] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05546454 -1.30778397 -0.15172598  0.90168564] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08162022 -1.50056084 -0.13369227  1.14309244] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11163144 -1.30396981 -0.11083042  0.81164877] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13771083 -1.49741321 -0.09459745  1.06751611] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1676591  -1.3011758  -0.07324712  0.74670587] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.19368262 -1.10512377 -0.05831301  0.43190095] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.21578509 -0.90922671 -0.04967499  0.12142051] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.23396962 -1.10360307 -0.04724658  0.39802695] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.25604169 -1.29802401 -0.03928604  0.6754476 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.28200217 -1.49257864 -0.02577709  0.95550725] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.31185374 -1.68734456 -0.00666694  1.23998146] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.34560063 -1.88238027  0.01813269  1.53056845] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.38324824 -1.68748156  0.04874406  1.24359916] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.41699787 -1.49301783  0.07361604  0.9665751 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.44685822 -1.6890471   0.09294754  1.28144618] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.48063917 -1.88522204  0.11857647  1.60172566] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.51834361 -1.69168684  0.15061098  1.34824119] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.55217734 -1.4987441   0.1775758   1.10621578] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.58215222 -1.69569898  0.19970012  1.44893571] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.03826871  0.04526404 -0.01870778 -0.04392836] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03917399 -0.14958472 -0.01958634  0.24279388] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0361823   0.04581145 -0.01473047 -0.05600217] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03709853  0.24114148 -0.01585051 -0.35329603] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04192136  0.43648519 -0.02291643 -0.65093464] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05065106  0.6319187  -0.03593512 -0.95074477] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06328944  0.43729837 -0.05495002 -0.66956539] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0720354   0.24298176 -0.06834133 -0.39467761] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07689504  0.04889272 -0.07623488 -0.12430037] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07787289  0.24501937 -0.07872088 -0.44002723] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08277328  0.44116203 -0.08752143 -0.75645011] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09159652  0.63737427 -0.10265043 -1.07534042] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10434401  0.44374732 -0.12415724 -0.81655511] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11321895  0.64033048 -0.14048834 -1.14556853] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12602556  0.44729459 -0.16339971 -0.90003531] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13497145  0.25471887 -0.18140042 -0.66284542] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.14006583  0.06252215 -0.19465733 -0.4323204 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.14131627 -0.129388   -0.20330374 -0.20676032] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13872851  0.06797282 -0.20743894 -0.55606494] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.04294258 -0.04248098  0.02999982  0.04363158] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04209296 -0.23801998  0.03087245  0.34562674] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03733256 -0.43356718  0.03778498  0.64788282] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02866122 -0.62919461  0.05074264  0.95222066] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01607732 -0.43479086  0.06978705  0.6759023 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00738151 -0.63080959  0.0833051   0.98971545] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00523469 -0.82694186  0.10309941  1.30735699] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02177352 -0.63326654  0.12924655  1.0486442 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03443885 -0.82984429  0.15021943  1.3789442 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05103574 -1.02648846  0.17779832  1.71458722] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.04334405  0.04413397 -0.04413821  0.03821231] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04422673 -0.15032815 -0.04337396  0.31664899] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04122017  0.0453839  -0.03704098  0.01060907] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04212785  0.24101695 -0.0368288  -0.29352678] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04694818  0.4366441  -0.04269934 -0.59759375] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05568107  0.24214484 -0.05465121 -0.31866056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06052396  0.43800084 -0.06102443 -0.6280654 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06928398  0.24378128 -0.07358573 -0.3552084 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07415961  0.43986815 -0.0806899  -0.67015741] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08295697  0.63601374 -0.09409305 -0.9871156 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09567724  0.44226909 -0.11383536 -0.72540673] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10452263  0.63876572 -0.1283435  -1.05163936] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11729794  0.83533444 -0.14937628 -1.38169684] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13400463  0.64235858 -0.17701022 -1.13920999] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1468518   0.44993613 -0.19979442 -0.90685601] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.03411482  0.00058388  0.00489765 -0.0162401 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03410314  0.19563526  0.00457285 -0.30737375] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03019044  0.00044844 -0.00157462 -0.01325218] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03018147 -0.19465089 -0.00183967  0.27893352] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03407449  0.00049726  0.003739   -0.01432907] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03406454 -0.19467812  0.00345242  0.27953121] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0379581  -0.38984915  0.00904305  0.57330102] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04575509 -0.58509671  0.02050907  0.86881902] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05745702 -0.3902597   0.03788545  0.58265421] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06526222 -0.58589139  0.04953853  0.88702682] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07698004 -0.3914756   0.06727907  0.61031927] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08480956 -0.19735538  0.07948545  0.33956284] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08875666 -0.0034491   0.08627671  0.07296586] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08882565 -0.19969518  0.08773603  0.39157397] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09281955 -0.39594559  0.09556751  0.71057862] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10073846 -0.20226792  0.10977908  0.44944187] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10478382 -0.39875748  0.11876792  0.77461227] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11275897 -0.5952955   0.13426016  1.10217823] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12466488 -0.40217053  0.15630373  0.8544518 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13270829 -0.20948465  0.17339276  0.61471281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13689798 -0.40655096  0.18568702  0.95660221] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.145029   -0.60361883  0.20481906  1.30140131] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[0.00814693 0.00055268 0.03687971 0.04135797] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00815799  0.19512691  0.03770687 -0.23946474] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01206052  0.38969048  0.03291757 -0.52001945] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01985433  0.19412097  0.02251718 -0.21714808] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02373675  0.38891391  0.01817422 -0.50264393] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03151503  0.19354057  0.00812134 -0.20428939] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03538584 -0.00169657  0.00403556  0.09094431] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03535191  0.1933673   0.00585444 -0.20046267] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03921926 -0.00183789  0.00184519  0.0940613 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0391825   0.19325757  0.00372642 -0.1980389 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 4.30476513e-02  3.88326021e-01 -2.34362826e-04 -4.89543987e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05081417  0.19320738 -0.01002524 -0.19693493] STATE\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05467832  0.38847128 -0.01396394 -0.49276344] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06244774  0.19354904 -0.02381921 -0.20451385] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06631873  0.38900338 -0.02790949 -0.50461438] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07409879  0.19428565 -0.03800177 -0.22085585] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07798451  0.3899296  -0.04241889 -0.52527974] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0857831   0.58562201 -0.05292449 -0.83102171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09749554  0.39126181 -0.06954492 -0.55544216] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10532077  0.58728775 -0.08065376 -0.86920003] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11706653  0.39335021 -0.09803776 -0.60292588] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12493353  0.58969681 -0.11009628 -0.92480851] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.13672747  0.3962203  -0.12859245 -0.66865379] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.14465188  0.59287362 -0.14196553 -0.99890031] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.15650935  0.39990535 -0.16194353 -0.75395765] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.16450746  0.20734239 -0.17702269 -0.51629609] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1686543   0.40445733 -0.18734861 -0.85912111] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.17674345  0.60156925 -0.20453103 -1.20437408] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.02878804  0.01578398  0.01605786  0.04532157] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02847236 -0.17956451  0.01696429  0.34302733] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03206365 -0.37492365  0.02382484  0.64101113] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03956212 -0.18014179  0.03664506  0.3559251 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04316496 -0.37576507  0.04376356  0.65993427] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05068026 -0.1812786   0.05696225  0.38134627] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05430583  0.0129902   0.06458917  0.10715354] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05404603 -0.18299502  0.06673225  0.41949439] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05770593  0.01112102  0.07512213  0.14857314] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05748351  0.20509133  0.0780936  -0.11949659] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05338168  0.00894239  0.07570366  0.19676628] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05320283 -0.18717618  0.07963899  0.51233719] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05694635 -0.38332424  0.08988573  0.82901586] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06461284 -0.18953851  0.10646605  0.56590158] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06840361  0.00394141  0.11778408  0.30856689] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06832478 -0.19264461  0.12395542  0.63595135] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07217767 -0.38925741  0.13667445  0.96495738] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07996282 -0.58592424  0.15597359  1.29726393] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09168131 -0.78264458  0.18191887  1.63443292] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01537003  0.01641116  0.02797659 -0.00679225] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01504181 -0.17910061  0.02784074  0.29458469] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01862382 -0.37460819  0.03373243  0.59591654] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02611599 -0.17997416  0.04565077  0.31404703] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02971547 -0.37571568  0.05193171  0.62077009] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03722978 -0.57152295  0.06434711  0.92934609] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04866024 -0.37732586  0.08293403  0.6575583 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05620676 -0.18345026  0.0960852   0.39209911] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05987576  0.01018613  0.10392718  0.13118987] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05967204 -0.18625914  0.10655098  0.4547675 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06339722  0.0072076   0.11564633  0.19747991] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06325307 -0.18936224  0.11959592  0.52428945] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06704032  0.00389154  0.13008171  0.27155686] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06696249  0.19694056  0.13551285  0.02256611] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06302368  0.38988516  0.13596417 -0.22447851] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05522597  0.19310842  0.1314746   0.10781297] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0513638  -0.00362853  0.13363086  0.4389138 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05143637  0.18937405  0.14240914  0.1911616 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04764889  0.382202    0.14623237 -0.05342539] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04000485  0.1853189   0.14516386  0.28158653] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03629847  0.37810413  0.15079559  0.03797864] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02873639  0.57077815  0.15155516 -0.20358657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01732083  0.76344447  0.14748343 -0.44488656] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00205194  0.95620564  0.1385857  -0.68768628] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01707217  1.14915972  0.12483198 -0.93372854] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04005537  0.95259491  0.10615741 -0.60457029] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05910727  1.14608465  0.094066   -0.86201937] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08202896  1.33980854  0.07682561 -1.1237059 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10882513  1.14376817  0.05435149 -0.8079482 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13170049  1.33810481  0.03819253 -1.08305122] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.15846259  1.14250022  0.01653151 -0.77863243] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 1.81312593e-01  1.33739101e+00  9.58857533e-04 -1.06606863e+00] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.20806041  1.53250026 -0.02036251 -1.35845046] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.23871042  1.72787157 -0.04753152 -1.65743285] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.27326785  1.53333506 -0.08068018 -1.37992665] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.30393455  1.33930766 -0.10827871 -1.11352652] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.3307207   1.53567185 -0.13054924 -1.43811959] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.36143414  1.73213858 -0.15931164 -1.7685845 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.39607691  1.53913423 -0.19468333 -1.52938383] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.03923046 -0.01310482  0.01232768  0.03708269] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03896837 -0.20840136  0.01306933  0.33362948] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03480034 -0.01346784  0.01974192  0.04509645] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03453098  0.18136554  0.02064385 -0.24129293] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03815829  0.37618661  0.01581799 -0.52739341] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04568202  0.57108247  0.00527012 -0.81505033] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05710367  0.37588875 -0.01103089 -0.52071441] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06462145  0.1809238  -0.02144517 -0.23152779] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06823992  0.37634552 -0.02607573 -0.53089736] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07576683  0.18159988 -0.03669368 -0.24654361] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07939883  0.37722616 -0.04162455 -0.55057102] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08694336  0.57290728 -0.05263597 -0.85607254] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0984015   0.37854056 -0.06975742 -0.58039408] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10597231  0.5745671  -0.0813653  -0.89421082] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11746365  0.77069261 -0.09924952 -1.21132082] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13287751  0.57698203 -0.12347593 -0.95131636] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.14441715  0.38371873 -0.14250226 -0.69983826] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.15209152  0.19082972 -0.15649903 -0.45519153] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.15590812  0.38777805 -0.16560286 -0.79282679] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.16366368  0.19526959 -0.18145939 -0.55647963] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.16756907  0.0030967  -0.19258899 -0.32600989] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.167631    0.20036424 -0.19910918 -0.67271236] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01027911  0.04934367  0.00236497 -0.02882503] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00929223 -0.14581212  0.00178847  0.26460313] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01220848  0.04928426  0.00708053 -0.02751516] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01122279  0.24430396  0.00653023 -0.31795572] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00633671  0.04908961  0.00017111 -0.02322057] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-5.35491838e-03  2.44209109e-01 -2.93298514e-04 -3.15849503e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-4.70736193e-04  4.39335237e-01 -6.61028857e-03 -6.08624912e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00831597  0.63454897 -0.01878279 -0.90338253] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02100695  0.43968641 -0.03685044 -0.61666205] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02980068  0.24509812 -0.04918368 -0.33580937] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03470264  0.05070937 -0.05589987 -0.05903301] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03571683  0.24658641 -0.05708053 -0.36881564] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04064855  0.05231999 -0.06445684 -0.09466316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04169495  0.24830373 -0.0663501  -0.40696549] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04666103  0.05418228 -0.07448941 -0.13591663] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04774467  0.25028775 -0.07720774 -0.45113877] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05275043  0.44641189 -0.08623052 -0.76712511] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06167867  0.25257625 -0.10157302 -0.50277232] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06673019  0.05902161 -0.11162847 -0.24374791] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06791062  0.25554631 -0.11650343 -0.56945086] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07302155  0.45209292 -0.12789244 -0.89644682] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08206341  0.25891514 -0.14582138 -0.64654466] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08724171  0.06609359 -0.15875227 -0.4031017 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08856358 -0.12646263 -0.16681431 -0.16437675] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08603433 -0.31885262 -0.17010184  0.07138604] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07965728 -0.12175207 -0.16867412 -0.26976693] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07722224 -0.31411585 -0.17406946 -0.03467161] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07093992 -0.50636977 -0.17476289  0.19843291] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06081252 -0.69861749 -0.17079423  0.43129281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04684018 -0.89096163 -0.16216838  0.66564141] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02902094 -1.08350094 -0.14885555  0.90319225] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00735092 -0.88671032 -0.1307917   0.56766856] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01038328 -0.69001974 -0.11943833  0.23680938] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02418368 -0.49341188 -0.11470215 -0.09103281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03405191 -0.29684865 -0.1165228  -0.41758875] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03998889 -0.10028485 -0.12487458 -0.74461529] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04199458 -0.29348256 -0.13976688 -0.49369308] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04786424 -0.09669473 -0.14964074 -0.82695417] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04979813 -0.28948844 -0.16617983 -0.58482577] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0555879  -0.48194064 -0.17787634 -0.34875664] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06522671 -0.67414575 -0.18485148 -0.1170183 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07870963 -0.47692271 -0.18719184 -0.46185026] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08824808 -0.2797169  -0.19642885 -0.80720495] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.00024458 -0.00685334 -0.03095941 -0.04250862] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 1.07511778e-04 -2.01517983e-01 -3.18095801e-02  2.40247796e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00392285 -0.39617143 -0.02700462  0.52272982] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01184628 -0.20068001 -0.01655003  0.22166121] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01585988 -0.00532546 -0.0121168  -0.07619598] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01596639 -0.20027163 -0.01364072  0.21263956] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01997182 -0.00495733 -0.00938793 -0.08431487] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02007096 -0.19994346 -0.01107423  0.20539142] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02406983 -0.39490531 -0.0069664   0.49456051] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03196794 -0.19968581  0.00292481  0.19969026] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03596166 -0.00460581  0.00691861 -0.09206859] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03605377  0.19041629  0.00507724 -0.38256069] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03224545 -0.00477738 -0.00257397 -0.08828126] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03234099  0.19038137 -0.0043396  -0.38177516] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02853337  0.38556467 -0.0119751  -0.67582319] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02082207  0.19061115 -0.02549156 -0.38693448] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01700985  0.38608552 -0.03323025 -0.68754447] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00928814  0.58165257 -0.04698114 -0.99050099] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00234491  0.38718984 -0.06679116 -0.71293613] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01008871  0.1930531  -0.08104989 -0.44200233] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01394977 -0.000834   -0.08988993 -0.17592899] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01393309  0.19545172 -0.09340851 -0.49556092] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01784212  0.00176256 -0.10331973 -0.23371591] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01787738  0.19819736 -0.10799405 -0.55711879] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02184132  0.39465645 -0.11913642 -0.88177923] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02973445  0.20133663 -0.13677201 -0.62879829] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03376118  0.00836172 -0.14934797 -0.38212485] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03392842  0.20525368 -0.15699047 -0.71792044] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03803349  0.01261233 -0.17134888 -0.47847783] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03828574  0.20968652 -0.18091844 -0.8198905 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04247947  0.40676199 -0.19731625 -1.16357929] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.03255976  0.03819858  0.02539472 -0.01276565] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03332374  0.2329473   0.02513941 -0.29732931] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.03798268 0.03747617 0.01919282 0.00317495] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03873221 -0.1579157   0.01925632  0.30185112] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.03557389 0.03692659 0.02529334 0.01530289] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03631242 -0.1585488   0.0255994   0.31585763] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.03314145 0.03619933 0.03191655 0.03135654] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03386543  0.23084939  0.03254368 -0.25108797] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03848242  0.42549187  0.02752192 -0.53333094] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04699226  0.22999388  0.0168553  -0.23210459] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05159214  0.42487099  0.01221321 -0.5194235 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06008956  0.61981889  0.00182474 -0.80823294] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[ 0.07248593  0.42467198 -0.01433992 -0.51497659] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08097937  0.61999291 -0.02463945 -0.81214365] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09337923  0.81544355 -0.04088232 -1.11247389] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1096881   1.0110779  -0.0631318  -1.41769627] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.12990966  0.81679182 -0.09148572 -1.14539615] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1462455   1.01298175 -0.11439365 -1.46530968] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.16650513  1.20930374 -0.14369984 -1.7914264 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.19069121  1.01605601 -0.17952837 -1.54664504] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.00378144  0.04731014 -0.02739856  0.00753969] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00283524  0.24281409 -0.02724776 -0.29366034] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00202104  0.43831372 -0.03312097 -0.59481077] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01078732  0.6338832  -0.04501718 -0.89773989] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02346498  0.43939942 -0.06297198 -0.61954036] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03225297  0.63534176 -0.07536279 -0.93137282] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0449598   0.83139539 -0.09399025 -1.24675424] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06158771  1.0275886  -0.11892533 -1.56733758] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08213948  0.83407115 -0.15027208 -1.3139934 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09882091  1.03074123 -0.17655195 -1.64968782] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.00505949 -0.00695531 -0.00792929 -0.0168906 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0051986   0.18827945 -0.0082671  -0.31206472] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00143301 -0.00672376 -0.0145084  -0.02200041] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00156748 -0.20163467 -0.01494841  0.26606987] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00560018 -0.39654012 -0.00962701  0.55400071] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01353098 -0.20128431  0.00145301  0.25830028] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01755667 -0.00618314  0.00661901 -0.03392399] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01768033  0.18884327  0.00594053 -0.32451122] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01390346 -0.00636276 -0.00054969 -0.02996084] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01403072 -0.20147682 -0.00114891  0.2625486 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01806025 -0.39658235  0.00410206  0.55486893] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0259919  -0.20151824  0.01519944  0.26348122] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03002227 -0.0066165   0.02046907 -0.02436915] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0301546  -0.20202593  0.01998168  0.27470107] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03419512 -0.39742719  0.0254757   0.57361867] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04214366 -0.20267151  0.03694808  0.28906899] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04619709 -0.00809538  0.04272946  0.00826426] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.046359    0.18638855  0.04289474 -0.27063672] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04263123  0.38087298  0.03748201 -0.54948783] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03501377  0.57544893  0.02649225 -0.83012964] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02350479  0.37997507  0.00988966 -0.52923391] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01590529  0.18471539 -0.00069502 -0.23345121] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01221098 -0.01039662 -0.00536404  0.0590124 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01241891 -0.20544125 -0.0041838   0.34999812] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01652774 -0.40050345  0.00281617  0.64135883] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02453781 -0.20542087  0.01564334  0.34956407] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02864622 -0.01052486  0.02263462  0.06185477] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02885672 -0.2059639   0.02387172  0.36159225] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.032976   -0.01118925  0.03110356  0.07653114] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03319978  0.18347332  0.03263419 -0.20617851] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02953032 -0.01209973  0.02851062  0.09661762] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02977231 -0.20761846  0.03044297  0.39815751] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03392468 -0.01294133  0.03840612  0.11522596] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03418351  0.18160987  0.04071064 -0.16509702] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03055131  0.37612613  0.0374087  -0.44466402] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02302879  0.57069938  0.02851542 -0.72532412] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0116148   0.37519496  0.01400894 -0.42380432] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0041109   0.57011569  0.00553285 -0.71203822] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00729141  0.76516059 -0.00870792 -1.00297445] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02259463  0.96039779 -0.0287674  -1.29837924] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04180258  0.76565266 -0.05473499 -1.01483886] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05711563  0.57130172 -0.07503177 -0.73983308] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06854167  0.76737505 -0.08982843 -1.05515424] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08388917  0.57355114 -0.11093151 -0.79196464] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09536019  0.77000708 -0.12677081 -1.11738426] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11076033  0.57675582 -0.14911849 -0.8670044 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12229545  0.773558   -0.16645858 -1.20261016] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13776661  0.58093261 -0.19051078 -0.96637899] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.00112517 -0.00298472  0.03519886  0.00153722] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00106548 -0.19859333  0.0352296   0.30511472] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00290639 -0.39419915  0.0413319   0.60869676] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01079037 -0.58987381  0.05350583  0.91410619] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02258785 -0.39551482  0.07178796  0.63870814] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03049815 -0.59156044  0.08456212  0.95310668] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04232936 -0.39767181  0.10362425  0.6881446 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05028279 -0.2041291   0.11738714  0.42979887] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05436537 -0.01084811  0.12598312  0.17630523] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05458234 -0.20752677  0.12950923  0.50592623] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05873287 -0.01444486  0.13962775  0.2566982 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05902177  0.17843613  0.14476172  0.01110974] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05545305 -0.01843325  0.14498391  0.34573646] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05582171 -0.21528795  0.15189864  0.68039659] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06012747 -0.02256563  0.16550657  0.4391284 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06057878 -0.21959577  0.17428914  0.77907207] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0649707  -0.02724361  0.18987058  0.54589699] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06551557 -0.22445474  0.20078852  0.89188462] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01153812  0.02716905  0.02174556 -0.0160103 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01099474 -0.1682579   0.02142535  0.28345351] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01435989  0.02655201  0.02709442 -0.00239572] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01382885 -0.16894783  0.02704651  0.29871119] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01720781  0.02577836  0.03302073  0.01467943] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01669224 -0.16980121  0.03331432  0.31759519] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02008827 -0.36538143  0.03966622  0.62059533] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0273959  -0.56103424  0.05207813  0.92550251] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03861658 -0.7568194   0.07058818  1.23408615] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05375297 -0.5626723   0.0952699   0.96432642] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.06500642 -0.75893606  0.11455643  1.28535405] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08018514 -0.95531459  0.14026351  1.6115971 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09929143 -1.15178661  0.17249545  1.94451326] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.01551854 -0.03058591  0.00426726 -0.04040704] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01490683 -0.2257688   0.00345912  0.25361919] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01039145 -0.03069641  0.0085315  -0.03797066] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00977752 -0.22593966  0.00777209  0.25739178] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00525873 -0.4211717   0.01291993  0.55251598] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00316471 -0.22623355  0.02397025  0.26393151] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00768938 -0.0314618   0.02924888 -0.02109561] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00831861 -0.22699073  0.02882696  0.28067028] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01285843 -0.03229159  0.03444037 -0.00278313] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01350426 -0.22789011  0.03438471  0.30056412] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01806206 -0.03327469  0.04039599  0.01892067] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01872756 -0.22895199  0.0407744   0.32407035] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02330659 -0.03443363  0.04725581  0.04451967] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02399527 -0.23020023  0.0481462   0.3517298 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02859927 -0.03579484  0.0551808   0.07460915] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02931517 -0.23166266  0.05667298  0.38417862] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03394842 -0.0373892   0.06435656  0.10988921] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03469621 -0.2333715   0.06655434  0.42216186] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03936364 -0.42937013  0.07499758  0.73506145] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04795104 -0.23535992  0.08969881  0.46689281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05265824 -0.0416122   0.09903666  0.2037757 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05349048  0.15196422  0.10311218 -0.05609691] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0504512   0.34546816  0.10199024 -0.31454988] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04354183  0.14905245  0.09569924  0.00847585] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04056078  0.34268095  0.09586876 -0.25254528] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03370717  0.1463301   0.09081785  0.06877082] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03078056 -0.0499686   0.09219327  0.38866918] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03177994  0.14373209  0.09996665  0.1264185 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02890529  0.3372904   0.10249502 -0.13312819] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02215949  0.14086102  0.09983246  0.19005038] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01934227 -0.05553692  0.10363346  0.51248247] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.020453    0.13798437  0.11388311  0.25417168] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01769332 -0.05856387  0.11896655  0.58049358] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01886459 -0.2551343   0.13057642  0.90815885] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02396728 -0.06199861  0.1487396   0.65920203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02520725  0.13077458  0.16192364  0.41680333] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02259176  0.32327592  0.1702597   0.17922629] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01612624  0.12617846  0.17384423  0.52041253] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01360267  0.31848188  0.18425248  0.2871584 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00723303  0.51056375  0.18999565  0.05777676] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.00297824 0.31329751 0.19115118 0.40387469] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.00924419 0.11605166 0.19922868 0.7502114 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01817993 -0.02233036  0.03077661 -0.02230106] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01862654 -0.21787985  0.03033059  0.27993116] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02298413 -0.41342104  0.03592921  0.58202387] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03125255 -0.21882043  0.04756969  0.30087225] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03562896 -0.02440763  0.05358713  0.02356316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03611712  0.16990651  0.0540584  -0.25174259] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03271899 -0.02594403  0.04902354  0.05748951] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03323787 -0.22173336  0.05017334  0.36522782] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03767253 -0.4175311   0.05747789  0.67329961] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04602316 -0.61340285  0.07094388  0.98351124] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05829121 -0.80939988  0.09061411  1.29760726] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07447921 -0.61553787  0.11656625  1.03461137] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08678997 -0.4221425   0.13725848  0.78067985] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09523282 -0.22914743  0.15287208  0.53413738] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09981577 -0.03646843  0.16355483  0.293259  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10054514  0.15598982  0.16942001  0.05629527] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09742534  0.34832869  0.17054591 -0.17851083] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09045876  0.54065183  0.16697569 -0.4129125 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07964573  0.73306212  0.15871744 -0.64865079] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06498449  0.53612681  0.14574443 -0.31049354] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05426195  0.72890419  0.13953456 -0.55389657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03968387  0.53212713  0.12845663 -0.22070958] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02904132  0.33542541  0.12404243  0.10957448] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02233282  0.52857171  0.12623392 -0.14154391] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01176138  0.33188912  0.12340305  0.18814677] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0051236   0.52504932  0.12716598 -0.06320171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00537739  0.71814037  0.12590195 -0.31321344] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.0197402  0.52147074 0.11963768 0.01637345] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.03016961 0.3248543  0.11996515 0.3442796 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.0366667  0.12824822 0.12685074 0.67225437] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.03923166 0.32139998 0.14029583 0.42204881] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.04565966 0.12459816 0.1487368  0.75546287] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04815162 -0.0722268   0.16384606  1.09100817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.04670709 0.1204016  0.18566622 0.85389081] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04911512 -0.07669952  0.20274404  1.19873407] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.01038408 -0.00047283  0.02789774  0.0067821 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01039353  0.19423815  0.02803338 -0.27696986] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00650877  0.38894918  0.02249398 -0.56068091] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00127021  0.58374833  0.01128036 -0.84619304] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01294518  0.38847432 -0.0056435  -0.54998429] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02071467  0.58367508 -0.01664318 -0.84443996] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03238817  0.38878414 -0.03353198 -0.55703691] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04016385  0.5843604  -0.04467272 -0.86009294] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05185106  0.38987442 -0.06187458 -0.58178443] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.05964855  0.58580624 -0.07351027 -0.89329928] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07136467  0.39175415 -0.09137625 -0.62459981] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07919976  0.19801869 -0.10386825 -0.36203623] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08316013  0.00451462 -0.11110898 -0.10382684] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08325042 -0.18885416 -0.11318551  0.15183892] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07947334  0.00769122 -0.11014873 -0.17429824] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07962716 -0.18569606 -0.1136347   0.08170643] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07591324 -0.37902118 -0.11200057  0.33648709] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06833282 -0.18249834 -0.10527083  0.01069045] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06468285  0.01396348 -0.10505702 -0.31326341] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06496212  0.21041294 -0.11132229 -0.63714291] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06917038  0.40689683 -0.12406515 -0.9627056 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07730832  0.60344773 -0.14331926 -1.29164751] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08937727  0.41040882 -0.16915221 -1.04704965] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09758545  0.60732226 -0.1900932  -1.38769817] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01453965  0.01616579  0.03921719 -0.04614352] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01421633 -0.17949593  0.03829432  0.25865029] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01780625  0.015059    0.04346733 -0.02171237] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01750507 -0.18065849  0.04303308  0.28436194] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02111824  0.01382412  0.04872032  0.00555592] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02084176  0.20821472  0.04883144 -0.27136588] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01667747  0.01243121  0.04340412  0.03631006] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01642884 -0.18328542  0.04413032  0.34236532] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02009455  0.01118181  0.05097763  0.06391857] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01987091  0.20553721  0.052256   -0.21225487] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01576017  0.3998746   0.0480109  -0.48800677] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00776268  0.2041093   0.03825077 -0.18058741] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00368049  0.00846146  0.03463902  0.12391266] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00351126  0.20307048  0.03711727 -0.15764391] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00055015  0.3976419   0.0339644  -0.43839004] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00850299  0.59226707  0.02519659 -0.72017582] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02034833  0.39680574  0.01079308 -0.41966988] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02828444  0.59177311  0.00239968 -0.70893079] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0401199   0.78686174 -0.01177893 -1.00085739] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05585714  0.98213911 -0.03179608 -1.29721605] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07549992  0.78743506 -0.0577404  -1.01465407] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09124862  0.59312874 -0.07803348 -0.74064655] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1031112   0.39916589 -0.09284642 -0.47350621] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11109451  0.2054694  -0.10231654 -0.21147053] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1152039   0.4018941  -0.10654595 -0.53459532] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.12324178  0.20841909 -0.11723786 -0.27729376] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.12741017  0.40500146 -0.12278373 -0.60453219] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1355102   0.60160721 -0.13487438 -0.93322839] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.14754234  0.79826565 -0.15353894 -1.26507   ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.16350765  0.60540251 -0.17884034 -1.02414184] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1756157   0.8023968  -0.19932318 -1.3672196 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01774593  0.02587966 -0.00127984 -0.04337029] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01826353  0.22101994 -0.00214725 -0.33645674] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02268393  0.02592861 -0.00887638 -0.04445172] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0232025   0.22117671 -0.00976542 -0.33992195] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02762603  0.02619506 -0.01656386 -0.05033441] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02814993  0.22155056 -0.01757055 -0.34819701] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03258095  0.41691795 -0.02453449 -0.64636834] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0409193   0.2221463  -0.03746185 -0.3615112 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04536223  0.41778015 -0.04469208 -0.6657673 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05371783  0.22330738 -0.05800742 -0.38748452] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05818398  0.41920271 -0.06575711 -0.6978779 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06656804  0.61517186 -0.07971467 -1.01051499] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07887147  0.81126189 -0.09992497 -1.3271273 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09509671  1.00749308 -0.12646752 -1.64933486] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11524657  0.81405601 -0.15945421 -1.39857896] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13152769  0.621235   -0.18742579 -1.15969774] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.03784422 -0.00522377 -0.01632613 -0.02637107] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0379487  -0.20010783 -0.01685356  0.2611164 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04195086 -0.0047494  -0.01163123 -0.03683434] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04204585 -0.19970265 -0.01236791  0.25215624] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0460399  -0.39464583 -0.00732479  0.54091258] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05393281 -0.58966406  0.00349346  0.83127861] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0657261  -0.39459003  0.02011903  0.53969642] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0736179  -0.19975659  0.03091296  0.25342001] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07761303 -0.00508936  0.03598136 -0.02935436] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07771482 -0.20070834  0.03539428  0.27446042] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08172898 -0.0061088   0.04088348 -0.00685232] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08185116  0.1884037   0.04074644 -0.28636087] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07808308  0.38292158  0.03501922 -0.56591937] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07042465  0.18732629  0.02370083 -0.26241278] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06667813 -0.00812581  0.01845258  0.03765043] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06684064 -0.20350745  0.01920559  0.33609774] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07091079 -0.00866401  0.02592754  0.04953257] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07108407 -0.20414796  0.02691819  0.35028178] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07516703 -0.39964218  0.03392383  0.65132988] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08315988 -0.20500872  0.04695043  0.36951936] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08726005 -0.40076522  0.05434081  0.67662888] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09527535 -0.59659843  0.06787339  0.98591344] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10720732 -0.79256053  0.08759166  1.29911923] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12305853 -0.59865289  0.11357404  1.03509189] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13503159 -0.79508663  0.13427588  1.3611632 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.15093332 -0.6018787   0.16149915  1.11331864] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1629709  -0.4092029   0.18376552  0.87534039] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17115496 -0.60628327  0.20127233  1.21970695] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.01120869  0.02591341  0.01077485 -0.02319781] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01069042  0.22087919  0.0103109  -0.31246177] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00627284  0.41585275  0.00406166 -0.6018752 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00204422  0.61091765 -0.00797584 -0.89327601] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01426257  0.41590478 -0.02584136 -0.60311088] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[ 0.02258066  0.22115361 -0.03790358 -0.31867817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02700374  0.41679432 -0.04427714 -0.62306945] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03533962  0.61250562 -0.05673853 -0.92936191] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04758974  0.80834567 -0.07532577 -1.23932145] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06375665  0.61426758 -0.1001122  -0.97115465] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.076042    0.42062138 -0.11953529 -0.71152359] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08445443  0.22733968 -0.13376576 -0.45872962] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08900122  0.03433713 -0.14294036 -0.21102198] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08968796 -0.15848254 -0.14716079  0.03337628] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08651831 -0.35122127 -0.14649327  0.27625024] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07949389 -0.54398256 -0.14096826  0.51938005] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06861424 -0.34718683 -0.13058066  0.18580711] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0616705  -0.15046182 -0.12686452 -0.14504988] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05866126  0.04622711 -0.12976552 -0.47491042] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05958581  0.2429198  -0.13926373 -0.80551293] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0644442   0.43964802 -0.15537399 -1.13855964] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07323716  0.24686077 -0.17814518 -0.89835869] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07817438  0.05454233 -0.19611235 -0.66654456] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.04881536 -0.00014991  0.01356105  0.01874855] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04881836 -0.19546369  0.01393602  0.31567911] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05272764 -0.39078135  0.0202496   0.61272421] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06054326 -0.19594816  0.03250409  0.32648731] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06446223 -0.0013037   0.03903383  0.04422936] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0644883  -0.19696301  0.03991842  0.3489678 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06842756 -0.00243086  0.04689778  0.06913494] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06847618 -0.19819271  0.04828048  0.37623774] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07244003 -0.39396597  0.05580523  0.68374438] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08031935 -0.19966148  0.06948012  0.40913958] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08431258 -0.3956962   0.07766291  0.72289291] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09222651 -0.20172945  0.09212077  0.45562973] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09626109 -0.39802491  0.10123336  0.77586966] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10422159 -0.20443022  0.11675076  0.51667615] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1083102  -0.01112905  0.12708428  0.26294508] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10853278 -0.20781429  0.13234318  0.59285602] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11268906 -0.01476896  0.1442003   0.34461559] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11298444  0.17803882  0.15109261  0.10065372] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10942367  0.3707088   0.15310569 -0.14080627] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10200949  0.17376337  0.15028956  0.19599356] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09853422 -0.02315303  0.15420943  0.53205761] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09899728  0.16950207  0.16485059  0.29166233] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09560724 -0.02753959  0.17068383  0.6314647 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09615803  0.16484172  0.18331313  0.39702376] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0928612  -0.03234324  0.1912536   0.74143823] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09350806 -0.22951838  0.20608237  1.0876946 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.00852124 -0.02234996 -0.03355841  0.0230182 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00896824 -0.21697498 -0.03309805  0.30492704] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01330774 -0.41161001 -0.02699951  0.58699064] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02153994 -0.21612053 -0.0152597   0.2859264 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02586235 -0.41102157 -0.00954117  0.57375774] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03408278 -0.21576715  0.00193399  0.27808439] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03839812 -0.41091664  0.00749567  0.57137667] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04661645 -0.2159006   0.01892321  0.28106453] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05093447 -0.4112873   0.0245445   0.57965519] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05916021 -0.21651775  0.0361376   0.29480422] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06349057 -0.41213576  0.04203369  0.59866182] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07173328 -0.21762637  0.05400692  0.3195097 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07608581 -0.0233135   0.06039712  0.04433569] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07655208  0.17089271  0.06128383 -0.22869652] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07313423  0.36508779  0.0567099  -0.50143613] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06583247  0.16921423  0.04668118 -0.19143433] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06244819  0.36363836  0.04285249 -0.46903355] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05517542  0.16793807  0.03347182 -0.16315782] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05181666 -0.02764666  0.03020866  0.13989382] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05236959  0.1670299   0.03300654 -0.14310779] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04902899 -0.02854883  0.03014438  0.15980267] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04959997 -0.22408908  0.03334044  0.46184091] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05408175 -0.02945382  0.04257726  0.17985053] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05467083  0.16503381  0.04617427 -0.0991026 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05137015 -0.03071844  0.04419221  0.20778324] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05198452  0.16374464  0.04834788 -0.07063832] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04870963 -0.03203591  0.04693511  0.23689809] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04935034 -0.2277959   0.05167307  0.54400855] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05390626 -0.03343672  0.06255325  0.26804404] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.054575   -0.22939302  0.06791413  0.57978228] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05916286 -0.03528522  0.07950977  0.30924319] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05986856  0.15861915  0.08569464  0.04265678] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05669618  0.35241439  0.08654777 -0.22180599] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04964789  0.54619951  0.08211165 -0.48598239] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0387239   0.74007262  0.072392   -0.75169853] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02392245  0.544031    0.05735803 -0.43714163] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01304183  0.34814604  0.0486152  -0.12694338] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00607891  0.54253905  0.04607633 -0.40390115] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[ 0.00477187  0.73697828  0.03799831 -0.6817088 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01951144  0.93155249  0.02436413 -0.96219061] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03814249  1.1263387   0.00512032 -1.24712087] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06066926  1.32139462 -0.0198221  -1.53819554] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08709716  1.12651672 -0.05058601 -1.25176343] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10962749  1.322249   -0.07562128 -1.55985222] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13607247  1.12810918 -0.10681832 -1.29168622] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.15863465  0.93449502 -0.13265204 -1.03426518] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.17732455  0.74136243 -0.15333735 -0.78599685] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1921518   0.54864217 -0.16905728 -0.545213  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.20312465  0.74568576 -0.17996154 -0.88603118] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.21803836  0.55340303 -0.19768217 -0.65488854] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04293496 -0.00255925 -0.03638717  0.04419671] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04288378  0.19306507 -0.03550324 -0.25974101] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04674508  0.38867539 -0.04069806 -0.56340735] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05451858  0.19414743 -0.05196621 -0.28381895] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05840153 -0.00019631 -0.05764258 -0.00796806] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05839761 -0.19444624 -0.05780195  0.26598536] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05450868 -0.38869762 -0.05248224  0.53989151] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04673473 -0.58304407 -0.04168441  0.81558709] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03507385 -0.77757122 -0.02537267  1.09487269] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01952242 -0.58212442 -0.00347521  0.79433808] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00787994 -0.7771985   0.01241155  1.08592572] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00766403 -0.58224245  0.03413006  0.79716307] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01930888 -0.38760502  0.05007333  0.51540925] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02706098 -0.19322265  0.06038151  0.23891657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03092544 -0.38915289  0.06515984  0.55001771] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03870849 -0.58512668  0.0761602   0.86249799] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05041103 -0.39111971  0.09341016  0.59470059] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05823342 -0.19742075  0.10530417  0.33284146] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06218184 -0.3938716   0.111961    0.65678767] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07005927 -0.20047158  0.12509675  0.40135178] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0740687  -0.00732545  0.13312379  0.15057916] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07421521 -0.20407755  0.13613537  0.48211894] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07829676 -0.4008319   0.14577775  0.81442017] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0863134  -0.59761719  0.16206615  1.14917424] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09826574 -0.79444007  0.18504964  1.48797894] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.00502165 -0.00442901 -0.03204883 -0.03538921] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00493307 -0.19907706 -0.03275662  0.24701219] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00095153 -0.39371625 -0.02781637  0.52918562] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0069228  -0.19821424 -0.01723266  0.22786909] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01088708 -0.00285031 -0.01267528 -0.07019944] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01094409 -0.19778827 -0.01407927  0.21845758] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01489985 -0.00246792 -0.00971012 -0.07863308] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01494921 -0.19744933 -0.01128278  0.21097052] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0188982  -0.39240816 -0.00706337  0.50007308] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02674636 -0.19718735  0.00293809  0.20517256] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03069011 -0.00210754  0.00704154 -0.0865821 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03073226 -0.19732971  0.0053099   0.2083141 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03467885 -0.39252718  0.00947618  0.50266729] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0425294  -0.58778141  0.01952953  0.79832143] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05428502 -0.78316578  0.03549596  1.09708341] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06994834 -0.97873667  0.05743763  1.40068858] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08952307 -0.78437364  0.0854514   1.12650219] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10521055 -0.59046891  0.10798144  0.86179852] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11701992 -0.39696992  0.12521741  0.60492542] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12495932 -0.20380093  0.13731592  0.35415834] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12903534 -0.40058099  0.14439909  0.68679412] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13704696 -0.59738077  0.15813497  1.02122747] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.14899458 -0.40467853  0.17855952  0.78207887] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.15708815 -0.60174633  0.1942011   1.12519865] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01961779  0.0181879  -0.0111383   0.02582001] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01925403 -0.17677256 -0.0106219   0.31496797] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02278948 -0.37174161 -0.00432254  0.60428226] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03022431 -0.17655947  0.0077631   0.31024097] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0337555   0.01845102  0.01396792  0.02001638] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03338648  0.2133699   0.01436825 -0.26822698] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02911908  0.40828388  0.00900371 -0.55634366] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02095341  0.60327827 -0.00212316 -0.84617634] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00888784  0.79842913 -0.01904669 -1.13952617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00708074  0.60356131 -0.04183722 -0.85287685] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01915197  0.40903388 -0.05889475 -0.5736376 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02733265  0.21478497 -0.0703675  -0.30007445] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03162835  0.02073287 -0.07636899 -0.03038764] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.032043   -0.17321555 -0.07697675  0.23725638] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02857869 -0.3671582  -0.07223162  0.50470006] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02123553 -0.17109659 -0.06213762  0.19015689] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0178136   0.02485673 -0.05833448 -0.12146254] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01831073 -0.16938305 -0.06076373  0.15226091] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01492307 -0.36358467 -0.05771851  0.4251722 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00765138 -0.16769466 -0.04921507  0.11486667] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00429748  0.02809666 -0.04691773 -0.19292811] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00485942 -0.16632382 -0.0507763   0.08459294] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.00153294 -0.36068253 -0.04908444  0.36083345] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00568071 -0.55507366 -0.04186777  0.63764393] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01678218 -0.74958754 -0.02911489  0.91685353] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03177393 -0.55408427 -0.01077782  0.61516434] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04285562 -0.3588134   0.00152547  0.31910643] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05003189 -0.16371321  0.0079076   0.02690497] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05330615  0.03129445  0.00844569 -0.26327257] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05268026 -0.16394703  0.00318024  0.0320622 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0559592   0.03112918  0.00382149 -0.25961563] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05533662  0.22619636 -0.00137083 -0.55109076] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05081269  0.03109369 -0.01239264 -0.25884005] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05019082 -0.16384917 -0.01756944  0.02990844] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0534678   0.03152028 -0.01697127 -0.26826564] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0528374   0.22688027 -0.02233659 -0.56625274] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04829979  0.03207867 -0.03366164 -0.28068965] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04765822  0.22766422 -0.03927543 -0.58379642] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04310493  0.42331373 -0.05095136 -0.88858838] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03463866  0.61908874 -0.06872313 -1.19684302] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02225688  0.81502962 -0.09265999 -1.51024925] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00595629  0.62114451 -0.12286497 -1.24787228] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0064666   0.42779326 -0.14782242 -0.99606323] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01502246  0.23492412 -0.16774369 -0.75321631] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01972095  0.04246312 -0.18280801 -0.51766259] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02057021 -0.14967801 -0.19316126 -0.28770003] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01757665 -0.34159618 -0.19891526 -0.06161356] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01074472 -0.1442607  -0.20014753 -0.40987623] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00785951  0.05305236 -0.20834506 -0.75838874] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.04315444 -0.02039578 -0.03258834  0.01851607] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04356236  0.175178   -0.03221802 -0.28426812] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0400588   0.3707443  -0.03790338 -0.58693579] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03264391  0.17617313 -0.0496421  -0.30642958] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02912045  0.37196603 -0.05577069 -0.61434591] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02168113  0.1776659  -0.06805761 -0.33973695] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01812781  0.3736869  -0.07485234 -0.65308124] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01065407  0.56976698 -0.08791397 -0.96836384] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00074127  0.37592837 -0.10728125 -0.70454224] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00825983  0.57236047 -0.12137209 -1.02897665] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01970704  0.76887019 -0.14195162 -1.35716862] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03508445  0.57578534 -0.169095   -1.11204825] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04660015  0.38323848 -0.19133596 -0.87682461] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05426492  0.19115972 -0.20887245 -0.6498745 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.0115443  -0.00531058  0.03192248 -0.03326396] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01165052  0.1893394   0.0312572  -0.31570655] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00786373  0.38400249  0.02494307 -0.59837044] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.83678101e-04  5.78766719e-01  1.29756565e-02 -8.83093407e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01139166  0.38347098 -0.00468621 -0.58635972] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01906108  0.18841497 -0.01641341 -0.29515667] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02282938 -0.00646918 -0.02231654 -0.00769513] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02269999  0.18896559 -0.02247044 -0.30733482] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0264793   0.38440041 -0.02861714 -0.60701878] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03416731  0.57991055 -0.04075751 -0.90857621] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04576552  0.38536329 -0.05892904 -0.62897706] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05347279  0.19111108 -0.07150858 -0.35541964] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05729501  0.38717304 -0.07861697 -0.66976659] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06503847  0.19322709 -0.0920123  -0.40283582] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06890301 -0.0004776  -0.10006902 -0.14052136] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06889346 -0.19403455 -0.10287945  0.11899038] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06501277  0.00239945 -0.10049964 -0.20429589] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06506076 -0.1911524  -0.10458556  0.05506975] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06123771 -0.38463138 -0.10348416  0.31301031] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05354508 -0.18819913 -0.09722396 -0.01043185] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0497811   0.00817298 -0.09743259 -0.33213779] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04994456 -0.18543688 -0.10407535 -0.07170026] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04623582 -0.37892472 -0.10550935  0.18641812] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03865733 -0.57239128 -0.10178099  0.44404195] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0272095  -0.76593695 -0.09290015  0.70298601] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01189076 -0.56965864 -0.07884043  0.38256475] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00049759 -0.37351096 -0.07118914  0.06610207] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00697263 -0.17744438 -0.0698671  -0.2481648 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01052152 -0.37150259 -0.07483039  0.02168812] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01795157 -0.56547607 -0.07439663  0.28985448] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02926109 -0.36937646 -0.06859954 -0.02533371] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03664862 -0.17334121 -0.06910621 -0.33884741] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04011544  0.02269254 -0.07588316 -0.65249724] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03966159  0.21878465 -0.08893311 -0.96807718] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0352859   0.41498086 -0.10829465 -1.28732036] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02698628  0.22139063 -0.13404106 -1.03041256] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02255847  0.02828201 -0.15464931 -0.78263808] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02199283  0.22515283 -0.17030207 -1.1197059 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01748977  0.03262365 -0.19269619 -0.88491707] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[0.00059115 0.04629738 0.02226843 0.03735143] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0015171  -0.14913671  0.02301546  0.33697626] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00146564 -0.34457849  0.02975498  0.63682725] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00835721 -0.14988386  0.04249153  0.35366124] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01135488 -0.34558344  0.04956475  0.6594344 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.01826655 -0.54135888  0.06275344  0.96730299] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02909373 -0.34713319  0.0820995   0.69497531] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03603639 -0.54329211  0.09599901  1.01233277] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04690224 -0.34957288  0.11624566  0.75127223] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05389369 -0.54608969  0.13127111  1.07815733] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06481549 -0.35292252  0.15283425  0.82938247] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07187394 -0.16018316  0.1694219   0.58840195] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0750776   0.03221198  0.18118994  0.35351328] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07443336  0.22435734  0.18826021  0.12299224] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06994622  0.02710708  0.19072005  0.46866203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06940407 -0.170125    0.20009329  0.81487822] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01287813 -0.03277231 -0.04855559  0.03345708] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01222268  0.16301111 -0.04788645 -0.27414166] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01548291 -0.03139605 -0.05336928  0.00306156] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01485499 -0.2257136  -0.05330805  0.27843976] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01034071 -0.42003614 -0.04773925  0.55384437] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00193999 -0.61445637 -0.03666236  0.83111244] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01034914 -0.41885303 -0.02004012  0.52712827] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0187262  -0.61368736 -0.00949755  0.81342969] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03099994 -0.80867795  0.00677104  1.10311015] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0471735  -0.61364572  0.02883325  0.81255918] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05944642 -0.41893032  0.04508443  0.52908328] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06782502 -0.61465658  0.0556661   0.83562467] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08011816 -0.42033742  0.07237859  0.56095481] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0885249  -0.61639654  0.08359768  0.87553424] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10085283 -0.81254931  0.10110837  1.19328498] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11710382 -0.6188717   0.12497407  0.93392749] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12948125 -0.42563707  0.14365262  0.68298294] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.137994   -0.62243072  0.15731228  1.01722163] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.15044261 -0.42971576  0.17765671  0.77777797] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.15903693 -0.23742353  0.19321227  0.54583624] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1637854  -0.43466008  0.20412899  0.89263758] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.0305586  -0.01727186 -0.0483861  -0.01549065] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03021316 -0.2116677  -0.04869591  0.26154188] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02597981 -0.01588568 -0.04346508 -0.04609398] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0256621  -0.21035828 -0.04438696  0.23256471] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02145493 -0.01463113 -0.03973566 -0.07378232] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02116231 -0.20916157 -0.04121131  0.20610378] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01697908 -0.4036707  -0.03708923  0.48550707] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00890566 -0.59825018 -0.02737909  0.76627382] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00305934 -0.40276217 -0.01205361  0.46510326] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01111458 -0.20747199 -0.00275155  0.16864559] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01526402 -0.40255445  0.00062136  0.46045922] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02331511 -0.59768518  0.00983055  0.75333793] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03526882 -0.79294128  0.02489731  1.04909807] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05112764 -0.59815835  0.04587927  0.76433327] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06309081 -0.40369724  0.06116593  0.48643232] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07116475 -0.59962653  0.07089458  0.79774824] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08315729 -0.40554519  0.08684954  0.52818306] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09126819 -0.21174561  0.0974132   0.26408104] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0955031  -0.01813928  0.10269483  0.00364255] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09586589 -0.21457264  0.10276768  0.32687947] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10015734 -0.0210525   0.10930527  0.0682913 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10057839 -0.21755799  0.11067109  0.39336225] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10492955 -0.02416623  0.11853834  0.13752024] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10541287 -0.22076894  0.12128874  0.46512355] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10982825 -0.41737721  0.13059121  0.79344091] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1181758  -0.22426653  0.14646003  0.54452616] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12266113 -0.03147335  0.15735055  0.30133795] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12329059 -0.22844708  0.16337731  0.63921881] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12785954 -0.03593455  0.17616169  0.4021149 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12857823  0.15630855  0.18420399  0.16973718] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12545206 -0.04090653  0.18759873  0.51440541] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12627019 -0.23810621  0.19788684  0.85984618] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.02756854  0.00784875 -0.01685257  0.00123707] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02741157  0.20320829 -0.01682783 -0.29671504] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0233474   0.00833022 -0.02276213 -0.00938647] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0231808  -0.18645802 -0.02294986  0.27602871] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02690996 -0.38124515 -0.01742928  0.5613858 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03453486 -0.18588299 -0.00620156  0.26326312] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03825252  0.00932693 -0.0009363  -0.03136937] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03806598 -0.18578158 -0.00156369  0.26101799] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04178161  0.00936265  0.00365667 -0.03215773] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04159436 -0.18581154  0.00301352  0.26167667] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04531059 -0.38097638  0.00824705  0.55530857] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05293012 -0.18597119  0.01935322  0.26523533] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05664954  0.00886927  0.02465793 -0.02128113] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05647216  0.20362908  0.0242323  -0.30608335] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05239958  0.39839749  0.01811064 -0.59102659] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04443163  0.59326126  0.00629011 -0.87795007] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0325664   0.78829717 -0.0112689  -1.16864887] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01680046  0.59332361 -0.03464187 -0.87952009] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00493399  0.78889867 -0.05223228 -1.18288935] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01084399  0.59449195 -0.07589006 -0.90702627] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02273383  0.40047495 -0.09403059 -0.63912869] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[ 0.03074333  0.20678113 -0.10681316 -0.37747572] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03487895  0.4032451  -0.11436268 -0.70183558] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04294385  0.20987842 -0.12839939 -0.44722963] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04714142  0.0167844  -0.13734398 -0.19761867] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04747711  0.21357626 -0.14129635 -0.53027901] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05174863  0.41037353 -0.15190193 -0.86393611] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0599561   0.21760949 -0.16918066 -0.62260717] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06430829  0.41463931 -0.1816328  -0.96343461] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07260108  0.22236081 -0.20090149 -0.73286823] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.01439997 -0.03777651  0.0434824  -0.01465915] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01364444 -0.23349421  0.04318922  0.2914197 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00897455 -0.03901384  0.04901761  0.01266485] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00819427  0.15537212  0.04927091 -0.26415844] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01130172  0.34975745  0.04398774 -0.5409028 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01829687  0.1540457   0.03316968 -0.23469093] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02137778  0.34867842  0.02847586 -0.51672932] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02835135  0.54338807  0.01814128 -0.80030462] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03921911  0.34802205  0.00213519 -0.50197056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04617955  0.54311384 -0.00790423 -0.79397983] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05704183  0.34810127 -0.02378382 -0.50379393] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06400385  0.15332245 -0.0338597  -0.21870013] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0670703  -0.04129953 -0.0382337   0.06311266] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06624431 -0.23585304 -0.03697145  0.34349168] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06152725 -0.43043005 -0.03010162  0.62429062] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05291865 -0.23490106 -0.0176158   0.32228146] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04822063 -0.42976779 -0.01117018  0.60935742] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03962527 -0.62473182  0.00101697  0.89850127] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02713064 -0.42962367  0.018987    0.60613818] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01853816 -0.6250059   0.03110976  0.90474054] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00603804 -0.82053504  0.04920457  1.20703713] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01037266 -0.62608212  0.07334532  0.93017108] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0228943  -0.43202268  0.09194874  0.66140872] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03153475 -0.62829562  0.10517691  0.98156931] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04410066 -0.82465783  0.1248083   1.30535001] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06059382 -0.63131956  0.1509153   1.05419687] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07322021 -0.82808471  0.17199924  1.39019078] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08978191 -0.63547126  0.19980305  1.15585256] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.00776567  0.04784627  0.00751223 -0.00185681] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00680874 -0.1473826   0.0074751   0.29318682] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00975639 -0.34261033  0.01333883  0.58821788] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0166086  -0.53791651  0.02510319  0.88507259] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02736693 -0.73337014  0.04280464  1.18554022] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04203433 -0.5388287   0.06651545  0.90657609] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05281091 -0.34466734  0.08464697  0.6355192 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05970425 -0.15082163  0.09735735  0.37064821] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06272069  0.04279205  0.10477032  0.11018083] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06186484  0.23626882  0.10697393 -0.14769681] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05713947  0.03979055  0.10402     0.17672631] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05634366 -0.15665433  0.10755452  0.50032765] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05947674 -0.35311503  0.11756107  0.82487596] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06653904 -0.15978034  0.13405859  0.57135901] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06973465 -0.35650223  0.14548577  0.90308951] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0768647  -0.16361871  0.16354756  0.65944288] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08013707 -0.36059341  0.17673642  0.99883116] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08734894 -0.16821766  0.19671305  0.766453  ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.00516397 -0.04394332 -0.03769718 -0.04869995] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00604284 -0.23850502 -0.03867118  0.2318549 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01081294 -0.04285244 -0.03403408 -0.07277103] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01166999 -0.23747036 -0.0354895   0.2089827 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0164194  -0.0418594  -0.03130985 -0.09468071] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01725658  0.15369699 -0.03320346 -0.39707512] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01418264 -0.04093853 -0.04114496 -0.11504277] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01500142 -0.23544752 -0.04344582  0.16438057] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01971037 -0.03973144 -0.04015821 -0.14168558] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02050499 -0.23425592 -0.04299192  0.13806256] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02519011 -0.03854542 -0.04023067 -0.16786768] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02596102  0.15712861 -0.04358802 -0.47296585] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02281845 -0.0373515  -0.05304734 -0.19433357] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02356548  0.15848756 -0.05693401 -0.50326755] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02039573 -0.03578767 -0.06699936 -0.22905644] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02111148 -0.2298914  -0.07158049  0.04176283] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02570931 -0.03381988 -0.07074523 -0.27261799] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02638571 -0.22786481 -0.07619759 -0.00306043] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.030943   -0.03173746 -0.0762588  -0.31877824] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03157775  0.16438308 -0.08263437 -0.63450243] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02829009  0.36055453 -0.09532441 -0.95202238] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.021079    0.16683565 -0.11436486 -0.69074623] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01774229 -0.02652928 -0.12817979 -0.4361436 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01827287 -0.21962586 -0.13690266 -0.18645666] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02266539 -0.02283779 -0.14063179 -0.51900066] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02312215 -0.2157289  -0.1510118  -0.27373003] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02743672 -0.40840981 -0.15648641 -0.03222682] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03560492 -0.60098189 -0.15713094  0.20728227] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04762456 -0.79354879 -0.1529853   0.44656744] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06349553 -0.98621305 -0.14405395  0.68738608] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08321979 -1.17907288 -0.13030623  0.93147387] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10680125 -0.98245612 -0.11167675  0.60084735] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12645037 -0.78596366 -0.0996598   0.27517994] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.14216965 -0.58957149 -0.0941562  -0.04719809] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.15396108 -0.39323436 -0.09510016 -0.36804034] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.16182577 -0.58688543 -0.10246097 -0.10679361] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.17356347 -0.3904558  -0.10459684 -0.42996362] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.18137259 -0.58395309 -0.11319612 -0.17199939] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.19305165 -0.77728825 -0.1166361   0.08293874] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.20859742 -0.97056186 -0.11497733  0.33666611] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.22800865 -0.77400744 -0.10824401  0.01005223] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.2434888  -0.96742395 -0.10804296  0.26671736] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.26283728 -0.77093919 -0.10270862 -0.0579941 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.27825607 -0.96444998 -0.1038685   0.20059914] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.29754507 -0.76800768 -0.09985651 -0.12295868] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.31290522 -0.96156776 -0.10231569  0.13662605] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.33213657 -1.15508672 -0.09958317  0.39535807] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.35523831 -0.95870321 -0.09167601  0.07301314] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.37441237 -1.15239944 -0.09021574  0.33542239] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.39746036 -1.34612941 -0.0835073   0.59834812] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.42438295 -1.1499444  -0.07154033  0.28057347] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.44738184 -0.95387875 -0.06592886 -0.03378766] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.46645941 -1.14799634 -0.06660462  0.2373871 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.48941934 -0.9519892  -0.06185687 -0.075539  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.50845912 -1.14617234 -0.06336765  0.19700431] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.53138257 -0.95020393 -0.05942757 -0.11497594] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.55038665 -0.75428304 -0.06172709 -0.42579991] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.56547231 -0.94847882 -0.07024309 -0.15319726] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.58444189 -1.14252836 -0.07330703  0.11652476] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.60729245 -1.33652757 -0.07097654  0.38520905] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.634023   -1.14047354 -0.06327235  0.07101885] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.65683248 -0.94450423 -0.06185198 -0.24093656] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.67572256 -1.1386906  -0.06667071  0.03161312] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.69849637 -1.33279626 -0.06603845  0.30253807] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.7251523  -1.52691791 -0.05998769  0.57368465] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.75569066 -1.72114978 -0.04851399  0.8468826 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.79011365 -1.52540078 -0.03157634  0.53934704] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.82062167 -1.3298495  -0.0207894   0.23688444] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.84721866 -1.52466836 -0.01605171  0.52293795] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.87771202 -1.71956075 -0.00559295  0.81051988] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.91210324 -1.52436263  0.01061745  0.51608295] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.94259049 -1.71963247  0.02093911  0.81209267] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.97698314 -1.91503489  0.03718096  1.11128769] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.01528384 -1.72042056  0.05940671  0.83049665] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-1.04969225 -1.91630202  0.07601665  1.14125581] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-1.08801829 -1.72225145  0.09884176  0.87334793] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.12246332 -1.52860237  0.11630872  0.61330425] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.15303537 -1.7251411   0.12857481  0.94023828] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.18753819 -1.92173966  0.14737957  1.27039885] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.22597298 -2.11840325  0.17278755  1.60536918] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-1.26834105 -2.31509649  0.20489493  1.94656283] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.03389834  0.03816627  0.00951892  0.02564677] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03313502 -0.15709088  0.01003185  0.32131775] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03627683  0.03788678  0.01645821  0.03181529] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0355191  -0.15746727  0.01709451  0.32964518] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03866844  0.03740721  0.02368742  0.04240171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0379203  -0.15804626  0.02453545  0.34246312] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04108122 -0.35350853  0.03138471  0.64278097] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0481514  -0.54905355  0.04424033  0.94517965] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05913247 -0.74474259  0.06314393  1.25142842] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07402732 -0.550484    0.0881725   0.97917322] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.085037   -0.35664764  0.10775596  0.71543731] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09216995 -0.16316917  0.12206471  0.45852186] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09543333  0.03003491  0.13123514  0.20667006] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09483264 -0.16669565  0.13536854  0.53770086] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09816655  0.02628943  0.14612256  0.29054839] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09764076 -0.17058105  0.15193353  0.62551437] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10105238 -0.36746081  0.16444382  0.96192881] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1084016  -0.56436494  0.18368239  1.30142942] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.00546601 -0.01256654 -0.01836392  0.03224767] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00571734  0.18281388 -0.01771897 -0.26617223] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00206106  0.37818418 -0.02304241 -0.56439078] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00550262  0.18339298 -0.03433023 -0.27905542] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00917048  0.37898742 -0.03991134 -0.58236531] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01675023  0.57464516 -0.05155864 -0.88734901] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02824313  0.77042759 -0.06930562 -1.19578394] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04365168  0.96637494 -0.0932213  -1.50935891] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06297918  0.77249822 -0.12340848 -1.24717391] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07842915  0.57915577 -0.14835196 -0.99555592] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09001226  0.38629571 -0.16826308 -0.75289904] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09773818  0.19384393 -0.18332106 -0.51753458] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10161505  0.00171211 -0.19367175 -0.287761  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1016493   0.19899241 -0.19942697 -0.63473317] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.00516452 -0.01425921  0.00826452 -0.0319931 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00487934  0.18074326  0.00762466 -0.32205707] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0084942   0.37575581  0.00118351 -0.61232576] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01600932  0.5708612  -0.011063   -0.90463569] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02742654  0.37589081 -0.02915572 -0.61545045] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.03494436  0.18118809 -0.04146472 -0.33209106] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03856812 -0.01331988 -0.04810655 -0.05276702] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03830172 -0.20772022 -0.04916189  0.22435826] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03414732 -0.01193136 -0.04467472 -0.08341798] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03390869 -0.20638539 -0.04634308  0.19484204] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02978098 -0.40081484 -0.04244624  0.47255312] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02176469 -0.5953124  -0.03299518  0.75156123] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00985844 -0.39975135 -0.01796395  0.44868049] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00186341 -0.59461465 -0.00899034  0.73564717] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01002888 -0.78961127  0.0057226   1.02548715] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02582111 -0.98480895  0.02623234  1.31996131] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04551729 -1.18025252  0.05263157  1.62073722] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06912234 -0.98578855  0.08504631  1.34491172] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08883811 -1.18187089  0.11194455  1.66294569] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11247553 -1.37810434  0.14520346  1.98829645] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.14003761 -1.18477377  0.18496939  1.74389233] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.03859386 -0.04880427  0.02410836 -0.04094425] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03956995  0.14596382  0.02328948 -0.32592432] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03665067  0.34074657  0.01677099 -0.61117273] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02983574  0.14539428  0.00454754 -0.31325506] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02692785 -0.04979216 -0.00171757 -0.01914146] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0279237  -0.24488944 -0.0021004   0.27299906] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03282149 -0.43998136  0.00335959  0.56501878] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04162111 -0.63515028  0.01465996  0.85875823] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05432412 -0.44023106  0.03183513  0.5707207 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06312874 -0.24556968  0.04324954  0.28823452] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06804013 -0.44128085  0.04901423  0.59423815] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07686575 -0.24687797  0.06089899  0.31738846] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08180331 -0.44281206  0.06724676  0.62863801] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09065955 -0.24868985  0.07981952  0.35786848] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09563335 -0.05478801  0.08697689  0.09138379] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09672911  0.13898656  0.08880457 -0.17264036] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09394938  0.33273268  0.08535176 -0.43604011] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08729472  0.52654925  0.07663096 -0.70064369] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07676374  0.33045336  0.06261809 -0.38485561] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07015467  0.52463302  0.05492097 -0.65715657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05966201  0.32879132  0.04177784 -0.34769829] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05308618  0.13310081  0.03482388 -0.04213955] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05042417  0.32770654  0.03398109 -0.32363501] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04387004  0.13211762  0.02750839 -0.02043248] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04122768 -0.06338781  0.02709974  0.28080116] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04249544  0.1313373   0.03271576 -0.00321291] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03986869  0.32597516  0.0326515  -0.28539675] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03334919  0.52061659  0.02694357 -0.56760544] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02293686  0.32512726  0.01559146 -0.26655744] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01643431  0.1297863   0.01026031  0.0310021 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01383859 -0.06548128  0.01088035  0.32690452] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01514821 -0.26075643  0.01741844  0.62299867] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02036334 -0.06588197  0.02987841  0.33585208] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02168098 -0.26141611  0.03659546  0.63780533] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0269093  -0.45702874  0.04935156  0.94178449] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03604988 -0.26260535  0.06818725  0.66500795] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04130199 -0.06849481  0.08148741  0.39455029] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04267188 -0.26467278  0.08937842  0.71177158] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04796534 -0.46091121  0.10361385  1.03119656] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05718356 -0.65724754  0.12423778  1.35452959] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07032851 -0.85369043  0.15132837  1.68335479] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08740232 -0.66061008  0.18499547  1.44136505] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04053395  0.02998473  0.04226204 -0.02978527] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04113365  0.22447593  0.04166633 -0.30884025] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04562317  0.4189802   0.03548952 -0.58809707] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05400277  0.61358765  0.02372758 -0.86939271] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06627452  0.80837892  0.00633973 -1.1545221 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0824421   1.00341762 -0.01675071 -1.44521041] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10251046  1.19874162 -0.04565492 -1.74307978] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12648529  1.39435232 -0.08051652 -2.04960798] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.15437233  1.2001424  -0.12150868 -1.78288316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.17837518  1.00657815 -0.15716634 -1.53031325] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.19850674  1.20320708 -0.1877726  -1.86763832] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.02105774 -0.03336682  0.02299986 -0.01667459] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02039041 -0.22881094  0.02266637  0.28317541] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01581419 -0.42424873  0.02832988  0.58292014] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00732921 -0.22953489  0.03998828  0.29929445] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00273852 -0.03500507  0.04597417  0.01948642] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00203841 -0.23075516  0.0463639   0.32631294] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00257669 -0.03632293  0.05289016  0.04860411] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00330315 -0.23216182  0.05386224  0.35749412] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00794638 -0.4280065   0.06101212  0.66666295] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01650651 -0.62392155  0.07434538  0.97791516] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02898495 -0.42987093  0.09390368  0.7094807 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03758236 -0.23616636  0.1080933   0.44777108] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04230569 -0.04272621  0.11704872  0.19102192] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04316022 -0.23931119  0.12086916  0.51821553] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.04794644 -0.04607981  0.13123347  0.2659322 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04886804  0.14694877  0.13655211  0.01735102] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04592906 -0.04984041  0.13689913  0.34980868] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04692587 -0.24661681  0.14389531  0.68233443] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0518582  -0.05375521  0.15754199  0.43819127] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05293331 -0.25071521  0.16630582  0.7760985 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05794761 -0.05822329  0.18182779  0.54001317] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05911208 -0.25537268  0.19262805  0.88402548] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04392373 -0.00869179 -0.0095876  -0.01625628] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04374989  0.18656633 -0.00991272 -0.31194872] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04748122 -0.008413   -0.0161517  -0.02240838] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04731296  0.18693681 -0.01659987 -0.32014327] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05105169  0.38229118 -0.02300273 -0.6180146 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05869752  0.18749798 -0.03536302 -0.3326643 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06244748  0.38310496 -0.04201631 -0.63628589] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07010958  0.18859339 -0.05474203 -0.3571252 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07388144 -0.00570929 -0.06188453 -0.08219388] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07376726  0.19024266 -0.06352841 -0.39374161] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07757211  0.38620586 -0.07140324 -0.70575824] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08529623  0.19214207 -0.08551841 -0.43637901] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08913907 -0.0016718  -0.09424599 -0.17183242] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08910563  0.19466385 -0.09768264 -0.49269624] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09299891  0.39101806 -0.10753656 -0.81449653] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10081927  0.58743536 -0.12382649 -1.13897542] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11256798  0.78393941 -0.146606   -1.46778667] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12824677  0.98051991 -0.17596173 -1.80244215] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.00051034  0.01383829  0.00156305 -0.04183118] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00078711  0.20893779  0.00072643 -0.33402054] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00496587  0.01380551 -0.00595398 -0.04110862] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00524198  0.20901233 -0.00677615 -0.33566412] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00942222  0.40423006 -0.01348943 -0.63047616] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01750682  0.59953761 -0.02609896 -0.92737662] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02949758  0.4047776  -0.04464649 -0.64300831] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03759313  0.21030543 -0.05750666 -0.36471245] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04179924  0.01604589 -0.06480091 -0.09070246] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04212015  0.21203393 -0.06661495 -0.40310563] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04636083  0.01791689 -0.07467707 -0.13214621] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04671917 -0.17606032 -0.07731999  0.13607403] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04319796 -0.36999454 -0.07459851  0.40339679] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03579807 -0.56398359 -0.06653057  0.67165946] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0245184  -0.36800294 -0.05309739  0.3587933 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01715834 -0.17216796 -0.04592152  0.04985152] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01371498 -0.36660238 -0.04492449  0.32769951] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00638294 -0.56105693 -0.0383705   0.60588368] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0048382  -0.36542003 -0.02625283  0.30136618] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0121466  -0.16993393 -0.0202255   0.00052062] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01554528 -0.36476007 -0.02021509  0.28675417] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02284048 -0.55958798 -0.01448001  0.57299349] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03403224 -0.75450394 -0.00302014  0.86107982] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04912232 -0.94958463  0.01420146  1.1528116 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06811401 -1.14488893  0.03725769  1.4499136 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09101179 -1.34044847  0.06625596  1.75400056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11782076 -1.53625633  0.10133598  2.0665329 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.14854589 -1.34230151  0.14266663  1.806835  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17539192 -1.14903099  0.17880333  1.5616764 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04496953  0.00677491 -0.02029516  0.01400054] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04510503  0.20218196 -0.02001515 -0.28501603] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04914867  0.0073511  -0.02571547  0.00128766] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04929569  0.20283223 -0.02568971 -0.2993966 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05335234  0.39831076 -0.03167764 -0.60006959] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06131855  0.59386123 -0.04367904 -0.90255981] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07319578  0.78954681 -0.06173023 -1.20864576] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08898671  0.59527409 -0.08590315 -0.9359289 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10089219  0.40140916 -0.10462173 -0.67142827] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10892038  0.20788521 -0.11805029 -0.41343229] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11307808  0.01461711 -0.12631894 -0.16017405] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11337042 -0.17849134 -0.12952242  0.09014162] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1098006   0.0182263  -0.12771959 -0.24043639] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11016512 -0.17486179 -0.13252831  0.00938911] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10666789  0.02188724 -0.13234053 -0.32199586] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10710563 -0.17112627 -0.13878045 -0.07380187] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10368311 -0.36401401 -0.14025649  0.17207734] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09640283 -0.556879   -0.13681494  0.41743363] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08526525 -0.36011032 -0.12846627  0.08494047] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07806304 -0.55317912 -0.12676746  0.33449193] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06699946 -0.35650238 -0.12007762  0.0046742 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05986941 -0.54971588 -0.11998413  0.25718951] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04887509 -0.74293872 -0.11484034  0.50974925] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03401632 -0.93627143 -0.10464536  0.76414923] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01529089 -1.12980855 -0.08936237  1.02215721] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00730528 -1.32363374 -0.06891923  1.28549732] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03377796 -1.51781405 -0.04320928  1.55583011] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06413424 -1.32220191 -0.01209268  1.24998618] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09057828 -1.12692707  0.01290704  0.95354018] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11311682 -0.93198113  0.03197785  0.66494019] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13175644 -0.73731824  0.04527665  0.38249503] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1465028  -0.93305284  0.05292655  0.68910282] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.16516386 -1.12886781  0.06670861  0.99796713] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.18774122 -1.32481515  0.08666795  1.31083218] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.21423752 -1.13089114  0.11288459  1.04648645] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.23685534 -1.32731558  0.13381432  1.37236546] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.26340166 -1.1340964   0.16126163  1.1243518 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.28608358 -0.94141274  0.18374867  0.88628197] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.30491184 -1.13848955  0.20147431  1.23063933] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.03074796 -0.03976583  0.03411231  0.01523563] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03154327 -0.23535996  0.03441702  0.31848322] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.03625047 -0.43095476  0.04078669  0.62181828] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04486957 -0.23642539  0.05322305  0.34225485] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04959808 -0.04209945  0.06006815  0.06681889] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05044007 -0.23802887  0.06140453  0.37783239] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05520064 -0.43396671  0.06896118  0.68922681] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06387998 -0.23986613  0.08274571  0.41902649] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0686773  -0.43605714  0.09112624  0.73660561] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07739844 -0.24230408  0.10585836  0.47393593] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08224452 -0.43874925  0.11533707  0.79801846] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09101951 -0.63524875  0.13129744  1.12464339] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10372448 -0.44206896  0.15379031  0.87585842] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11256586 -0.24933425  0.17130748  0.6352048 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11755255 -0.44637912  0.18401158  0.97656423] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12648013 -0.2541377   0.20354286  0.74686496] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.02154625  0.04730177  0.03364233 -0.04324205] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02249229  0.24192555  0.03277749 -0.32512355] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0273308   0.04635261  0.02627502 -0.02228686] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02825785  0.24108809  0.02582928 -0.30656529] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03307961  0.04560778  0.01969797 -0.00584967] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03399177 -0.14979104  0.01958098  0.29298255] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03099595 -0.34518662  0.02544063  0.59177617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02409222 -0.1504299   0.03727615  0.30721435] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.02108362 0.04414161 0.04342044 0.02651649] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02196645 -0.15157526  0.04395077  0.33257669] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.01893494 0.04289446 0.0506023  0.05407117] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01979283 -0.15291517  0.05168373  0.36228038] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01673453 -0.34873217  0.05892934  0.67080191] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00975989 -0.54462172  0.07234537  0.9814414 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.13254723e-03 -7.40634734e-01  9.19742014e-02  1.29594282e+00] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01594524 -0.93679674  0.11789306  1.61594447] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03468118 -0.7432462   0.15021195  1.36221425] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0495461  -0.939897    0.17745623  1.69786525] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.02434129  0.02882411 -0.0207296  -0.02387798] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02491777  0.22423712 -0.02120716 -0.32302858] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02940251  0.02942347 -0.02766774 -0.03710825] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02999098  0.22493103 -0.0284099  -0.33839063] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0344896   0.42044548 -0.03517771 -0.63989535] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04289851  0.22583118 -0.04797562 -0.35849462] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04741514  0.42160116 -0.05514551 -0.66591113] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05584716  0.22728782 -0.06846374 -0.39108876] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06039292  0.03320093 -0.07628551 -0.12075308] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06105694  0.22932825 -0.07870057 -0.43649465] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0656435   0.03540342 -0.08743047 -0.16962187] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06635157  0.23166093 -0.0908229  -0.48855551] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07098479  0.427939   -0.10059401 -0.8084244 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07954357  0.23432884 -0.1167625  -0.54900209] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08423014  0.04102396 -0.12774254 -0.29526931] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08505062 -0.15206738 -0.13364793 -0.04544652] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08200928  0.04469264 -0.13455686 -0.37713023] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08290313 -0.14828727 -0.14209946 -0.12971736] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07993738 -0.34111797 -0.14469381  0.11497654] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07311502 -0.53390222 -0.14239428  0.35873936] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06243698 -0.33707349 -0.13521949  0.02476181] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05569551 -0.14029759 -0.13472426 -0.30734121] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05288956  0.0564611  -0.14087108 -0.63929404] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05401878 -0.1364449  -0.15365696 -0.39408066] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05128988 -0.3290907  -0.16153858 -0.15351419] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04470807 -0.5215754  -0.16460886  0.08416786] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03427656 -0.71400209 -0.1629255   0.32072946] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01999652 -0.90647432 -0.15651091  0.55792597] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00186703 -0.70954168 -0.14535239  0.22031106] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0123238  -0.90231924 -0.14094617  0.46384483] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03037019 -1.09519746 -0.13166928  0.70899203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05227414 -1.2882737  -0.11748944  0.95750051] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07803961 -1.09178464 -0.09833942  0.63033631] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0998753  -1.28540682 -0.0857327   0.89050117] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12558344 -1.08923285 -0.06792268  0.57214642] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1473681  -0.89322752 -0.05647975  0.25886245] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.16523265 -1.08749958 -0.0513025   0.53320966] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.18698264 -1.2818639  -0.0406383   0.80929623] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.21261992 -1.47640614 -0.02445238  1.0889243 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.24214804 -1.67119731 -0.00267389  1.37383543] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.27557199 -1.47604204  0.02480282  1.08031744] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.30509283 -1.28125618  0.04640916  0.7955198 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.33071795 -1.47698328  0.06231956  1.10243374] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.36025762 -1.28273406  0.08436823  0.82993562] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.3859123  -1.08886044  0.10096695  0.56493442] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.40768951 -1.28524321  0.11226564  0.88764223] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.43339437 -1.4816952   0.13001848  1.21340197] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.46302827 -1.28846839  0.15428652  0.96412598] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.48879764 -1.09571789  0.17356904  0.72361512] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.510712   -1.29276115  0.18804134  1.06551252] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.53656722 -1.48980588  0.20935159  1.41082836] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.00884173 -0.00032258  0.00474656  0.02903316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00884818 -0.19551228  0.00532722  0.32320989] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01275843 -0.00046659  0.01179142  0.03221171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01276776 -0.19575563  0.01243565  0.3285915 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01668287 -0.39105239  0.01900748  0.62516998] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02450392 -0.19620087  0.03151088  0.33853336] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02842794 -0.39175671  0.03828155  0.64098416] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03626307 -0.58739083  0.05110123  0.94547237] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04801089 -0.39299304  0.07001068  0.66927355] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.05587075 -0.19891079  0.08339615  0.39942969] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05984897 -0.00506477  0.09138475  0.13416128] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05995026 -0.20136881  0.09406797  0.45421839] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06397764 -0.00769411  0.10315234  0.19260595] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06413152  0.18581256  0.10700446 -0.06583815] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06041527 -0.01066787  0.1056877   0.25859615] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06062863 -0.20712742  0.11085962  0.58265555] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06477117 -0.01371893  0.12251273  0.32684973] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06504555  0.17946517  0.12904972  0.07517424] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06145625  0.37252359  0.13055321 -0.17416997] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05400578  0.5655589   0.12706981 -0.42298514] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0426946   0.36888737  0.11861011 -0.09309827] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03531685  0.5621272   0.11674814 -0.34613182] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02407431  0.75541171  0.1098255  -0.59983892] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00896607  0.94883968  0.09782873 -0.85600902] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01001072  1.1425022   0.08070855 -1.11639803] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03286076  1.33647747  0.05838059 -1.38271124] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05959031  1.14067776  0.03072636 -1.07235791] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08240387  1.33538035  0.0092792  -1.35524181] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10911147  1.14014319 -0.01782563 -1.05967062] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13191434  0.94526183 -0.03901905 -0.77263549] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.15081957  0.75069785 -0.05447176 -0.49248034] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.16583353  0.9465441  -0.06432136 -0.80182077] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.18476441  1.14248642 -0.08035778 -1.1140239 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.20761414  0.94850623 -0.10263826 -0.84759233] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.22658427  1.1448672  -0.1195901  -1.17070641] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.24948161  0.95148592 -0.14300423 -0.91778204] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.26851133  0.75855662 -0.16135987 -0.67324223] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.28368246  0.56600148 -0.17482472 -0.43539573] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.29500249  0.37372945 -0.18353263 -0.20252093] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.30247708  0.57093689 -0.18758305 -0.54702015] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.31389582  0.37887677 -0.19852345 -0.31881026] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.32147335  0.57619049 -0.20489966 -0.6669509 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.0115652  -0.0263471   0.01731067  0.02381651] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01209214 -0.22171297  0.017787    0.32191049] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0165264  -0.41708364  0.02422521  0.62014923] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02486807 -0.6125354   0.03662819  0.92036236] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03711878 -0.41792711  0.05503544  0.63941201] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04547732 -0.61377146  0.06782368  0.948906  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05775275 -0.41962493  0.0868018   0.67828064] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06614525 -0.61583869  0.10036741  0.99698088] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07846202 -0.4221916   0.12030703  0.73743056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08690585 -0.61875149  0.13505564  1.06542425] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09928088 -0.42565001  0.15636413  0.81799542] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10779388 -0.62252707  0.17272403  1.15549211] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12024443 -0.43002544  0.19583388  0.92156498] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01799624  0.04719119  0.03641108 -0.04246029] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01894006  0.24177262  0.03556187 -0.32343648] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02377552  0.04616281  0.02909315 -0.01975429] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02469877 -0.14936403  0.02869806  0.28196403] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02171149  0.04533707  0.03433734 -0.00153129] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02261823  0.23995018  0.03430671 -0.28318568] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02741724  0.43456644  0.028643   -0.5648541 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03610857  0.23905458  0.01734592 -0.26328671] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.04088966 0.04368938 0.01208018 0.03481644] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04176344 -0.1516037   0.01277651  0.33128617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03873137 -0.34690516  0.01940224  0.62797065] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03179327 -0.1520593   0.03196165  0.34146077] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02875208 -0.34762107  0.03879086  0.64404873] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02179966 -0.54326156  0.05167184  0.94869035] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01093443 -0.73903968  0.07064565  1.25715033] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00384636 -0.93499115  0.09578865  1.57109692] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02254619 -0.74113399  0.12721059  1.30976303] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03736887 -0.93761698  0.15340585  1.63940516] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05612121 -0.74458985  0.18619396  1.39818936] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.01379022 -0.03277814  0.04499982 -0.01379404] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01313465 -0.22851559  0.04472394  0.29274039] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00856434 -0.42424573  0.05057874  0.59918647] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 7.94282844e-05 -2.29866561e-01  6.25624734e-02  3.22854400e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0045179  -0.03568869  0.06901956  0.05053879] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00523168  0.1583792   0.07003034 -0.2195943 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00206409 -0.03767028  0.06563845  0.0943312 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0028175  -0.23366862  0.06752508  0.40697945] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00749087 -0.03956589  0.07566466  0.13632617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00828219 -0.23568547  0.07839119  0.45188803] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0129959  -0.04175459  0.08742895  0.18490846] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01383099 -0.23801162  0.09112712  0.50384098] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01859122 -0.43429177  0.10120394  0.82379391] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02727706 -0.24068905  0.11767982  0.56457895] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03209084 -0.43724839  0.12897139  0.89189684] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04083581 -0.24408962  0.14680933  0.64237759] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0457176  -0.05128599  0.15965688  0.39929025] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04674332  0.14125361  0.16764269  0.16089391] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04391825 -0.05582209  0.17086057  0.50141963] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04503469 -0.25288844  0.18088896  0.84270382] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05009246 -0.06063513  0.19774304  0.61192083] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04289833  0.04694012 -0.03256108  0.01746112] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04383713  0.24251354 -0.03221186 -0.28531478] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04868741  0.04786545 -0.03791815 -0.00296275] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04964471  0.24351011 -0.03797741 -0.30736406] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05451492  0.43915205 -0.04412469 -0.61177805] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06329796  0.63486203 -0.05636025 -0.91802572] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0759952   0.83069877 -0.07472077 -1.22787555] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09260917  1.02669862 -0.09927828 -1.54300244] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11314315  0.83290021 -0.13013832 -1.28287612] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[ 0.12980115  0.63965339 -0.15579585 -1.03360981] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.14259422  0.83646525 -0.17646804 -1.37087093] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.15932352  1.03330005 -0.20388546 -1.71315059] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.00682264  0.04565615 -0.03053603 -0.02401705] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00590952  0.2412024  -0.03101637 -0.32617588] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00108547  0.43675191 -0.03753989 -0.62847642] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00764957  0.24217342 -0.05010942 -0.34784845] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01249304  0.43797096 -0.05706639 -0.65590211] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02125246  0.63383899 -0.07018443 -0.9659943 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03392924  0.43972629 -0.08950431 -0.69615937] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04272376  0.24595214 -0.1034275  -0.43294076] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04764281  0.44237476 -0.11208632 -0.75635371] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0564903   0.24896149 -0.12721339 -0.50093875] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06146953  0.05584073 -0.13723217 -0.25089839] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06258635 -0.13708195 -0.14225013 -0.00445478] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05984471  0.05976318 -0.14233923 -0.33842075] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06103997 -0.13307678 -0.14910764 -0.09379271] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05837843  0.0638328  -0.1509835  -0.4295551 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05965509  0.26073429 -0.1595746  -0.765768  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06486978  0.06812715 -0.17488996 -0.52724399] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06623232 -0.12415867 -0.18543484 -0.29437097] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06374915 -0.31621963 -0.19132226 -0.06542447] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05742475 -0.11894306 -0.19263075 -0.41184617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05504589 -0.31088718 -0.20086767 -0.18553913] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04882815 -0.11354288 -0.20457845 -0.53426259] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.02860197 -0.03702937 -0.0225314   0.01422864] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02786138  0.15840834 -0.02224683 -0.28547719] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03102955  0.35384039 -0.02795637 -0.58509276] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03810636  0.54934256 -0.03965823 -0.88644952] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04909321  0.35478075 -0.05738722 -0.60649274] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05618882  0.16050619 -0.06951707 -0.3324232 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05939895  0.35654521 -0.07616554 -0.64619304] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06652985  0.55264119 -0.0890894  -0.96185526] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07758268  0.74884012 -0.1083265  -1.28114416] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09255948  0.94516247 -0.13394938 -1.60568729] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11146273  1.14159029 -0.16606313 -1.93695313] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13429453  1.33805244 -0.20480219 -2.27619079] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.0194957   0.02889147 -0.00917313  0.02918042] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01891787  0.22414376 -0.00858953 -0.26638255] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01443499  0.02914545 -0.01391718  0.02357883] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01385208 -0.16577418 -0.0134456   0.31183845] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01716757 -0.36070202 -0.00720883  0.60025089] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02438161 -0.55572238  0.00479619  0.89065446] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03549605 -0.36066583  0.02260928  0.59948307] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04270937 -0.55609669  0.03459894  0.89920099] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0538313  -0.75167003  0.05258296  1.20255549] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0688647  -0.55726595  0.07663407  0.92680468] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08001002 -0.7533344   0.09517016  1.24255255] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09507671 -0.559554    0.12002121  0.98113512] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10626779 -0.3662272   0.13964391  0.72843266] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11359234 -0.17328347  0.15421257  0.48275636] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.117058   -0.37020727  0.16386769  0.81979566] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12446215 -0.17766156  0.18026361  0.58280797] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12801538 -0.37479005  0.19191977  0.92642038] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.04532058  0.01991093  0.0138763   0.0361318 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04492236  0.21483117  0.01459894 -0.25214091] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04062574  0.01950383  0.00955612  0.04511081] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04023566 -0.17575384  0.01045834  0.3407934 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04375074 -0.37102302  0.0172742   0.63675584] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0511712  -0.17614618  0.03000932  0.34956256] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05469412  0.01853641  0.03700057  0.06649158] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05432339  0.21310888  0.0383304  -0.21429168] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05006122  0.01746049  0.03404457  0.09023172] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04971201 -0.17813248  0.0358492   0.39345834] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05327466 -0.37374432  0.04371837  0.69722499] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06074954 -0.56944438  0.05766287  1.0033437 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07213843 -0.76528727  0.07772975  1.31356328] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08744418 -0.5712305   0.10400101  1.04618657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09886879 -0.37763116  0.12492474  0.78787786] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10642141 -0.18442625  0.1406823   0.53696136] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11010993  0.00846657  0.15142153  0.29170495] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1099406  -0.18845357  0.15725563  0.62805331] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11370967  0.0041647   0.16981669  0.38873388] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11362638 -0.19290934  0.17759137  0.7297793 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11748457 -0.38998318  0.19218696  1.07267856] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.00048955  0.03652331 -0.02739991  0.0203426 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00122002  0.23202727 -0.02699306 -0.28085783] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00586056  0.42752365 -0.03261021 -0.58193058] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01441104  0.62308697 -0.04424883 -0.88470529] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02687278  0.81878089 -0.06194293 -1.19096376] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04324839  0.62451383 -0.08576221 -0.91832161] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05573867  0.82068392 -0.10412864 -1.23667846] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07215235  0.62704255 -0.12886221 -0.97834821] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0846932   0.82363463 -0.14842917 -1.30857207] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10116589  0.63067197 -0.17460061 -1.06578875] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11377933  0.43823587 -0.19591639 -0.83259546] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.02725499 -0.00100135 -0.02609007 -0.00078527] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02727502  0.19448487 -0.02610577 -0.30158441] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02338532 -0.00025546 -0.03213746 -0.01724767] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.02339043  0.19531228 -0.03248241 -0.3198946 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01948419  0.39088142 -0.0388803  -0.62264171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01166656  0.19632331 -0.05133314 -0.34245303] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00774009  0.00196783 -0.0581822  -0.06638915] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00770073 -0.19227378 -0.05950998  0.20738433] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01154621 -0.38649651 -0.0553623   0.48071709] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01927614 -0.58079505 -0.04574795  0.75545018] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03089204 -0.38507333 -0.03063895  0.44872956] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03859351 -0.18953169 -0.02166436  0.14654832] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04238414 -0.38433681 -0.01873339  0.4323186 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05007088 -0.18895469 -0.01008702  0.13378955] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05384997  0.00631029 -0.00741123 -0.16205855] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05372377  0.20153755 -0.0106524  -0.45707028] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04969301  0.0065678  -0.01979381 -0.167764  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04956166 -0.18826531 -0.02314909  0.11860942] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05332697  0.00718053 -0.0207769  -0.18128602] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05318335 -0.18763804 -0.02440262  0.10477081] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05693612  0.00782496 -0.0223072  -0.19551007] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05677962  0.20325878 -0.0262174  -0.49514563] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05271444  0.39874045 -0.03612032 -0.79597441] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04473963  0.59433898 -0.0520398  -1.0997979 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03285285  0.39993912 -0.07403576 -0.82388566] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02485407  0.20590377 -0.09051348 -0.55537658] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02073599  0.01216146 -0.10162101 -0.29252739] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02049276 -0.18137584 -0.10747156 -0.0335443 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02412028  0.01511003 -0.10814244 -0.35810852] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02381808 -0.17832169 -0.11530461 -0.10138784] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02738451 -0.37161856 -0.11733237  0.15280769] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03481689 -0.17502909 -0.11427621 -0.17446566] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03831747 -0.36834585 -0.11776553  0.08009401] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04568438 -0.56159997 -0.11616365  0.33342428] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05691638 -0.75489349 -0.10949516  0.58733518] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07201425 -0.55842218 -0.09774846  0.26226581] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0831827  -0.75202282 -0.09250314  0.52258821] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09822315 -0.94572937 -0.08205138  0.78474863] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11713774 -0.7495816  -0.06635641  0.4674208 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13212937 -0.55358798 -0.05700799  0.1545829 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.14320113 -0.74784931 -0.05391633  0.42875025] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.15815812 -0.94216789 -0.04534133  0.70396085] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17700148 -1.13663315 -0.03126211  0.98203317] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.19973414 -0.94110654 -0.01162145  0.67969699] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.21855627 -0.7458251   0.00197249  0.38337798] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.23347277 -0.55073121  0.00964005  0.09131763] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.2444874  -0.74599     0.01146641  0.38702633] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.2594072  -0.94127283  0.01920693  0.68330237] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.27823265 -0.74642277  0.03287298  0.39672769] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.29316111 -0.94199533  0.04080753  0.69959094] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.31200102 -0.74746218  0.05479935  0.42002823] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.32695026 -0.55315778  0.06319992  0.14511209] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.33801341 -0.7491252   0.06610216  0.45704478] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.35299592 -0.94511637  0.07524305  0.76980935] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.37189825 -0.7511062   0.09063924  0.5017181 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.38692037 -0.94738105  0.1006736   0.82153329] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.40586799 -0.75377009  0.11710427  0.56213539] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.42094339 -0.56046933  0.12834698  0.30852011] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.43215278 -0.75716415  0.13451738  0.63876606] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.44729606 -0.95387981  0.1472927   0.97060286] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.46637366 -1.15063875  0.16670476  1.30569317] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.48938643 -0.95797571  0.19281862  1.06948947] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04354492  0.01048442 -0.04022855  0.00303526] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0437546   0.20615953 -0.04016784 -0.30206394] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04787779  0.40183026 -0.04620912 -0.60713946] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0559144   0.20738382 -0.05835191 -0.32936179] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06006208  0.4032858  -0.06493915 -0.63986036] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06812779  0.59925016 -0.07773635 -0.952266  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0801128   0.79532725 -0.09678167 -1.2683249 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09601934  0.60156537 -0.12214817 -1.00745161] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10805065  0.40826715 -0.1422972  -0.75548696] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11621599  0.21536343 -0.15740694 -0.51075028] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.12052326  0.02276821 -0.16762195 -0.27151579] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12097862  0.21983596 -0.17305226 -0.61202475] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12537534  0.02750092 -0.18529276 -0.37845513] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.12592536 -0.16457278 -0.19286186 -0.14944123] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12263391  0.03271252 -0.19585069 -0.49623314] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12328816 -0.15918693 -0.20577535 -0.27109962] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.04741338  0.04254362 -0.03167398 -0.00695713] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04656251  0.23810517 -0.03181312 -0.30946291] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04180041  0.04345059 -0.03800238 -0.02698027] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0409314  -0.15110635 -0.03854198  0.25347443] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04395352  0.04454413 -0.03347249 -0.05111188] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04306264  0.24012965 -0.03449473 -0.3541649 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03826005  0.04551473 -0.04157803 -0.07255562] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03734975 -0.14898724 -0.04302914  0.20672478] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0403295   0.04672276 -0.03889465 -0.09921516] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03939504 -0.1478208  -0.04087895  0.18094733] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04235146  0.04786154 -0.03726    -0.12434603] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04139423 -0.14670734 -0.03974692  0.15635283] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04432837 -0.34123834 -0.03661987  0.43623634] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05115314 -0.14561766 -0.02789514  0.13223821] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05406549 -0.34032915 -0.02525037  0.41599172] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06087208 -0.14485861 -0.01693054  0.11545655] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06376925 -0.33973393 -0.01462141  0.4027503 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07056393 -0.14440769 -0.0065664   0.10549371] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07345208 -0.33943492 -0.00445653  0.39609775] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08024078 -0.14425003  0.00346543  0.10201309] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.08312578 -0.33942147  0.00550569  0.39578733] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08991421 -0.14437807  0.01342143  0.10484533] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09280177  0.050549    0.01551834 -0.18357315] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09179079 -0.14479152  0.01184688  0.11396456] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09468662  0.05015869  0.01412617 -0.17495734] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09368345 -0.14516255  0.01062702  0.12214824] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0965867   0.04980555  0.01306999 -0.16716311] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09559059  0.244738    0.00972672 -0.45569434] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09069583  0.04947989  0.00061284 -0.15996141] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08970623  0.24459306 -0.00258639 -0.45245094] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08481437  0.43975149 -0.01163541 -0.74594801] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07601934  0.24479201 -0.02655437 -0.45694931] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0711235   0.05005534 -0.03569336 -0.17275336] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07012239 -0.14453806 -0.03914842  0.1084592 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07301315 -0.33907778 -0.03697924  0.3885385 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07979471 -0.14345098 -0.02920847  0.08442951] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08266373  0.05207722 -0.02751988 -0.21732376] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08162218 -0.14264074 -0.03186635  0.06655275] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.084475    0.05292325 -0.0305353  -0.23601142] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08341653  0.24846785 -0.03525553 -0.53816758] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07844718  0.05385883 -0.04601888 -0.25679842] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07737    -0.14057693 -0.05115485  0.02102167] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08018154 -0.33492936 -0.05073441  0.297136  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08688013 -0.13912227 -0.04479169 -0.01110624] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08966257  0.05661248 -0.04501382 -0.31757824] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08853032 -0.13784039 -0.05136538 -0.03942398] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09128713  0.05797907 -0.05215386 -0.34786055] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09012755  0.25380253 -0.05911107 -0.65652264] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0850515   0.05955111 -0.07224153 -0.38302295] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08386048 -0.13447476 -0.07990198 -0.11396379] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08654997  0.06169578 -0.08218126 -0.43074698] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08531606  0.25787944 -0.0907962  -0.7481634 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08015847  0.06411953 -0.10575947 -0.48537815] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07887608 -0.12936346 -0.11546703 -0.22781059] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08146334 -0.32266222 -0.12002324  0.02633488] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08791659 -0.12604178 -0.11949655 -0.30167617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09043742  0.07056263 -0.12553007 -0.62952818] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08902617  0.26719208 -0.13812063 -0.95896056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08368233  0.07417009 -0.15729984 -0.71266341] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08219893  0.27107973 -0.17155311 -1.05043694] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07677733  0.4680103  -0.19256185 -1.39168622] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.03449463  0.03779753 -0.00097462 -0.02870527] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03525058  0.23293345 -0.00154873 -0.32169555] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03990925  0.03783359 -0.00798264 -0.02950143] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04066592  0.2330691  -0.00857267 -0.32469224] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0453273   0.03807025 -0.01506651 -0.03472505] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04608871 -0.15683244 -0.01576101  0.25316641] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04295206 -0.35172583 -0.01069769  0.54083665] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 3.59175406e-02 -5.46695796e-01  1.19047537e-04  8.30129803e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02498362 -0.35157547  0.01672164  0.53748432] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01795212 -0.54692848  0.02747133  0.8353888 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00701355 -0.35219236  0.04417911  0.55147039] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-3.03016041e-05 -1.57717831e-01  5.52085139e-02  2.73027662e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00318466  0.03657468  0.06066907 -0.00174401] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00245316 -0.15936251  0.06063419  0.30944682] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00564041 -0.35529361  0.06682312  0.62061891] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01274629 -0.16116539  0.0792355   0.34970785] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01596959  0.0327455   0.08622966  0.08302496] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01531468 -0.1634999   0.08789016  0.40161947] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01858468 -0.35975139  0.09592255  0.7206669 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02577971 -0.5560603   0.11033588  1.04193461] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03690092 -0.75246095  0.13117458  1.36711714] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05195014 -0.55920215  0.15851692  1.11817421] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06313418 -0.36647451  0.1808804   0.87911629] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07046367 -0.56353207  0.19846273  1.22276967] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-3.17626070e-02 -3.32046083e-02 -2.18778795e-02 -8.19178260e-05] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0324267  -0.22800608 -0.02187952  0.28561878] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03698682 -0.03257903 -0.01616714 -0.0138837 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0376384  -0.22746543 -0.01644482  0.27365477] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04218771 -0.42234893 -0.01097172  0.56110593] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-5.06346886e-02 -6.17315197e-01  2.50397712e-04  8.50312109e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06298099 -0.42219666  0.01725664  0.55770793] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07142493 -0.61755656  0.0284108   0.85577734] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08377606 -0.42283304  0.04552635  0.57216159] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09223272 -0.6185628   0.05696958  0.87883216] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10460397 -0.4242593   0.07454622  0.60458974] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11308916 -0.62034026  0.08663802  0.9197906 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12549597 -0.4264894   0.10503383  0.65554466] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13402575 -0.62290471  0.11814472  0.97936716] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.14648385 -0.42954767  0.13773206  0.72600723] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1550748  -0.62627806  0.15225221  1.05867356] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.16760036 -0.82305291  0.17342568  1.39501082] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.18406142 -1.01985607  0.2013259   1.73651926] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.01959417  0.02013862 -0.02421779  0.00428634] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01999694 -0.17462779 -0.02413206  0.28923095] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01650439  0.02082982 -0.01834745 -0.01096424] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01692098  0.21621003 -0.01856673 -0.30937907] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02124518  0.41159153 -0.02475431 -0.60785911] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02947701  0.60705067 -0.03691149 -0.90823501] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04161803  0.41244729 -0.05507619 -0.6273783 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04986697  0.60829296 -0.06762376 -0.93688557] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06203283  0.41414487 -0.08636147 -0.6661953 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07031573  0.22032349 -0.09968538 -0.40190546] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0747222   0.02674632 -0.10772349 -0.14224021] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07525712 -0.16668116 -0.11056829  0.1146097 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0719235  -0.36005943 -0.1082761   0.37046494] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06472231 -0.55348983 -0.1008668   0.62714041] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.05365252 -0.35711543 -0.08832399  0.30447141] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04651021 -0.55087493 -0.08223456  0.56804423] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03549271 -0.35470163 -0.07087368  0.25062898] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02839868 -0.15864293 -0.0658611  -0.0635402 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02522582 -0.3527618  -0.0671319   0.20765765] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01817058 -0.15674733 -0.06297875 -0.10542372] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01503564  0.03921796 -0.06508722 -0.41729242] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01581999  0.23519901 -0.07343307 -0.72976367] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02052397  0.04116472 -0.08802835 -0.46106654] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02134727 -0.15260994 -0.09724968 -0.19737692] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01829507 -0.3462161  -0.10119721  0.06311377] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01137075 -0.14979979 -0.09993494 -0.2597046 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00837475  0.04659625 -0.10512903 -0.58216081] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00930668 -0.14690787 -0.11677225 -0.32435817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00636852  0.04966638 -0.12325941 -0.65146458] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00736185  0.24626984 -0.1362887  -0.98027895] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01228724  0.05321175 -0.15589428 -0.73332208] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01335148 -0.13945211 -0.17056072 -0.49347841] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01056244 -0.33180997 -0.18043029 -0.25903045] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00392624 -0.13463254 -0.1856109  -0.60275444] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00123359 -0.32673996 -0.19766599 -0.37379967] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00530121 -0.51858576 -0.20514198 -0.14937062] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01567293 -0.71026995 -0.20812939  0.07223722] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02987833 -0.51286633 -0.20668465 -0.27822723] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.02349958  0.00238849 -0.02493448 -0.02828558] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02354735 -0.19236717 -0.02550019  0.25642713] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0197      0.0031094  -0.02037165 -0.04418864] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01976219  0.19851745 -0.02125542 -0.34322875] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02373254  0.00370424 -0.02812    -0.05732357] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02380663  0.19921786 -0.02926647 -0.35874425] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02779098  0.39474337 -0.03644135 -0.66051003] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03568585  0.59035298 -0.04965155 -0.96444102] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04749291  0.39593194 -0.06894037 -0.68776042] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05541155  0.20183117 -0.08269558 -0.4175533 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05944817  0.3980217  -0.09104665 -0.73511822] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06740861  0.20426755 -0.10574901 -0.4724215 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07149396  0.40071169 -0.11519744 -0.79647394] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07950819  0.59721006 -0.13112692 -1.12306114] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09145239  0.79378429 -0.15358814 -1.45383209] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10732808  0.60084549 -0.18266479 -1.21280824] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11934499  0.79779305 -0.20692095 -1.55671999] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.0006844  -0.04820719  0.00828846 -0.00510955] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-2.79747639e-04 -2.43447027e-01  8.18627218e-03  2.90176920e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00514869 -0.43868475  0.01398981  0.58543042] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01392238 -0.24376152  0.02569842  0.29718706] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01879761 -0.4392402   0.03164216  0.59786277] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02758242 -0.24457497  0.04359942  0.31531233] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03247392 -0.44028997  0.04990566  0.62142002] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04127972 -0.24589915  0.06233406  0.34486301] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0461977  -0.44184989  0.06923132  0.65653261] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0550347  -0.63786382  0.08236198  0.97018693] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06779197 -0.83398896  0.10176571  1.28756444] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08447175 -1.0302478   0.127517    1.61029715] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10507671 -0.83684202  0.15972295  1.35993243] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12181355 -1.0335648   0.18692159  1.69801972] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.02598321  0.03274033 -0.03947692 -0.02250839] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0253284   0.22840553 -0.03992708 -0.32738072] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02076029  0.4240725  -0.0464747  -0.6323827 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01227884  0.61981097 -0.05912235 -0.93933203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 1.17380618e-04  4.25533712e-01 -7.79089929e-02 -6.65797154e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00862805  0.23157688 -0.09122494 -0.3986265 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01325959  0.42786657 -0.09919747 -0.71861969] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02181692  0.234247   -0.11356986 -0.45873434] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02650186  0.04089826 -0.12274455 -0.20389741] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02731983 -0.15227406 -0.12682249  0.04768468] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02427435 -0.34537088 -0.1258688   0.29781776] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01736693 -0.14870044 -0.11991245 -0.03176189] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01439292  0.0479188  -0.12054768 -0.35974273] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0153513  -0.14530175 -0.12774254 -0.10737048] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01244526  0.05139733 -0.12988995 -0.4374693 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01347321  0.24809557 -0.13863933 -0.76811158] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01843512  0.44482597 -0.15400157 -1.10100524] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02733164  0.64160158 -0.17602167 -1.43777315] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04016367  0.83840138 -0.20477713 -1.77989501] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.02154426 -0.01947745 -0.00420128  0.00713848] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02193381 -0.2145389  -0.00405851  0.2984929 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02622459 -0.01935933  0.00191135  0.00453276] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02661177  0.17573516  0.002002   -0.2875465 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02309707 -0.01941529 -0.00374893  0.00576717] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02348538 -0.21448327 -0.00363358  0.29726491] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02777504 -0.40955324  0.00231171  0.58879966] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03596611 -0.60470748  0.01408771  0.88220989] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04806026 -0.80001791  0.03173191  1.17928811] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06406062 -0.60532203  0.05531767  0.89671892] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07616706 -0.41099189  0.07325205  0.62192469] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08438689 -0.21696515  0.08569054  0.35318192] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0887262  -0.41319444  0.09275418  0.67160805] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.09699009 -0.21947594  0.10618634  0.40951002] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1013796  -0.41593052  0.11437654  0.73369225] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10969822 -0.22255899  0.12905038  0.47908358] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1141494  -0.41924406  0.13863206  0.80949349] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12253428 -0.22626622  0.15482193  0.56343128] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1270596  -0.03361677  0.16609055  0.32324954] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12773194  0.15879879  0.17255554  0.08720448] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12455596 -0.03832253  0.17429963  0.4289753 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12532241  0.15395804  0.18287914  0.19590953] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12224325 -0.04324453  0.18679733  0.54024788] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12310814  0.14882852  0.19760229  0.31174983] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12013157  0.34066738  0.20383728  0.08730904] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.11331822  0.1432957   0.20558346  0.4367513 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.01535953 -0.04301794  0.04995706  0.01589617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01621989  0.15135329  0.05027499 -0.26061566] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01319282  0.34572286  0.04506268 -0.537027  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00627837  0.14999725  0.03432214 -0.23049268] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00327842 -0.04559791  0.02971228  0.07281599] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00419038 -0.24113295  0.0311686   0.37472322] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00901304 -0.04646728  0.03866307  0.09202862] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00994239 -0.24212145  0.04050364  0.39665466] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01478481 -0.43779396  0.04843673  0.70182751] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02354069 -0.63355263  0.06247328  1.00935597] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03621175 -0.82945028  0.0826604   1.3209844 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05280075 -0.63546461  0.10908009  1.05527391] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06551004 -0.44194419  0.13018557  0.7987245 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07434893 -0.63858888  0.14616006  1.12936201] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0871207  -0.44565157  0.1687473   0.88586002] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09603374 -0.25317269  0.1864645   0.65061927] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10109719 -0.45033489  0.19947688  0.99573953] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.00357628  0.04819533 -0.00881052  0.03326479] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00454018  0.24344251 -0.00814523 -0.26218487] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00940903  0.04843777 -0.01338892  0.02791787] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01037779  0.24374914 -0.01283057 -0.26895916] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01525277  0.04881262 -0.01820975  0.01964947] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01622902  0.24419092 -0.01781676 -0.27872277] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02111284  0.04932761 -0.02339121  0.00828796] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02209939 -0.1454512  -0.02322546  0.29349987] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01919037  0.04999405 -0.01735546 -0.00641662] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02019025  0.24536055 -0.01748379 -0.30452448] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02509746  0.05049207 -0.02357428 -0.0174064 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0261073   0.24594403 -0.02392241 -0.31743308] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03102618  0.4413984  -0.03027107 -0.61756326] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03985415  0.2467121  -0.04262234 -0.33456594] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04478839  0.44241393 -0.04931365 -0.64037925] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05363667  0.63818743 -0.06212124 -0.94817485] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06640042  0.44395442 -0.08108474 -0.67563923] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07527951  0.25004727 -0.09459752 -0.40954693] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08028046  0.44637418 -0.10278846 -0.73049084] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08920794  0.2528117  -0.11739828 -0.4718462 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09426417  0.44937912 -0.1268352  -0.79910366] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10325176  0.25620409 -0.14281727 -0.54885837] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10837584  0.06334666 -0.15379444 -0.30436315] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10964277 -0.12928734 -0.1598817  -0.06386018] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10705702  0.06772268 -0.16115891 -0.40241147] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10841148 -0.12479049 -0.16920714 -0.16455949] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10591567  0.07229867 -0.17249833 -0.50548102] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10736164  0.26937808 -0.18260795 -0.84717623] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1127492   0.07715383 -0.19955147 -0.61702276] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.0005186  -0.02493132  0.01049758 -0.02310127] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 1.99783515e-05 -2.20202234e-01  1.00355520e-02  2.72875185e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00438407 -0.02522491  0.01549306 -0.01662568] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00488856  0.16967147  0.01516054 -0.30438038] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00149514  0.36457412  0.00907293 -0.59224373] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00579635  0.55956789 -0.00277194 -0.88205494] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01698771  0.3644837  -0.02041304 -0.59024473] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02427738  0.16965342 -0.03221793 -0.30406121] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02767045  0.36521935 -0.03829916 -0.60672831] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03497483  0.17065325 -0.05043372 -0.32635038] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0383879  -0.02371573 -0.05696073 -0.04998885] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03791359 -0.21797661 -0.05796051  0.22419249] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03355405 -0.41222433 -0.05347666  0.49804394] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02530957 -0.21639078 -0.04351578  0.1889989 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02098175 -0.02067416 -0.0397358  -0.11708773] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02056827  0.17499394 -0.04207756 -0.42203745] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02406815  0.37068598 -0.05051831 -0.72768285] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03148187  0.17629749 -0.06507196 -0.45131817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03500782  0.37227648 -0.07409833 -0.76378214] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04245335  0.178249   -0.08937397 -0.49530452] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04601833 -0.01550644 -0.09928006 -0.23207232] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0457082  -0.20907996 -0.10392151  0.02771776] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[ 0.0415266  -0.40256996 -0.10336715  0.28588898] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0334752  -0.20613737 -0.09764937 -0.03752304] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02935245 -0.39973326 -0.09839983  0.22282408] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02135779 -0.59332115 -0.09394335  0.48291835] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00949136 -0.78700037 -0.08428498  0.74457742] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00624865 -0.98086427 -0.06939343  1.00959052] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02586593 -0.7848882  -0.04920162  0.69594861] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04156369 -0.9792945  -0.03528265  0.97274573] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06114958 -0.78371733 -0.01582774  0.66919161] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07682393 -0.58837892 -0.00244391  0.37156754] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08859151 -0.78346607  0.00498745  0.66347888] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10426083 -0.58841386  0.01825702  0.3723705 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11602911 -0.39355596  0.02570443  0.08549964] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12390023 -0.58903676  0.02741443  0.38618024] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13568096 -0.39431449  0.03513803  0.10226542] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.14356725 -0.58992193  0.03718334  0.40582392] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.15536569 -0.39534646  0.04529982  0.12509183] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.16327262 -0.59108712  0.04780165  0.43171522] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17509436 -0.78685218  0.05643596  0.73907572] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.19083141 -0.59255306  0.07121747  0.46467441] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.20268247 -0.39850593  0.08051096  0.19526178] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.21065259 -0.2046224   0.0844162  -0.07097678] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.21474503 -0.40084678  0.08299666  0.24710056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.22276197 -0.2070022   0.08793867 -0.0182924 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.22690201 -0.01324425  0.08757282 -0.2819856 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.2271669  -0.20949909  0.08193311  0.03698207] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.23135688 -0.40569456  0.08267275  0.35434943] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.23947077 -0.60188875  0.08975974  0.6719137 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.25150855 -0.40812163  0.10319802  0.40878702] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.25967098 -0.60454379  0.11137376  0.73213915] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.27176186 -0.80101412  0.12601654  1.05769499] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.28778214 -0.60776654  0.14717044  0.80707307] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.29993747 -0.80456596  0.1633119   1.14219316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.31602879 -0.61191042  0.18615576  0.90485555] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.328267   -0.41973082  0.20425288  0.67598082] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.03437795  0.03544802  0.01409543  0.02216382] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03366899 -0.15987321  0.0145387   0.31926047] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03686645  0.03503869  0.02092391  0.03119774] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03616568 -0.16037697  0.02154787  0.33040826] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03937322 -0.35579892  0.02815603  0.62980772] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0464892  -0.55130222  0.04075219  0.93122322] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05751524 -0.35675323  0.05937665  0.65161985] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06465031 -0.1625063   0.07240905  0.3782094 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06790043  0.03151662  0.07997324  0.10920731] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0672701   0.22540691  0.08215738 -0.15721149] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06276196  0.02921065  0.07901315  0.16021734] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06217775 -0.16694835  0.0822175   0.47674355] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06551671  0.02692232  0.09175237  0.2110646 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06497827 -0.16938356  0.09597366  0.53122219] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06836594  0.0242667   0.10659811  0.27025653] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06788061  0.21771877  0.11200324  0.01300604] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06352623  0.41107111  0.11226336 -0.24234506] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05530481  0.6044254   0.10741646 -0.49761462] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0432163   0.40796589  0.09746417 -0.17310408] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03505698  0.6015676   0.09400208 -0.43351931] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02302563  0.40524922  0.0853317  -0.11274591] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01492065  0.59905137  0.08307678 -0.37733432] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00293962  0.40285383  0.07553009 -0.05965652] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00511746  0.59681614  0.07433696 -0.32758575] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01705378  0.40071893  0.06778525 -0.01241734] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02506816  0.59480657  0.0675369  -0.2829666 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03696429  0.78890348  0.06187757 -0.55360746] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05274236  0.98310443  0.05080542 -0.826171  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07240445  1.17749614  0.034282   -1.10245196] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09595437  0.98194038  0.01223296 -0.79921373] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11559318  1.17689239 -0.00375131 -1.08802348] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13913103  1.37206361 -0.02551178 -1.38188113] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1665723   1.17726913 -0.05314941 -1.09728413] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.19011768  1.37304905 -0.07509509 -1.4061579 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.21757866  1.5690185  -0.10321825 -1.72134082] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.24895903  1.76516005 -0.13764506 -2.04427958] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.28426223  1.57169437 -0.17853065 -1.79716578] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.04044251  0.01425846  0.03291671  0.03752696] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04015734 -0.18131968  0.03366725  0.34041101] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04378374  0.01330747  0.04047547  0.05853203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04351759 -0.18237075  0.04164611  0.3637055 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.047165    0.01213533  0.04892022  0.08443978] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0469223  -0.18365249  0.05060902  0.39214697] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05059535  0.01071608  0.05845196  0.1158401 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05038103  0.20495392  0.06076876 -0.15784397] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04628195  0.00901695  0.05761188  0.15337399] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04610161  0.20326869  0.06067936 -0.12059166] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04203623  0.39747115  0.05826753 -0.39353044] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03408681  0.59171994  0.05039692 -0.66728819] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.02225241  0.78610614  0.03705115 -0.94368686] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00653029  0.59050513  0.01817742 -0.63959617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00527981  0.39513452  0.00538549 -0.3412448 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0131825   0.59017944 -0.0014394  -0.63222461] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02498609  0.3950776  -0.0140839  -0.33999533] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03288764  0.20015885 -0.0208838  -0.05178673] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03689082  0.39557392 -0.02191954 -0.35098482] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0448023   0.20077044 -0.02893923 -0.06529362] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04881771  0.3962951  -0.03024511 -0.36696474] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05674361  0.59183348 -0.0375844  -0.66902889] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06858028  0.39725373 -0.05096498 -0.3884126 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07652535  0.59306065 -0.05873323 -0.69671883] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08838657  0.39880026 -0.07266761 -0.42308834] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09636257  0.20477896 -0.08112937 -0.15416993] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10045815  0.01090671 -0.08421277  0.11185604] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10067629  0.20712809 -0.08197565 -0.20616227] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10481885  0.40332078 -0.0860989  -0.52353691] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11288526  0.59954226 -0.09656963 -0.84206067] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.12487611  0.40586171 -0.11341085 -0.58124081] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.13299334  0.60237478 -0.12503566 -0.90738814] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.14504084  0.40914711 -0.14318343 -0.65647413] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.15322378  0.60594137 -0.15631291 -0.990597  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.16534261  0.80277066 -0.17612485 -1.3280104 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.18139802  0.61025368 -0.20268506 -1.09521567] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01485496 -0.02524013  0.00553664  0.03623082] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01435016  0.16980198  0.00626125 -0.25470012] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0177462   0.36483398  0.00116725 -0.54540158] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02504288  0.16969565 -0.00974078 -0.25235111] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02843679 -0.02528587 -0.0147878   0.03724355] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02793107 -0.22019267 -0.01404293  0.32522433] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02352722 -0.02487362 -0.00753845  0.02814615] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02302975  0.17035562 -0.00697552 -0.26690567] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02643686  0.36557643 -0.01231364 -0.56178055] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03374839  0.17062943 -0.02354925 -0.27300233] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03716098  0.36607934 -0.02900929 -0.57301875] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04448257  0.17137589 -0.04046967 -0.28961398] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04791008  0.36705085 -0.04626195 -0.59478094] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0552511   0.17260589 -0.05815757 -0.31702171] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05870322 -0.02164156 -0.064498   -0.04323184] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05827039 -0.21578218 -0.06536264  0.22842432] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05395474 -0.40991213 -0.06079415  0.49979449] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0457565  -0.6041267  -0.05079826  0.77271569] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03367397 -0.40834399 -0.03534395  0.46449219] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02550709 -0.60294914 -0.0260541   0.74582865] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0134481  -0.40747755 -0.01113753  0.44506173] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00529855 -0.21219981 -0.0022363   0.14888892] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00105456 -0.40728966  0.00074148  0.4408655 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00709124 -0.6024221   0.00955879  0.73378207] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01913968 -0.40743351  0.02423443  0.44412279] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02728835 -0.21266269  0.03311689  0.15917657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0315416  -0.01803013  0.03630042 -0.12287761] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03190221  0.17655346  0.03384287 -0.40389073] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02837114 -0.01903171  0.02576505 -0.10073306] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02875177 -0.21451325  0.02375039  0.19996602] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03304203 -0.4099667   0.02774971  0.50004547] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04124137 -0.60546864  0.03775062  0.80134284] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05335074 -0.80108744  0.05377748  1.10565803] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06937249 -0.99687365  0.07589064  1.41471575] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08930996 -0.80276951  0.10418495  1.14668801] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10536535 -0.99908607  0.12711871  1.4701411 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.12534708 -0.80572769  0.15652154  1.21971609] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.14146163 -0.61293054  0.18091586  0.97988531] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.15372024 -0.42063393  0.20051356  0.74904759] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.00326008  0.00163107 -0.01331635  0.00781001] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0032927   0.19694144 -0.01316015 -0.28904446] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00723153  0.39224856 -0.01894104 -0.58584875] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0150765   0.19739695 -0.03065802 -0.2991921 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01902444  0.00272512 -0.03664186 -0.01633361] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01907894 -0.19185271 -0.03696853  0.26456692] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01524189  0.00377686 -0.03167719 -0.03954321] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01531743  0.1993384  -0.03246806 -0.3420499 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0193042   0.39490688 -0.03930905 -0.64479194] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02720233  0.20035415 -0.05220489 -0.36474216] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03120942  0.00601148 -0.05949974 -0.08896657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03132965 -0.18820938 -0.06127907  0.18436658] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02756546 -0.38240344 -0.05759173  0.45710615] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01991739 -0.18651658 -0.04844961  0.14684041] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01618706  0.0092645  -0.0455128  -0.16072522] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01637235 -0.18517733 -0.04872731  0.11725935] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0126688   0.0106077  -0.04638212 -0.19038977] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01288095 -0.18382108 -0.05018992  0.08730829] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00920453 -0.37818902 -0.04844375  0.36374342] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00164075 -0.5725902  -0.04116888  0.6407662 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00981105 -0.37691923 -0.02835356  0.33540821] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01734944 -0.57162643 -0.02164539  0.61901678] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02878196 -0.76643946 -0.00926506  0.90480464] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04411075 -0.96143472  0.00883103  1.19456108] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(1) ACTION\n",
      "[-0.06333945 -0.76642823  0.03272226  0.90465904] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07866801 -0.57176435  0.05081544  0.62243817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0901033  -0.37738741  0.0632642   0.34618242] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.09765105 -0.57334954  0.07018785  0.65812485] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10911804 -0.76937461  0.08335035  0.97205677] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12450553 -0.57546409  0.10279148  0.70667741] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.13601481 -0.77184857  0.11692503  1.02986675] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.15145178 -0.57846012  0.13752236  0.77606242] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.16302099 -0.3854705   0.15304361  0.52961536] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1707304  -0.19279523  0.16363592  0.28879991] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.1745863  -0.00033885  0.16941192  0.05186629] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17459308 -0.1974338   0.17044924  0.39284551] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17854175 -0.00508867  0.17830615  0.15837861] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.17864353  0.18709195  0.18147373 -0.07317726] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.17490169 -0.0101049   0.18001018  0.2708216 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.17510379 -0.20727777  0.18542661  0.61443963] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17924934 -0.4044402   0.19771541  0.9593171 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04421872  0.01010953 -0.03310813  0.02834817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04442091  0.20569025 -0.03254117 -0.27459414] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04853471  0.40126102 -0.03803305 -0.5773604 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05655993  0.59689483 -0.04958026 -0.88177793] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06849783  0.40248011 -0.06721582 -0.60508484] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07654743  0.20835932 -0.07931752 -0.33430775] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08071462  0.01445057 -0.08600367 -0.067654  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08100363  0.21069353 -0.08735675 -0.38618438] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0852175   0.01691328 -0.09508044 -0.12227261] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08555576 -0.17672697 -0.09752589  0.13896427] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08202122  0.01964673 -0.09474661 -0.18282374] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.08241416  0.2159877  -0.09840308 -0.50382813] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08673391  0.4123488  -0.10847964 -0.82582871] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09498089  0.21886434 -0.12499622 -0.56913852] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09935818  0.41549728 -0.13637899 -0.89844004] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10766812  0.61217787 -0.15434779 -1.23069284] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11991168  0.41934081 -0.17896165 -0.99007422] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1282985   0.22700669 -0.19876313 -0.75851498] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.01478954 -0.02865311  0.04578617  0.02160815] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01536261 -0.22440076  0.04621833  0.32837833] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01985062 -0.02996621  0.0527859   0.05062131] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02044995  0.16436067  0.05379833 -0.22495115] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01716273 -0.03148725  0.0492993   0.08420478] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01779248 -0.22727995  0.0509834   0.39202513] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02233808 -0.0329172   0.0588239   0.11584246] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02299642 -0.22883051  0.06114075  0.4264884 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02757303 -0.42476276  0.06967052  0.7378018 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03606829 -0.23066861  0.08442656  0.46783346] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04068166 -0.03683456  0.09378323  0.20290991] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04141835  0.15682977  0.09784142 -0.05877814] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03828175 -0.03954899  0.09666586  0.26310068] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03907273  0.1540698   0.10192787  0.00240388] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03599134  0.34759345  0.10197595 -0.25646004] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02903947  0.15117461  0.09684675  0.06656662] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02601598 -0.04519284  0.09817808  0.38816598] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02691983 -0.24156131  0.1059414   0.71011826] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03175106 -0.43797855  0.12014377  1.03418082] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04051063 -0.24464126  0.14082739  0.78150278] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04540346 -0.05170688  0.15645744  0.53623275] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04643759 -0.24864264  0.1671821   0.87383923] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05141045 -0.44559478  0.18465888  1.21407135] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06032234 -0.2532717   0.20894031  0.98447298] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.04337951 -0.01693661  0.02561982 -0.04943467] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04371824  0.17780879  0.02463113 -0.33392549] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04016206 -0.01765491  0.01795262 -0.03357806] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04051516  0.17720505  0.01728105 -0.32054312] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03697106  0.37207669  0.01087019 -0.60772656] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02952953  0.17680446 -0.00128434 -0.31163975] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02599344 -0.01829917 -0.00751713 -0.01936214] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02635942 -0.21331251 -0.00790438  0.2709396 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03062567 -0.01807867 -0.00248559 -0.02422589] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03098724  0.17707884 -0.0029701  -0.31769201] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02744567 -0.01800068 -0.00932394 -0.02594722] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02780568  0.17725373 -0.00984289 -0.32155731] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02426061 -0.01772667 -0.01627403 -0.03199463] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02461514 -0.21261151 -0.01691393  0.25550956] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02886737 -0.0172522  -0.01180374 -0.04245994] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02921241  0.178037   -0.01265293 -0.33884356] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02565167 -0.01690264 -0.01942981 -0.05017737] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02598973 -0.21174068 -0.02043335  0.23631252] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03022454 -0.01633284 -0.0157071  -0.06274503] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0305512   0.17901075 -0.016962   -0.36034197] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02697098 -0.01586604 -0.02416884 -0.07305543] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0272883   0.17959392 -0.02562995 -0.37326468] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02369642  0.3750704  -0.03309524 -0.67391749] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01619502  0.18042367 -0.04657359 -0.3918355 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01258654 -0.01400745 -0.0544103  -0.11419276] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01286669 -0.20830926 -0.05669416  0.16083991] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01703288 -0.40257567 -0.05347736  0.43511217] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02508439 -0.20673905 -0.04477512  0.12606234] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02921917 -0.0110052  -0.04225387 -0.18040366] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02943928  0.18469512 -0.04586194 -0.48611107] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02574537 -0.00975069 -0.05558417 -0.20822773] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02594039  0.18612019 -0.05974872 -0.51791363] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02221798 -0.00811186 -0.07010699 -0.24464111] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02238022  0.18793775 -0.07499982 -0.55858772] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.01862147  0.38402792 -0.08617157 -0.87392538] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01094091  0.19017662 -0.10365008 -0.60953045] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00713737  0.3865831  -0.11584069 -0.93297762] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 5.94287040e-04  1.93198527e-01 -1.34500239e-01 -6.78825918e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 4.45825758e-03  1.76005825e-04 -1.48076757e-01 -4.31331022e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00446178  0.19705035 -0.15670338 -0.76678648] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00840278  0.00439296 -0.17203911 -0.52722378] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00849064  0.20146478 -0.18258358 -0.86879804] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01251994  0.00923355 -0.19995954 -0.63862595] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.0148445   0.03996057  0.02630323 -0.02739259] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01564371  0.23469563  0.02575538 -0.31166189] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02033762  0.42944136  0.01952214 -0.59611236] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02892645  0.23405172  0.0075999  -0.2973446 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03360749  0.03882226  0.00165301 -0.00227452] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03438393  0.23392047  0.00160751 -0.29443545] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03906234  0.03877564 -0.00428119 -0.00124597] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03983785  0.23395872 -0.00430611 -0.29527658] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04451703  0.4291418  -0.01021165 -0.58931447] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05309986  0.23416432 -0.02199793 -0.29986567] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05778315  0.03936271 -0.02799525 -0.01420083] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05857041  0.23487473 -0.02827926 -0.31558347] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0632679   0.04016677 -0.03459093 -0.03195131] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06407124  0.23576726 -0.03522996 -0.33534423] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06878658  0.43137242 -0.04193684 -0.63892538] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07741403  0.62705324 -0.05471535 -0.94451435] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08995509  0.43270936 -0.07360564 -0.66951307] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09860928  0.23868383 -0.0869959  -0.40088333] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10338296  0.04489666 -0.09501357 -0.13684641] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10428089  0.24124202 -0.0977505  -0.45792804] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10910573  0.04762801 -0.10690906 -0.19758633] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11005829  0.24410378 -0.11086078 -0.52198879] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11494037  0.05070254 -0.12130056 -0.26619415] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11595442  0.24732809 -0.12662444 -0.59454027] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12090098  0.0541846  -0.13851525 -0.34427273] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12198467 -0.13872302 -0.1454007  -0.09827616] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.11921021 -0.33149424 -0.14736623  0.14523212] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.11258033 -0.13460293 -0.14446158 -0.19007277] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10988827 -0.32739452 -0.14826304  0.05377817] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10334038 -0.52011389 -0.14718747  0.29625598] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0929381  -0.71286453 -0.14126236  0.5391387 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07868081 -0.51606888 -0.13047958  0.20549249] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06835943 -0.31934586 -0.12636973 -0.12533594] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06197251 -0.12266135 -0.12887645 -0.45506405] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05951929 -0.31574802 -0.13797773 -0.20562084] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05320433 -0.50865484 -0.14209015  0.04055249] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04303123 -0.3118115  -0.1412791  -0.29337097] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.036795   -0.50466619 -0.14714652 -0.04836916] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02670168 -0.3077743  -0.1481139  -0.38362055] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02054619 -0.11089402 -0.15578631 -0.71909399] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01832831 -0.30355664 -0.17016819 -0.47921596] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01225718 -0.10649254 -0.17975251 -0.82033096] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01012733  0.09057386 -0.19615913 -1.16373096] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.0299059   0.03722117 -0.0217467  -0.01246499] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02916148 -0.15758227 -0.021996    0.27327804] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03231312  0.03784652 -0.01653044 -0.02626053] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03155619  0.23320158 -0.01705565 -0.32411281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02689216  0.03832659 -0.02353791 -0.03685695] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02612563 -0.15645006 -0.02427504  0.24830758] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02925463 -0.35121707 -0.01930889  0.53323586] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03627897 -0.54606221 -0.00864418  0.81977263] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04720022 -0.7410648   0.00775128  1.10972423] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06202151 -0.93628774  0.02994576  1.4048287 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08074727 -0.74155013  0.05804234  1.12165597] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09557827 -0.9373832   0.08047546  1.43196578] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11432593 -0.74334134  0.10911477  1.16547924] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12919276 -0.93970107  0.13242436  1.4902833 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.14798678 -1.13616334  0.16223002  1.82121565] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.17071005 -0.94317313  0.19865433  1.5830181 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01536878 -0.01981953 -0.04114662 -0.01315336] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01497239  0.17586762 -0.04140969 -0.31852934] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01848975  0.37155413 -0.04778028 -0.62397844] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02592083  0.56730946 -0.06025985 -0.93131838] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03726702  0.7631906  -0.07888621 -1.24231245] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05253083  0.56916481 -0.10373246 -0.97534737] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06391413  0.37557566 -0.12323941 -0.71696725] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07142564  0.57216818 -0.13757875 -1.04576023] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.082869    0.37911387 -0.15849396 -0.79923571] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09045128  0.18647984 -0.17447867 -0.56030872] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09418088  0.38356585 -0.18568485 -0.90248611] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10185219  0.19137871 -0.20373457 -0.67343911] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.00239891 -0.01119521  0.04244273 -0.01029328] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00262281 -0.20689933  0.04223686  0.29547297] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0067608  -0.40259718  0.04814632  0.60117188] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01481274 -0.20818063  0.06016976  0.32403454] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01897636 -0.01396475  0.06665045  0.05091715] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01925565  0.18014134  0.06766879 -0.22001478] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01565283 -0.01587935  0.0632685   0.09322276] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01597041  0.1782814   0.06513295 -0.17884712] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01240478 -0.01770923  0.06155601  0.13365101] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01275897  0.17647944  0.06422903 -0.13899429] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00922938  0.37062547  0.06144914 -0.41074249] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00181687  0.17468864  0.05323429 -0.09933729] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.0016769  -0.02115424  0.05124755  0.20965438] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00125382 -0.21697008  0.05544063  0.5180527 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00308558 -0.41282699  0.06580169  0.82767788] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01134212 -0.21866349  0.08235525  0.55639466] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01571539 -0.41483922  0.09348314  0.873846  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02401218 -0.22110413  0.11096006  0.61195629] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02843426 -0.41758764  0.12319918  0.93742442] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03678601 -0.61413609  0.14194767  1.26614151] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04906874 -0.42108409  0.1672705   1.0210686 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05749042 -0.61799172  0.18769188  1.36125617] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.02953984  0.01998128  0.01834537 -0.02094797] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02914022  0.2148354   0.01792641 -0.30778674] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02484351  0.40969739  0.01177067 -0.59476268] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-1.66495626e-02  2.14412691e-01 -1.24581049e-04 -2.98395419e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01236131  0.40953642 -0.00609249 -0.59111763] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00417058  0.60474314 -0.01791484 -0.88571346] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00792428  0.40986892 -0.03562911 -0.59871572] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01612166  0.21526312 -0.04760343 -0.31746502] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02042692  0.41102963 -0.05395273 -0.62477225] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02864752  0.21670075 -0.06644817 -0.3495575 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03298153  0.41270172 -0.07343932 -0.66243151] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04123557  0.60876447 -0.08668795 -0.97730468] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05341085  0.41490518 -0.10623405 -0.71306203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06170896  0.61132506 -0.12049529 -1.03720495] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07393546  0.8078245  -0.14123938 -1.36515691] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09009195  0.61472534 -0.16854252 -1.11977886] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10238646  0.81160835 -0.1909381  -1.46023595] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01236501 -0.01831343  0.00727704  0.00243917] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01199874  0.1767034   0.00732582 -0.28793891] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01553281  0.37172012  0.00156705 -0.57830237] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02296721  0.56682008 -0.009999   -0.87049123] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03430361  0.7620766  -0.02740883 -1.16630103] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04954514  0.56732187 -0.05073485 -0.88233584] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06089158  0.76309482 -0.06838156 -1.19052706] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07615348  0.95903299 -0.09219211 -1.50383594] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09533414  0.76514285 -0.12226882 -1.24130191] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.110637    0.5717841  -0.14709486 -0.98928661] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.12207268  0.76853612 -0.16688059 -1.32431766] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1374434   0.96532568 -0.19336695 -1.6642367 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[0.02010465 0.01478959 0.00083081 0.00116612] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02040044 -0.18034426  0.00085413  0.29411106] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01679356 -0.37547838  0.00673635  0.58706324] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00928399 -0.18045142  0.01847761  0.29650991] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.00567496 0.01440232 0.02440781 0.00971131] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00596301  0.20916588  0.02460204 -0.27517183] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.01014632 0.01370171 0.0190986  0.02516796] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01042036 -0.18168885  0.01960196  0.323815  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00678658 -0.37708435  0.02607826  0.62261456] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-7.55106180e-04 -5.72560548e-01  3.85305526e-02  9.23395290e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01220632 -0.37797972  0.05699846  0.64306585] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01976591 -0.57384785  0.06985978  0.95313939] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03124287 -0.37973187  0.08892256  0.68319773] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03883751 -0.18594987  0.10258652  0.41978198] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0425565  -0.38236444  0.11098216  0.74296245] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05020379 -0.18893506  0.12584141  0.48716523] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05398249 -0.38558704  0.13558471  0.81671044] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06169423 -0.19255593  0.15191892  0.56956098] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06554535 -0.38944541  0.16331014  0.90598625] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07333426 -0.58635699  0.18142986  1.24522365] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0850614  -0.39396565  0.20633434  1.01442153] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.0064042   0.04294704 -0.03528399 -0.00044992] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00554526 -0.15165159 -0.03529299  0.280895  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00857829  0.04395556 -0.02967509 -0.02270699] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00769918  0.23949024 -0.03012923 -0.32460306] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00290937  0.43502794 -0.03662129 -0.62663316] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00579119  0.24043579 -0.04915395 -0.34570488] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0105999   0.43622123 -0.05606805 -0.65347378] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01932433  0.63207724 -0.06913753 -0.96327152] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03196587  0.8280566  -0.08840296 -1.27684849] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.048527    0.63416603 -0.11393993 -1.01310499] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06121032  0.44073315 -0.13420203 -0.75826293] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07002499  0.63742402 -0.14936728 -1.08998347] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08277347  0.44455254 -0.17116695 -0.84764842] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.09166452  0.25212681 -0.18811992 -0.61330551] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09670705  0.44931024 -0.20038603 -0.95884409] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.04669     0.01693141 -0.01961524 -0.01853056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04635137 -0.17790382 -0.01998585  0.26789956] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04990945  0.01749757 -0.01462786 -0.03101944] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0495595   0.2128262  -0.01524825 -0.32828149] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04530297  0.40816188 -0.02181388 -0.62573375] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03713973  0.60358144 -0.03432855 -0.92520612] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02506811  0.40893953 -0.05283268 -0.64350589] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01688931  0.60475648 -0.06570279 -0.95234706] STATE\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00479418  0.41057725 -0.08474974 -0.68100939] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00341736  0.60676766 -0.09836992 -0.99912539] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01555271  0.41308844 -0.11835243 -0.73888519] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02381448  0.60962854 -0.13313014 -1.06634744] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03600705  0.41649513 -0.15445708 -0.81823554] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04433696  0.2237865  -0.17082179 -0.57784387] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04881269  0.03141837 -0.18237867 -0.34346917] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04944105  0.2286028  -0.18924806 -0.68766295] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05401311  0.03654115 -0.20300131 -0.46002149] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.02321925  0.01616685 -0.00499792  0.04485279] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02289591  0.21136011 -0.00410086 -0.24940283] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01866871  0.40654038 -0.00908892 -0.54337643] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0105379   0.21154733 -0.01995645 -0.25357104] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00630695  0.40694847 -0.02502787 -0.55248118] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00183202  0.6024128  -0.03607749 -0.85294328] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01388027  0.79800748 -0.05313636 -1.15674887] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02984042  0.99378031 -0.07627133 -1.46560838] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04971603  0.79967075 -0.1055835  -1.19769256] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06570944  0.60606164 -0.12953735 -0.93988006] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07783068  0.41290125 -0.14833495 -0.69054454] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0860887   0.60973613 -0.16214584 -1.02600528] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.09828342  0.41710059 -0.18266595 -0.7883044 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10662544  0.22489439 -0.19843204 -0.55819736] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.00631752 -0.03427881 -0.01408304 -0.00753409] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0070031  -0.22919599 -0.01423372  0.28067241] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01158702 -0.42411204 -0.00862027  0.5688323 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02006926 -0.22887025  0.00275638  0.27344617] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02464667 -0.03378773  0.0082253  -0.01836612] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02532242  0.1612153   0.00785798 -0.30844256] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02209811 -0.03401773  0.00168913 -0.01329183] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02277847  0.16107995  0.00142329 -0.30544134] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01955687  0.35618159 -0.00468554 -0.59767506] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01243324  0.16112552 -0.01663904 -0.3064717 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00921073 -0.03375543 -0.02276847 -0.0190824 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00988584  0.16168553 -0.02315012 -0.3188612 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00665213 -0.0330992  -0.02952734 -0.03356793] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00731411 -0.22778554 -0.0301987   0.24965452] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01186982 -0.03224564 -0.02520561 -0.05239867] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01251473 -0.22699728 -0.02625359  0.23222632] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01705468 -0.03151022 -0.02160906 -0.06862088] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01768488  0.16391476 -0.02298148 -0.36804249] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01440659  0.3593556  -0.03034233 -0.66788229] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00721948  0.16466844 -0.04369997 -0.38490525] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00392611  0.36038267 -0.05139808 -0.69104002] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00328155  0.5561787  -0.06521888 -0.9994506 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01440512  0.36198627 -0.08520789 -0.727942  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02164485  0.16813916 -0.09976673 -0.46324706] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02500763 -0.02544181 -0.10903167 -0.20360229] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02449879  0.17105681 -0.11310372 -0.52859182] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02791993 -0.02230741 -0.12367555 -0.27358036] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02747378 -0.21546763 -0.12914716 -0.02232108] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02316443 -0.40852371 -0.12959358  0.2269876 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01499395 -0.60157842 -0.12505383  0.47614742] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00296239 -0.40493309 -0.11553088  0.14681277] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00513628 -0.59822734 -0.11259463  0.40093076] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01710082 -0.79158715 -0.10457601  0.65610099] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03293257 -0.59517677 -0.09145399  0.33240559] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0448361  -0.39888018 -0.08480588  0.01234118] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0528137  -0.20265084 -0.08455906 -0.30584805] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05686672 -0.00643208 -0.09067602 -0.6239553 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05699536 -0.20017887 -0.10315512 -0.36115128] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06099894 -0.0037535  -0.11037815 -0.68449696] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06107401 -0.19718391 -0.12406809 -0.42850235] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.06501769 -0.39035046 -0.13263813 -0.17736222] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0728247  -0.19360436 -0.13618538 -0.50877054] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.07669679 -0.38657131 -0.14636079 -0.26191497] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08442821 -0.57933385 -0.15159909 -0.01873919] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.09601489 -0.3823996  -0.15197387 -0.35515217] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10366288 -0.18548038 -0.15907692 -0.69163504] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.10737249  0.01144917 -0.17290962 -1.02987131] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.1071435  -0.18100333 -0.19350704 -0.79607973] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.11076357  0.01617261 -0.20942864 -1.14286015] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.01908029  0.01616992  0.01560016 -0.04110203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01940369  0.21106473  0.01477812 -0.32882242] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02362498  0.01573555  0.00820167 -0.03151604] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02393969 -0.17950305  0.00757135  0.26374328] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02034963  0.01551002  0.01284621 -0.02654199] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02065983  0.21044541  0.01231537 -0.31514427] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02486874  0.01515022  0.00601249 -0.01860306] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02517174 -0.18005744  0.00564043  0.27597079] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0215706  -0.37525941  0.01115984  0.57042736] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01406541 -0.18029573  0.02256839  0.28128098] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01045949  0.01449716  0.02819401 -0.00419939] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01074944  0.20920366  0.02811002 -0.28785513] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01493351  0.4039137   0.02235292 -0.57154156] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02301178  0.20848555  0.01092209 -0.27190135] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02718149  0.40344996  0.00548406 -0.56111949] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[ 0.03525049  0.59849452 -0.00573833 -0.85206963] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04722038  0.40345127 -0.02277972 -0.56119663] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05528941  0.20865629 -0.03400366 -0.27577663] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05946254  0.40424647 -0.03951919 -0.57898745] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.06754746  0.20969999 -0.05109894 -0.29901119] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07174146  0.01534224 -0.05707916 -0.02287166] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07204831  0.21123434 -0.05753659 -0.33300342] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.076273    0.0169765  -0.06419666 -0.05900528] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07661253 -0.17716905 -0.06537677  0.21275231] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07306914  0.01882373 -0.06112172 -0.0998159 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07344562  0.21476599 -0.06311804 -0.41113904] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07774094  0.02059297 -0.07134082 -0.13900387] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0781528  -0.17343853 -0.0741209   0.13034663] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07468403  0.02266262 -0.07151397 -0.18476904] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07513728 -0.1713671  -0.07520935  0.08452443] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.07170994 -0.3653349  -0.07351886  0.35256293] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06440324 -0.5593386  -0.0664676   0.62118731] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.05321647 -0.36335447 -0.05404385  0.30833318] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04594938 -0.55766639 -0.04787719  0.58349435] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03479605 -0.75208607 -0.0362073   0.86071918] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01975433 -0.94669672 -0.01899292  1.1418012 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 8.20395222e-04 -1.14156536e+00  3.84310490e-03  1.42846786e+00] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02201091 -0.94649108  0.03241246  1.13698848] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04094073 -1.14202165  0.05515223  1.43965788] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06378117 -0.94762082  0.08394539  1.16470655] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.08273358 -1.14372918  0.10723952  1.4824842 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.10560817 -0.95006628  0.1368892   1.22512666] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.12460949 -0.7569464   0.16139174  0.97827657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.13974842 -0.95382095  0.18095727  1.31699415] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.15882484 -1.15071064  0.20729715  1.66041799] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.04777142  0.01385507  0.02557344 -0.02377156] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04804852  0.20860112  0.02509801 -0.30827734] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.05222054  0.40335663  0.01893246 -0.59294056] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06028768  0.59820851  0.00707365 -0.87960019] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.07225185  0.79323364 -0.01051835 -1.17005094] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08811652  0.9884908  -0.03391937 -1.46601281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.10788634  0.79380021 -0.06323963 -1.18411529] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.12376234  0.59955311 -0.08692193 -0.91190713] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.1357534   0.40570794 -0.10516008 -0.64776   ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.14386756  0.60212563 -0.11811528 -0.97161906] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.15591007  0.79861763 -0.13754766 -1.29894868] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.17188243  0.6054835  -0.16352663 -1.05229527] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.1839921   0.41286284 -0.18457254 -0.81508167] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.19224935  0.2206829  -0.20087417 -0.58566483] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.02663462 -0.04032143  0.02722017  0.04145794] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02582819 -0.23582292  0.02804932  0.34260328] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.02111173 -0.43133247  0.03490139  0.64399766] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01248508 -0.626923    0.04778134  0.94746373] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-5.33790693e-05 -4.32475893e-01  6.67306175e-02  6.70168316e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0087029  -0.62845904  0.08013398  0.98309299] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02127208 -0.43449688  0.09979584  0.71661757] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02996202 -0.63084804  0.1141282   1.0389702 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04257898 -0.4374123   0.1349076   0.78418511] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05132722 -0.63410459  0.1505913   1.11608539] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06400931 -0.83084742  0.17291301  1.45196637] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.08062626 -1.02761997  0.20195234  1.7933072 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.00951501  0.01304053  0.04924601  0.03616871] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0092542   0.20742296  0.04996939 -0.24057897] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00510574  0.01162413  0.04515781  0.06743757] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00487326  0.20607054  0.04650656 -0.21066282] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00075185  0.01031555  0.0422933   0.09632018] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00054554 -0.18538625  0.04421971  0.4020409 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00425326 -0.38110659  0.05226053  0.70833079] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01187539 -0.18674601  0.06642714  0.43254554] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01561031  0.00737561  0.07507805  0.16152052] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0154628   0.20134693  0.07830846 -0.10656461] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01143586  0.00519521  0.07617717  0.20976047] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01133196 -0.19092863  0.08037238  0.52546751] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01515053 -0.38708414  0.09088173  0.84235499] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02289221 -0.5833212   0.10772883  1.16217828] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03455864 -0.77966845  0.1309724   1.48660281] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05015201 -0.58636309  0.16070445  1.23752405] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.06187927 -0.78314255  0.18545493  1.57593395] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[ 0.00266249 -0.0271852   0.04482095  0.04986131] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00211878 -0.22292023  0.04581818  0.35634194] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00233962 -0.02847863  0.05294502  0.07845146] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0029092  -0.22431802  0.05451405  0.38735748] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00739556 -0.03001057  0.0622612   0.11234847] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00799577  0.16416653  0.06450817 -0.16005986] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00471244 -0.03181679  0.06130697  0.15225593] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00534877 -0.22776057  0.06435209  0.46363238] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00990398 -0.42373009  0.07362474  0.77588468] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01837859 -0.22969384  0.08914243  0.5072449 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02297246 -0.03593357  0.09928733  0.24393169] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02369113  0.15764038  0.10416596 -0.01585669] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02053833 -0.03880925  0.10384883  0.30778995] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.02131451 -0.23524582  0.11000463  0.63133408] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02601943 -0.43171675  0.12263131  0.95653532] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03465376 -0.23843836  0.14176202  0.70475826] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03942253 -0.04553565  0.15585718  0.45984541] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04033324 -0.24247755  0.16505409  0.79731423] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04518279 -0.04995835  0.18100037  0.56076596] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04618196 -0.24709713  0.19221569  0.90456749] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(0) ACTION\n",
      "[-0.02712078  0.01455448  0.0280587   0.03178148] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02682969 -0.18095838  0.02869433  0.33318351] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03044886  0.01374366  0.035358    0.04968565] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03017399  0.20834124  0.03635171 -0.23163519] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02600716  0.01271922  0.03171901  0.07228903] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02575278  0.20737242  0.03316479 -0.21022007] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02160533  0.40200486  0.02896039 -0.49225953] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.01356523  0.20648665  0.0191152  -0.19059203] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0094355   0.01109652  0.01530335  0.10805914] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00921357 -0.18424135  0.01746454  0.40553064] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01289839  0.01062864  0.02557515  0.11840455] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01268582  0.205375    0.02794324 -0.16610118] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00857832  0.40008604  0.02462122 -0.44983948] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0005766   0.20462466  0.01562443 -0.14949836] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.00351589 0.0092825  0.01263446 0.14807251] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00370154  0.20422126  0.01559591 -0.14059787] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.00778597 0.00887946 0.01278395 0.15696422] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00796356 -0.18642317  0.01592324  0.45365261] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.00423509 0.00847004 0.02499629 0.16603109] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00440449  0.20322542  0.02831691 -0.11866259] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[0.008469   0.00770944 0.02594366 0.18281787] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00862319  0.20245075  0.02960002 -0.10156921] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.01267221 0.00691737 0.02756863 0.20030352] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01281055 -0.18858781  0.0315747   0.501554  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.0090388  -0.38414028  0.04160578  0.80401792] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00135599 -0.18961272  0.05768614  0.52470767] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00243626 -0.38549701  0.0681803   0.83499568] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0101462  -0.58148084  0.08488021  1.14831789] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.02177582 -0.77760206  0.10784657  1.46636544] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03732786 -0.97386654  0.13717388  1.79069723] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05680519 -0.78052402  0.17298782  1.54361049] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.07241567 -0.58785133  0.20386003  1.30952009] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[0.04849035 0.01914883 0.04461432 0.03371657] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04887333  0.21360354  0.04528866 -0.244563  ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.0531454  0.01786497 0.0403974  0.06205412] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0535027  -0.1778122   0.04163848  0.36720402] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04994645 -0.37350033  0.04898256  0.67271997] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.04247645 -0.56926769  0.06243696  0.9804139 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.03109109 -0.37503567  0.08204524  0.70797812] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.02359038 -0.57119252  0.0962048   1.02531789] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.01216653 -0.37747405  0.11671116  0.76432446] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00461705 -0.57399331  0.13199765  1.09133318] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00686282 -0.77058427  0.15382431  1.42234912] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0222745  -0.57766273  0.18227129  1.18142994] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03382776 -0.38531291  0.20589989  0.95097628] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[-0.00596328  0.02467762 -0.02385445 -0.0148512 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00546972  0.22013341 -0.02415147 -0.31496401] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.00106706  0.41559092 -0.03045075 -0.6151646 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00724476  0.22090737 -0.04275404 -0.3322258 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01166291  0.41661096 -0.04939856 -0.6380788 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.01999513  0.61238567 -0.06216013 -0.9458999 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.03224284  0.80828725 -0.08107813 -1.25744807] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.04840859  1.00434786 -0.10622709 -1.57438374] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.06849554  0.81064087 -0.13771477 -1.31663314] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.08470836  1.00720974 -0.16404743 -1.64905515] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.10485256  0.8143416  -0.19702853 -1.41165092] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n",
      "tensor(1) ACTION\n",
      "[ 0.00328442  0.00556895  0.04122954 -0.02536305] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.0033958   0.20007612  0.04072228 -0.30475782] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.00739732 0.00439822 0.03462712 0.00048474] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[ 0.00748529 -0.19120278  0.03463682  0.3038887 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[0.00366123 0.00340887 0.04071459 0.02232751] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[ 0.00372941 -0.19227261  0.04116114  0.32757325] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-1.16041835e-04 -3.87955649e-01  4.77126059e-02  6.32947159e-01] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.00787515 -0.58370961  0.06037155  0.94026612] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.01954935 -0.38945107  0.07917687  0.66714763] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.02733837 -0.19551432  0.09251982  0.40040755] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03124865 -0.0018182   0.10052797  0.13826804] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03128502 -0.19822556  0.10329334  0.46089715] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.03524953 -0.00470365  0.11251128  0.20247576] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0353436  -0.20123983  0.11656079  0.52842464] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.0393684  -0.00793405  0.12712929  0.27462482] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.03952708 -0.20461897  0.13262178  0.60454672] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04361946 -0.01157662  0.14471272  0.35640257] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04385099 -0.2084277   0.15184077  0.69098997] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04801955 -0.01570223  0.16566057  0.44969777] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04833359 -0.21273219  0.17465452  0.78967834] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05258823 -0.02038367  0.19044809  0.55663273] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05299591  0.17162632  0.20158074  0.32947909] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04956338 -0.02570888  0.20817033  0.6783524 ] STATE\n",
      "<class 'float'> REWARD\n",
      "True\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0) ACTION\n",
      "[-0.04599506 -0.04502388  0.04239488  0.00861977] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04689554 -0.24072738  0.04256727  0.31437162] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.05171009 -0.04623684  0.0488547   0.03541107] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05263482  0.14815172  0.04956293 -0.24146629] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04967179 -0.04764189  0.0447336   0.06642901] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.05062463  0.14681111  0.04606218 -0.2118114 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.0476884  -0.04893812  0.04182595  0.0950385 ] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04866717  0.14556017  0.04372672 -0.18416054] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04575596 -0.05015928  0.04004351  0.12198957] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04675915  0.14436677  0.0424833  -0.15779595] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04387181 -0.05133686  0.03932738  0.14798096] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04489855  0.1432005   0.042287   -0.13204018] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04203454 -0.05250088  0.0396462   0.17367814] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(0) ACTION\n",
      "[-0.04308456  0.14203189  0.04311976 -0.10623879] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n",
      "tensor(1) ACTION\n",
      "[-0.04024392 -0.05368061  0.04099499  0.19973037] STATE\n",
      "<class 'float'> REWARD\n",
      "False\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-102-c37724f7a63a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     50\u001b[0m             w.add_histogram(\"gradients/critic\",\n\u001b[1;32m     51\u001b[0m                              torch.cat([p.data.view(-1) for p in critic.parameters()]), global_step=s)\n\u001b[0;32m---> 52\u001b[0;31m             \u001b[0madam_critic\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     53\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     54\u001b[0m         \u001b[0mprev_prob_act\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprob_act\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/irl/lib/python3.7/site-packages/torch/optim/adam.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m     93\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     94\u001b[0m                 \u001b[0;31m# Decay the first and second moment running average coefficient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 95\u001b[0;31m                 \u001b[0mexp_avg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbeta1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mbeta1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     96\u001b[0m                 \u001b[0mexp_avg_sq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbeta2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maddcmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mbeta2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     97\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mamsgrad\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "episode_rewards = []\n",
    "gamma = 0.98\n",
    "eps = 0.2\n",
    "w = tensorboard.SummaryWriter()\n",
    "s = 0\n",
    "max_grad_norm = 0.5\n",
    "\n",
    "for i in range(800):\n",
    "    prev_prob_act = None\n",
    "    done = False\n",
    "    total_reward = 0\n",
    "    state = env.reset()\n",
    "\n",
    "\n",
    "    while not done:\n",
    "        s += 1\n",
    "        probs = actor(t(state))\n",
    "        dist = torch.distributions.Categorical(probs=probs)\n",
    "        action = dist.sample()\n",
    "        prob_act = dist.log_prob(action)\n",
    "        print(action, \"ACTION\")\n",
    "        print(state, \"STATE\")\n",
    "        next_state, reward, done, info = env.step(action.detach().data.numpy())\n",
    "        print(type(reward), \"REWARD\")\n",
    "        print(done)\n",
    "        advantage = reward + (1-done)*gamma*critic(t(next_state)) - critic(t(state))\n",
    "        \n",
    "        w.add_scalar(\"loss/advantage\", advantage, global_step=s)\n",
    "        w.add_scalar(\"actions/action_0_prob\", dist.probs[0], global_step=s)\n",
    "        w.add_scalar(\"actions/action_1_prob\", dist.probs[1], global_step=s)\n",
    "        \n",
    "        total_reward += reward\n",
    "        state = next_state\n",
    "        \n",
    "        if prev_prob_act:\n",
    "            actor_loss = policy_loss(prev_prob_act.detach(), prob_act, advantage.detach(), eps)\n",
    "            w.add_scalar(\"loss/actor_loss\", actor_loss, global_step=s)\n",
    "            adam_actor.zero_grad()\n",
    "            actor_loss.backward()\n",
    "            # clip_grad_norm_(adam_actor, max_grad_norm)\n",
    "            w.add_histogram(\"gradients/actor\",\n",
    "                             torch.cat([p.grad.view(-1) for p in actor.parameters()]), global_step=s)\n",
    "            adam_actor.step()\n",
    "\n",
    "            critic_loss = advantage.pow(2).mean()\n",
    "            w.add_scalar(\"loss/critic_loss\", critic_loss, global_step=s)\n",
    "            adam_critic.zero_grad()\n",
    "            critic_loss.backward()\n",
    "            # clip_grad_norm_(adam_critic, max_grad_norm)\n",
    "            w.add_histogram(\"gradients/critic\",\n",
    "                             torch.cat([p.data.view(-1) for p in critic.parameters()]), global_step=s)\n",
    "            adam_critic.step()\n",
    "        \n",
    "        prev_prob_act = prob_act\n",
    "    \n",
    "    w.add_scalar(\"reward/episode_reward\", total_reward, global_step=i)\n",
    "    episode_rewards.append(total_reward)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<torch._C.Generator at 0x7fb26da1f310>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## PPO for gridworld environment\n",
    "\n",
    "#svf_exp += self.env.gamma**step * self.env.get_svf_features(state=self.state)\n",
    "#rewards += self.env.gamma**step * self.env.get_rewards(state=self.state)\n",
    "#self.state = self.env.take_action(self.state, a)\n",
    "\n",
    "gridworld = GridWorldEnvironment(0, 10, prop=0)\n",
    "\n",
    "# config\n",
    "state_dim = gridworld.n_states\n",
    "n_actions = gridworld.n_actions\n",
    "actor = Actor(state_dim, n_actions, activation=Mish)\n",
    "critic = Critic(state_dim, activation=Mish)\n",
    "adam_actor = torch.optim.Adam(actor.parameters(), lr=3e-4)\n",
    "adam_critic = torch.optim.Adam(critic.parameters(), lr=1e-3)\n",
    "\n",
    "torch.manual_seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/lucaviano/anaconda3/envs/irl/lib/python3.7/site-packages/torch/nn/modules/container.py:100: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
      "  input = module(input)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n",
      "True DONE\n"
     ]
    }
   ],
   "source": [
    "episode_rewards = []\n",
    "gamma = 0.98\n",
    "eps = 0.2\n",
    "w = tensorboard.SummaryWriter()\n",
    "s = 0\n",
    "max_grad_norm = 0.5\n",
    "\n",
    "for i in range(600):\n",
    "    prev_prob_act = None\n",
    "    done = False\n",
    "    total_reward = 0\n",
    "    #state = env.reset()\n",
    "    \n",
    "    state_id = gridworld.get_random_initial_state()\n",
    "    state = gridworld.get_svf_features(state=state_id)\n",
    "\n",
    "    while not done:\n",
    "        s += 1\n",
    "        probs = actor(t(state))\n",
    "        dist = torch.distributions.Categorical(probs=probs)\n",
    "        action = dist.sample()\n",
    "        prob_act = dist.log_prob(action)\n",
    "        #print(action.detach().numpy(), \"ACTION\")\n",
    "        #print(state_id, \"STATE\")\n",
    "        #next_state, reward, done, info = env.step(action.detach().data.numpy())\n",
    "        next_state_id = gridworld.take_action(state_id, int(action.detach().numpy()))\n",
    "        next_state = gridworld.get_svf_features(state=next_state_id)\n",
    "        reward = gridworld.get_rewards(state=state_id)\n",
    "        done = (state_id == [0, 0]).all()\n",
    "        if done:\n",
    "            print(done, \"DONE\")\n",
    "        #print(type(float(reward)), \"REWARD\")\n",
    "        #print(torch.FloatTensor(np.array(reward)) )\n",
    "        #print(critic(t(state)), \"CRITIC STATE\")\n",
    "        #print(critic(t(next_state)), \"CRITIC NEXT STATE\")\n",
    "        #print((1-done)*gamma*critic(t(next_state)) - critic(t(state)))\n",
    "        advantage = float(reward) + (1-done)*gamma*critic(t(next_state)) - critic(t(state))\n",
    "        #print(advantage, \"ADVANTAGE\")\n",
    "        w.add_scalar(\"loss/advantage\", advantage, global_step=s)\n",
    "        w.add_scalar(\"actions/action_0_prob\", dist.probs[0], global_step=s)\n",
    "        w.add_scalar(\"actions/action_1_prob\", dist.probs[1], global_step=s)\n",
    "        w.add_scalar(\"actions/action_2_prob\", dist.probs[2], global_step=s)\n",
    "        w.add_scalar(\"actions/action_3_prob\", dist.probs[3], global_step=s)\n",
    "        total_reward += reward\n",
    "        state = next_state\n",
    "        state_id = next_state_id\n",
    "        \n",
    "        if prev_prob_act:\n",
    "            actor_loss = policy_loss(prev_prob_act.detach(), prob_act, advantage.detach(), eps)\n",
    "            w.add_scalar(\"loss/actor_loss\", actor_loss, global_step=s)\n",
    "            adam_actor.zero_grad()\n",
    "            actor_loss.backward()\n",
    "            # clip_grad_norm_(adam_actor, max_grad_norm)\n",
    "            w.add_histogram(\"gradients/actor\",\n",
    "                             torch.cat([p.grad.view(-1) for p in actor.parameters()]), global_step=s)\n",
    "            adam_actor.step()\n",
    "\n",
    "            critic_loss = advantage.pow(2).mean()\n",
    "            w.add_scalar(\"loss/critic_loss\", critic_loss, global_step=s)\n",
    "            adam_critic.zero_grad()\n",
    "            critic_loss.backward()\n",
    "            # clip_grad_norm_(adam_critic, max_grad_norm)\n",
    "            w.add_histogram(\"gradients/critic\",\n",
    "                             torch.cat([p.data.view(-1) for p in critic.parameters()]), global_step=s)\n",
    "            adam_critic.step()\n",
    "        \n",
    "        prev_prob_act = prob_act\n",
    "    \n",
    "    w.add_scalar(\"reward/episode_reward\", total_reward, global_step=i)\n",
    "    episode_rewards.append(total_reward)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gridworld.take_action(9, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "677"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torch.utils.tensorboard.writer.SummaryWriter at 0x7fa93c9a8290>"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "episode_rewards"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 4])"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gridworld.take_action([4,4], 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x7fb26817f950>]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO2deZwcRfn/P88ce+W+D3JDDhJCDpYkIEdCOJIgRlQ0yFdBUVDBL/48kHz5Cn5RBG9A8EANKooR8YpcgUBEQEIuSMhByEJC7oPcyWaPmanfH93VXV1d3dMzO5Od3Xner9e+dqb6qu6pfp56jqoiIQQYhmEYBgBirV0BhmEYpnRgpcAwDMM4sFJgGIZhHFgpMAzDMA6sFBiGYRiHRGtXoKX07NlTDBkypLWrwTAM06ZYsWLFe0KIXnp5m1cKQ4YMwfLly1u7GgzDMG0KInrXVM7uI4ZhGMaBlQLDMAzjwEqBYRiGcWClwDAMwziwUmAYhmEcSk4pENEMItpARHVEdEtr14dhGKacKCmlQERxAA8AmAlgNIAriWh069aKYRimfCgppQBgEoA6IcQ7QogmAPMBzG7lOhn556odOFjflNMxQggca0z5ylZtPWjc/81dh7Fs836s3nYQ6Yx/ivP6phQyhnJJQ3Maa7YfyqmOkqNKPY82pnAiplh/fetB33WOas8rjMUb9mDdjsMFrVPdniN44a29gdvTGYH6Juv5HG5oxnPrd2PLvvqC1qHYvLHtEJZt3g/AalP5IoTAkYZmNDSnkUpncj4+ym99qL7Z0+brm1JIZwT+tWEPtu43P/eG5jQeXb7VaVs7Dx3HrkMNWa+1ZvshNKbSnrKFa3dh75HG0OPSmeB3OhcOHGvChl1HcKwx/D0vNKWmFE4CsFX5vs0u80BE1xHRciJavndv8AtbLLbur8cX//gavvSn133b3t13DPNe2mQ87g+vbsGY2xd6hMajy7di9gMvY9G63b79Z9zzIq74+Sv4wP0v43sL3/Rsq29KYfRtC/FdrRwAXnl7H1ZuOYBvP7EO7//JS6jbc9Sz/UfPbMD8pVuw90gjDhxzFdvvXtmMHz/7FtbvPIzTbl+If7y+HdsO1OO02xdi6NwnsXH3EWdf/WVRed/dz+Mrj67C1x9bjflLt+CGP6xEUyqDTEYENu6Fa3fhgw+8jMdWbMPRxhQ+/ZtleHrNTpx2+0L84VXjGBsA1gs4/o5n8OiyrfjUQ8sw674X0dDs1m334QZ8+/F1nrrnwlW/ehVXz1uKww3N+Om/6vDQy5sw5JYn8JcV2wAAd/xzLUbfthA3PLISp3/zGVz72+U47/uLcf3Dy9GUsgTj/mNNuGfRW0ilM3h960EMueUJbN1f73mGL27ci7G3L8RHf/EK0hmBexa95fw22w7U49l1uzHqG09h8YY9vjoKITDr3hfx64B2J5+D3onZ/N4x7D3SiMvufwlX/PwVPLpsK0bfthAvbnTfqS376vHK2/vQlMrg879fgdv+scYoFLfsq8fZdz+Psd98BqO+8TROufUprNxyQHuWS3DnE+vw9JqdmPKd57ByywHnXHV7juA0+/6/9/SbuGfRW77rvHe0EePueAY/eGaDc9+jb1uIuX9djWseWoZzv7cYV89bin9rSvyBxXW4+bHVeGrNLgDAWXc9jyl3Pedsz2QENuw6gtcVQf7W7iN4/09ewn3PbXTKGlNpXP/wClz1qyWBzxkAbv3bG5j9wMvYdsB6zz/6i1fw9cdWhx7zg4UbMOLWp/Ca/cyeemMnJnzrWVxyz78x5vaFuGfRW0hnhKPYnlu/GzsOHg89Z760yRHNQogHATwIALW1tQVXoX9duQ3Hm9O4avJg43b5Mpt6JnMeXIKdhxrw0TMHomOl9/E+bTfKzfuOYVCPGgDA23uP2f+P4kL0CazTs2t3Y+7MU53vUuA8smSLpxwArvylt9H++qVNuGn6cPTtUgUAuO/5OnvLG1Z97r4UAHDbP9YCADJ2w3tm7W4M6FbtnOe6h1fgqsmD8O0n1lvnuXICPjCuv6+u2w8ex19WWkLzT8stHf/F6adgxj0vYtKQ7vj9ZyZj8YY9uPDUPojHCAAcxfX23mPYsOsInn9zD55/0xKAC17fgasmD8aLG/di5bsHcdOFw/H7Je9i+qm9ceBYMw7WN+Oup9Y71997pBEDu1vP96k3duJXL23Cr17ahMc+dxYONzTja39ejcvG9cc3PzAGX5r/GgDgnjkTAp89AMz5xRKs2+laITf/ZTU+fMYA/HXldgDAk2/s8uy/cO1urNlxCBMHdcOPnt2A3y/Zgi7VSfzfP9cBAL71+Do8Y3cEFn91Kj7x66UAgKWb9uO59btxz6KN2H7gOL5/xTic893Fznn/tHQrqhJxPP/mbtx6qeVZ3XbgONbtPIx1j6/DvYvewtxZp+LKSYM89Zn8HUsIyt963Y7DmHXfixg/sKuzzzf/af3+z67bjXOH94IQAud937r2bz51piNU+3Suwg3TTvGc/wMPvISD9c2esg/99D/YfPel+MlzG9GYyuDlun14uW4ffvniJmc7ADx0zZlotNvz0k37sXSTZbXcs2gjJg3tjkevPwt7jjTgtr9b9fvpv95G9w4VuOKMgQCAR5dvc675wlt78cJbe/HEf5+DMf27AIBTr92HzdbB1Q8txYsb3/M8n9e3WApi83vuO55KW+/FpveOGc8jmb/MavMNzd57OmNIN3y01qrz23uPIp0R2H7gOM4b0QvPrNuFpnQG9z63Ec3pjM+SeWTpFtz3fB2+MPVkXHvOUFz7W2sWhze/NQNVyXhofXKl1JTCdgADle8D7LKCI4TAgfpmdO9Q4dv25UdXAUCgUtB5bMU2LFy7C7/8ZC2ONFgmcCaiuyVhC8XmLOb2tgPmXsFR29w/VN+MDpVxEJFvnz8u3YJHl2/F29+ZFalO++we6rGmFDpWJp3yTe8dcxQCYFaKQW6mmF2vpZv34+6n3sS8ly3B8L2PnI4rzhjgvHCHjjf7erTH7Z6/FJzjB3XF//59DX77n4745NlDAAADu9fgQP0hp/5SKexUXq6P/PwV5/Nv/rMZr2896PQOvzZjFNJpgUE9avDp3yzDuh2H8a+vTUXfLtXYfbjRoxAAOO68TlUJHAlwe6zaehATB3VDTYX1mkmFAMBRCADwp2VbPcdJIWYyqkb37+wo/c+dfzI276vHzkNu2zjckMLSTfs9SuHQcVdYH29KY9fhBsy670UA8PSO65us5/y7V97FgfpmzJ05ytl2zUPLUJmIIUbkE1hCCJ9CkNz82CqP0Dbxqd8sw3kjfFPwALAE6rceX4d/rtqBPYrl8O0n1qN2SPfAc15630tY/NWpGNqzA2oq4p77k0z7wb+w+KtTHYWgIn/vbh2s9t+YSmPM7QsBACn7h/nBwg244NTe6FqdxPxlW1E7uBseWbrFOUdzOuN03gDg5sdWY3D3Gkwe1gPTf/iCWz5jJORr868NrpVz5pBuWLbZshyO23Wf9/ImDO/TEQDQqTKBY42pdq8UlgEYTkRDYSmDOQA+XowL/XnFNtz82GpPjyIqslHId/arf17lbJMi2SQbhX1ETBHciXjMc84gmjSlIXcXwhJQ4+54Bh+tHYBvvN8cl09nBNIZgTd3+X3ua3cc8jyDpK2o9JdI57hhe9B9qPuq7ombH1uNIw0ppDLW/f1x6Rbs0Xp0ej2unmcph+0Hj2OXLRAr4q4n9H///gb+eeM5ICLsCPEdqwLxfXc/DwD48MQBjoXywQdexpu7wt1OnaqSQMA1NtrWT9/OVaHn+PkLb3u+S996KpPxKd4n39jpfL7rqTfx2Aq/wFW7BfuPNWHit551vr9c9x7uVVwiQfxz1Q58YerJnrJOVUl075DEniPe+30npOecTSFItuwLPofqFhs/sCtiBKzcchAr3z0QeAwAbNx9BBWJGH7x73cAWLEFVUgH9fjf2XsUz9pK+8CxZt99CGGd+/7Fdbh/cZ1T/qB2nuZ0xmedXP/7FXj9tos9ZW/vOWZ8b8b07+IoBWlJZTLAln1Wm3/ttosc+VFISiqmIIRIAbgRwEIA6wE8KoRYW4xrvVxn9Q7eysPX7DQsAWMAWG7TseUepE5YsGoHlm7aB8A1TaOiWiJHbevkLyu3OwLNxE8X1+HS+17yletlafvc9U2pUIvneLNfKagvnYqqjHSr6JFX3/XEAZ7T7qFuz1Fc+KMXoFPflMayTdZLs1sRVGu2H8a+Y03IZAR25uh3lW4vq87hbeN4UxqdqoL7VfW2cI/5jbdQZI/4H6/vwLnfW+zZptZphSIUE+pFlI+L1ntjVZ/53XK8ETH5QFq9koo4oU/nKuw+7PX1v7Yl96DqGYO7eb6bLFwTP/zoOMydZblLV2wJVwo/eGaDo+wB634OGJJDhvXsAACoSMSwcO0uXPDDF7DdbjdPvLETN81/Dd/4+xrPMXobVbn6LMvD0JzOeCxVABjaswN+v8QbI2tKZ3D4uN/SktYu4Ha20kIgbQuSYigEoMSUAgAIIZ4UQowQQpwshLizWNeJ240wjyQJj+BTP6uuE5MwlZYCwXL3/PcfX8OSdyz/qS4owwK5+vkPN1gNKkbATfP9wW+J7gIJ4niTVZf6xnSw0kNuSuHrf3nD+dzQnMHpA1zLxHpZze4HiR4slyy1s2Z2H/IKqjPvXIRrfrMs0O2WC11rksbyA/VNoUpBPp9sVqDOcvuesqH2dHt1qnQ+Ewjv7juGuX9dje0tuP9dWi+3IhFD705VPktOtxyy8T+zRqFnR7/bFgB+cMW4wOOG9eyAk3t1xEldrThXtgyft3Z728ySd/Y5sRXJofpmx/3XlMrg+odX+M7zj9d3+MrWh7xLU4b1sM8nfM8mToT/1RRMY3MaBw1KoZuh3aUzAmkhvJ2AAlNq7qMTRsx+qFF8/+t3Hsb+Y0143yk9AbiCT2jHpzPC6aWlTUpBFpErzCTNmqVwyNBIjOeC26Oz3FLB91ORiNYHkL3l+qa0E18w0WBwH+07Fp6uBwANqTS61rhCIUbkiyMkYpSTMNXda0LAl4WSL+cN7wUiv3DYf6zJch8FcNwONOp1k8w5cyAG9ajB957e4ClfnsUt0rUm6fPhV1e4fuWMEPjKo6uw/N0DGNO/c+i5wvj7a95wXjxG6N+1CruPNKKhOe34sptT0X6nSUO6Y/vB47hsXH+8udNrhdU3pVARj+FDE07CwfomPL56p8e9B7jtt4/tjstV4Zssv3F3POPEHHLBpCgA4Kbpw9G7s6Wgm9MZ33vcbGjTB+ubjZ2voHo9sPjtyO9yPpScpXCikIo2Sv7vzHtfxFW/etX53mi/5EIIj1JQdILxvMLZBscElKS07yZzUkW97pGG8H0lFSHmpqm+uw43OP57wK9UdEtBCIELf/TvrPVoaE57ekFE8AQRAaB/12r9sEiYEgfUXnQ+pIXAPR8b73yXvdwD9U3oGGApEAHHm1L44AMv+4S+pF+XanzuvJONdQ7jpK7V0L0tHSrceqQzwlGo+nPV6dkx+NlIV+Q1djCfiDCmfxekMwJrlfEgTek0EjHCsF4dQq81sm8nvHzLBejXpRo1lV6Bd6C+GZ2rE4jFCJ85dxjONwSeK20lFI9RaFu+/+MTMLx3x9C6qNQ3pY1tZNzArvj6jFGGI4KpTMaQtOvWnM7g8HGvC840fmNLwPiKsAByEQ2F8lUKMhXS1KPPRmOzaim45RkhHN+o8bx2ka4AAL+loPtzAUs5uddyy2VgMtudhPUuGgPcPpJffrIW/eyU1tNO6oxxA7r4lMIxg+XwoYm+YSZoaM6gm2IpEODLuQ5y2ZgY3MP1vQ7t6RdMuSqY6aN6e75nMsLj85bpmPuPNTlBeZ3enSpxvDnt6+2qSCH4YcMzCqNDRQLVmsBQLYV0RjhtIdtAqz9dPwXnnNIz1A3WQ1Fa4wZabr83trn31ZwWSMZjeP4rU0MVQyLuPquqhLf+TamMx+r60oXD8dcvnO3Zp1Jpv2FteWjPDo41EZXeBqVQGY+hY2WwYL7ijAG+soq4qxSONqYcq/v684Zh2sheHmUq0d10kpqKBH521cRI9S8kZasUZAZQLu5e6TZqciwFbxwhnRFOD043B59Zu8vxQ6bSwpedpPcgTJk/qh9T7dk7CiTLvSRDelfZRrJWJWPOi5aIxVCVjHsyilZuOYC3DX7/W2ed6isDvEJ/x6EGvHe0CaP7ua4OXWiEoQrISoOw6JCje6BLtVch6b+lfA43zX8dKwOCrN07VBqzs1SkENQFfDaqK+K+OnXQlIIx/c3m8gmuEjq5V0f8/jOTMbJPp8D9O9vPQwiBvp2rECNg79FGvLHtEL7z5Ho0pTJI2gI/rAOrtj/ZKZsxpq+nLhIiwsRB3mC0ah0k48FXihH5xghlQ//NAUuJ1VSYz9OncyX6GToblQlXKfzk+TonFjZ31qmB55L00CzGmoo4Zo7th+99+PRI91AoWCnkoBUOHrf83u/ZvS8B4VEqaSECU1Kve3iFE9BKKT05SSoj0NCcdrJwgob8CyHQlMp4rAYZaPaf1UvYi2QKGn/ufDclMUbkpFYmYoSairhT13RG4EM//Q9mP/Cy7xxBJrDppb1y0kBHYFUmozdNVdiYrhfP0da2hIEmZBVUV0NQJk/PjhURlIL1DGT9TQMBTdRUxH2xMFXg6O1LHaAGmBVnkBsM8PbKiQidq5P449KtuOz+l/Dgv9/BvmNNzj7yvfr0+4b6zqO2P2l5qYMjTValinrPJkvh6rMG4+OTB2FEn07oYGhf866pxaa7ZuHjkwf5tgW9bx0CLIUOFQmjIqlMxB3lte+o10pLhLx/ANBNUwqyLffq7LdiijnrTNkqBcd9lItSqG/GnsMNuONxdxCS2lBf2LDXyaIJO28qk/H9qM3pDM68cxHOvHMRgOCee2Mqg+0Hj3sa8b6jlrLK1lDCMq1MAuysk3s4n2NE6FxtvWiJOKG6Iu4okrD4R5BSMJWP7t8FVbYyqMzBUlAFRJVBmcwa2y/yuQAre0d94fWAd0U8FjgeBAAe+tSZGNqzg1HRqsjfWCY9qAIyjOqKuK9ONVqgWW0L8l6kbjQJpyAffbeapG9bMh7DfiUBYffhBkexSaVwRe0A/OXzXvdPIqZaCvK/W5dsvXv1nk1KYXifTvjO5WMRj5HP7XPlpIG4YFQfEJEnc2dUX8tCMrmChEBg776mMm50uVUkYkgmrPOrLlLAe/8m9Gwj+Zv26ZSbK6yllK1ScN1H0ZXCgWNN2HrADQoJ4T3+i398zfmsxhT09FLTmIRUWuBIQ8pxBR1tNAuUww3NPhNdBhOzZetses+c1gmYLQW1Z2cF96xG6riP7GPCMqWCeum6Upg1ti/OGNzNKTcJ9yBUoWVyO805cyDW3XGJ0W9s4oZpp3iUgn4LyXjM467pVJnA3xT/97SRvVGdjIdO8Hb2yT0wbaQVu5Aux6j9k5qKuK8DoPaMUxlvAoT8HaVQMrkRZZn+ez33lalIagJYTzvedajBEdIy5tCxMuH7DdX2JN8/NVYT5t4EvFa9SYmpddcthbs+dLpxv5N7dcTmuy8NHB1tsjgAS1l0DlIKAfeh3v+HJ/qVkK5EpFuxt8lSMF6hMJRvSqrzImZ/vFXJGBqaMzhQ3+T7wYMON/r8bVIZgUTce6CetqjPpqqeS89Rjjox1uINwSmaJktBdTPECE4PKBEnVCfjzniGbOmzJnSBIVNUXaWQn6VgcjsRWb7hjpWJrNk4V04ahEE9ahw/OgDceflYzz7JOHmERWUyjtMHeF001RVxX/KA5BefOAOXKL50stV81JloTb1Xj6WQ8VoKUvDGYgDSZuErrYdknDxWbsfKhE8Am5SCnMvrjtmnYc6kQRjYvcY3tkQdbOXUSWnK2dIsvZZCuJswSJgD3oF+8r6DrJRg91EcnQ3pyGpMQY70/tKFwz3XAszppnoWmkwe6F6TW3ZaSylbS8F1H2XfV/7Ix5vTHqGiWwoqqqWgu1dSab/7SB+JXB+iFHTUuW/yZcEqf961tAwAy8VRaT+HRIxQlXRjCnkpBe2lllk8sjyXwTkepRDidsrWE1WvKy2Fyyec5MteSsZjmrAQvh626XcCLCGoKgRZBlht6SdXTgh1TQHmwLTXUsh42qV0c0hFa3q20orQXRwViRgq7M6APKPegWlKZ5xnW5WMOwFiPXahPn+ZySb/A+Ep04DXJWtSIHHF6ghzRcWVe5T3a3IFCQhPqq9KTWXCOEalIhHz3ccXLxjuuRZgVgpdNeEvn19M+b2cWA3HFApPLoPX5C5CeGdaTGdEoMmfzmIphF21oTkd6D460tDs9CwlOw7mNqJURTbgP7y6xb9NefHiRJ6XOhEnJ7U2m1K4QEvxBPyWQMIRKtb/iLMe+OoZFqDOFuhTryuFikmAJhMxY2/945MHYd41tQCCU0FN7UW6UoQALhvXH9ee4w/SqtRUxHHb+0ejY2XCERyqoshkvIL7tvePxk3Th+OiU61ZeI2Wgn2fJnef2jkAzPGyCsOz9SsFd5+P1Q7E/R+f4Jl0UlqiQaij/itDrB0gXCmov2kyi6UQZLHGicwxhXjMl9Ahn6laXm1QCnqWnGnqj6E9a3xlhaZslUI8h+wj+RJkhHcwUGMqHWjym0YcS7ItQHL4eHNgoPlIQ8onMI82ppyAWa6YGqdE9wFL4ZsR1vOTzyWbUrj1UjctVZ6zMhnDf265wBkYJV9o9SW89PTwALFUINliCpKwLCR9EJc8t2l+mWTMm/Iof+vvXD4WF4yyBO/NM0aG1l1FznGjjrcIY0TfTvj0OUOx5v8ucdqCark0ptJO8gFg9UD/30UjPMLplbkXYMnc6c4+8vmrz+gqO0tHF3ImhW1SNHpvXu0px2KE95/e39MLzmbJZbUUlHOFjXNR95Ofg+YR6tWpEvdd6Z9WfeuBeqNSqEzGA9uZaqFEeV4mqu3OSLZMw5ZQtkpBNkZ9kJlJSch9MtrKaamMf7yBc4xynsPaiOOUslgG4O9RHTreHBikDBq9rKcdRiVsiL/acGMxKErBmnulOS3w7cfX4ZW394VeI6m8DONs33tjcwb9u1Y7flS5jxROQgAPfHwibph2MoKQvWNVKeiWgvps5auq5ukDVi/xs+d6e+dSOZnSeJPxmOe5mZrAgG41zhw92bhkTB888tnJgVO160K4VptMDnCFBQCs2nYoVFEn4jH061LtrK8BuL/12Sf3QL8uVVhw4/ucWIourP76+bN9Qtc09bvuystmqZncR7dfNtp5jrm4j3RXjGc/NaYQ0lGQr+gHxvXHdz/sjSvtOdxodh/FY4GT+5kC7Z5j7Xvq2bEC/3upeXxPMae3kJSvUgiY5sKUwSP3EUJ4ArJHGlLGlbAAV5Es37wfX/jDytBr6EGxQ8ebA6etDvJVqzMq5kKYpaAK1HjMdR9lhKtUf/XSJjyhTOdsQhUG9145AZeN6+/MkimtJrmP7hqLh/iRHKWQMFsKn5gyGC98bZp7gH2uEX064ZuXuX77a88Z6lPMrv/d7KbwWgrmnkHU8RFEhLNP7unpNasM6dEB1ck4vnrxCPzginFGYWQaoPfwtZPwn1su8JWHuY96dKjAK3OnewLn+v4TBnXzTf8gF5RR0QVY2DiZoHp96n1DMf+6KQDgmRojW/aRaTI5iWdC2Yh+yo+d6Y5tmD6qN+67coJzf2pdwoS22pbCrK2zT+6Jz5w7zLNt/nVTcP15w4o6EZ6kbLOPJLqlYJqCIiNc99Hx5jTGDeyKMwZ1w7yXNzmrlels2nsMEwd1w82GZfj0QLMuPI42pkIshZTROsn2wgVh6rG454x59pMNXuQ4S6OqFE7qWo2fKOb4J84agpVbDuK/pli9ZFkdeY9BghJwFVpQTGHi4K6e3rA8VTwGnNLbdbeZhLdULqbLJ+MxUGX2+y/UC1xTEcf6b80wbpPPyaTcJw/tYRRSprYi3SdxgxI0nUNXog2GWX315xrW1oLqBVjjN26eMdIzuC+b+0hP71SJmoUetNudl4912tWLN0/D9oPHMedBa+Ej08BAE2os86WvT0MyHnPWAjddd8qwHpgyrAcWrt1l2FpYytZSkJ113eo1WgpC/rcshQ4V8axBsa/8eRWONqaM+f96qmIiRjhzSDen55tKe91UnZRe6ZEG8xoH2QbGqKgjR8MG2an56TFyJyFLZ0SosJZxAuc8IXXr1akSv//MZJ9PX/pMwywFmZYYZCnoAo6U8nOG93TcMCalUF1hHWtqD8l4DDURUmbDnlEuZBOmAHxZMsk4+QSnbDamtuKMZTAIZuf6yqPQz92QZaBeFPTxEBIiwhemnoIB3VxrOJtSUFOKn/7SuS2um4r6jAZ2r/GkpoYpBTUOoL7CPTtWok/nKufYsOQX2dHgEc1FQJr8etA3bLEbIaw5iWoq4pFe1ONNaePLYk1Y5l4nESf8+XNnOxOANaczHvdRr86V2Hz3pejbuQpb9ptXaQp6obIRtgyo2tONkT+mEMRl47wB4ihZPxJ9zzDBKg0ZT6BZEdZ6HeVvJqsj/eKJGPncCKoC9F+XEIsRvv8Ra0BUUIsplKUQ5TR6Pn3YPDumpisVRdQa6+6bbBMq5nPOMKQVO2lId2faEVMAGQBG9Q2ePlx9FlFnq9U7OWr7DnMfqYJcdVvLdiJ/g7DxKrlO2ZIPZasU5MuuC0WT+0gihHDmkQ/rwUoyQhgthfsX1+EPS9wUUHekqXXO5ozwuI/klQ43NGPR+j341Yvv+M4ZNFtnNsJeZv0lq1BjCiH3r2+LMj5AIud/kfMshb0E8qfyjlPwxkFUZLXicZn26mbc6C9ilGVSs72gLXmB//aFs93gd8iz/oTtdqvWlEBYSqbpbE6A37BN3ofq06/ULKVGQ0whV3JpJ/J3DlIEkslD/SOVg37RF742FecO75l1R72To7Z3GVz/2iX+7DO1jZ2iTO0t6y1vP8wKcBRH8C4tpnyVgv3k9UUvwiyFjMdSyH6N5nTGGIADvAupuKmC1s/RnMqgvtFNPZWNTloPr27yr8yVywultqgztJkof/vpSW69lMYeI3KskWwxBd09kUvdLh7dB/fOGS0fz1IAACAASURBVI8b7QE/JuX7/tP74a1vz3QUe1RLwVEC+n/TWAT7xQ9LH45nMeVbohQmDOqGGadZg9zCTvM/s07FW9+eiSqthxqWVWaqbtLJ+vJvHdKjBl+5aAR+8Ylap0zv1ZtiCrmSy/OSHYFE3E1N0Nvdxjtn4pHPTol8zk5VyUhrb+hKQW1nsl5yenUVKWpmntYXM5X5uFwrlez92FJoFeRzz8V9JHv+NRWJSP7iqH7WhKYUmtIZHGtKO7EEveetjgJ1zpHFRWPquQDA1JG98FNlznZ1cRP1HmOapRDWOHWXdS4NmYgwe/xJ7qyb2qCqmoo4fvjRcahIxBzFXpGIOQpUDTT7LAWn3LtdfanVuAPgdR9dpc2u6SoFc5uJ4mIMQ1461CqLWbEDXSDW5Dh1tDqAToeI8MXpwz0ptkHxCp27PjQ2p7UxolIRN1kK3n2S8Zix7ZHnc7CAD0J3H8WVNhrW1qXLWJ0iXsUd2R587USI8i4UZasUXPdRePaRd91lK05QlYwWU7j7KfOKWzoJLcgnp8WQqYf6pUyrZQX1xmUjDXsxB0VIZ40TOdMdZIQwZqno1ywEUtfJXv2Y/p0dE136ZSsSMef3UAPN+jNxYgoxXeG4+8lfWwoH1X105+VjsfnuS5W6BbtcADiuiF9+shb/vPGcLHfqR95flMepdwrC1pAIO13USfmk+6ZC+69z5aRBOHe4fxW1liIHFwqhuAUjJluot6i/W2rbDRpMqHcI5THZMo+kKAkSHa5ibl1LoWxTUoNjCmqGgPD0FJvTGTSlM6ipCB61qLJo/e5IdZGNWfZ+DjpKwfp5pHl547RTcP/iOmPwM0gpJGLWyOOwOYFM88LrxGJuPSxLIXjfKPGWqKi9MKS9PfeU4j6SVwy1FBzhIevp/W66blh2VjZr8UsXjsCHJw7AkJ4dfAMYoyDHr6hZN0HoSsEUaI4yCjbqSFlHAArgux8e61sQp9jI+1Mn58un3elHSMFcmYjhjtmnebb976Wn4qf/ett3DtmB0BXjz66a6JnhVD7ZoLERUqeFWgqx8I5IIShbS0FqY10pqN/VtW4Bd5K66mQ80tw8UbNPnOwDqRTsRexlqps8zVcuHgHAHBwOch+5rqngunSJYN7HiDw9Gb1XJq/zgXH9C9qbIc3vrwrpjAi3FPwxBdjn8loK6ksqPzlB/7CYgiFV07M9RhhiLw+a6+pqAHDaSV3wkysn4NsfPC3rvrr7KGh2zyDC3Ecm1Ey0j505CMNDVm47z7aYRoTskysyZqJOSx+13YUHcq1zXDi6j2/sx2fOHYaV37jId0yQpTBzbD+cMdgNdGd7tm6niy2FVsGdukIr9wgdr+Vw2B5NXF0RR6Yx+9uTbX0DiT5h1iGfpQD7v+Wz1GepBLKn84W5uzpF8D/HiTyzeZoE7upvXoyaZBzbI07lHYW4I7it7+pgQ+leScZjjjQPsxTclFTvfwjgVNvPO9Eeu9DLXtjEtOZz0PnDyCkRQOGyiKux6XXJNSU11062M5Axwr5X1A7Ehaf28a0s1hJkzER9F3JJfQ5CdhRyiQfFAywFHdkRDTp3zw6WVaEuS6rjpq1Grl7OlK9SsNuSfwU0b09UDUQ/tsJahPv0AV3w6jv+DCATw3t3xEZlXvnhvTtix8HjnkXupTKQAaxj9gypMpNGbUTJOKHJkOkRZJXI3keY1RJlqH8s5loKmYzZdeJaNoWMKVjnGtitBut2HsYVZwx0tkmlW5mIOQpL7a0FDeiLay9+WghMHtYDr8y9AP26WMHUMwZ3w8PXTsLkoT2M51DPU0xTPl9yXZc6ij9bRXZCou5fSIUAuPenuo/yaXf6IQmnbUQ/h6MUsih+131k3j52QBf8/trJmGRIo9WvVUzK3n2kN+qM8CoFVUkcbUyhf5cqnD6ga+TRqnqA94MTTsL5I72BN9WdEY+RM7ZB9jzUKyVjMd8iJ0DIaFDnGi1rTDFyB/dMHNw1VMkUsuHK59y9QwXe+c4sfPKswc421X0ks0jC8tZjmivKmRTRVi5SIUjOHd4rtPeXLfuoNTFlH82ZZGVPve+Unr5tua7+Jscp5LCabUGRllBjKuP89lHdtWrcRO8QOckHOSgY2fnItoTsZ88dhqkje2HOmQMD9zlneM9Iba6YlLGlYCsFrTyjuY/0QKNpxagw9EFFFXF/+qA+v7tMZZW9XrXhJuJm91HQVBJdapI40phqsWkdjxH6d63GUzedi5N7dfQtCqRiejn/dN0U9Iy4HKb3uu5nXRGnlOyjZJxwvNn/rFTk4W5KqvU/lyVZvecr/guaC0Su5WuyFCYO6ubJnvIca/+PGmjOZfRxMXBiCso4oEIITNl2o06UB7gB4mzuo16dKvGbT00K3ScbJ2JCvKL9skT0fSJ6k4hWE9HfiKirsm0uEdUR0QYiukQpn2GX1RHRLcWqG+CdDltF1QGWpeAVwG6wMtqPU21Yp1b3L6vnak4LrN52CIDbyNRLJeJmS8Ek9B+9/izMHm/5pPt0dsc2qHcsP987ZzzunTM+8D6kADy1X2c7Lz74/k1W1ORhPUJ9pdmuaxJW6uC1Rz93Fm6YdrJHGAYNXpM/uewNhmUYhVFq7qNNd12Kj9VavdBcxynozyYb0uUZNrV5VP5zywV44WtTczpGBtLVDlJU633qSHfRJ/2IeI6dPsC1FE7EtNZt3VJ4FsBcIUSKiL4LYC6ArxPRaABzAIwB0B/AIiIaYR/zAICLAGwDsIyIFggh1hWjcupqaioZLZCpB4vVoG8U9KyTRNxd3lCiZgZ55oyP+y2FigClYApkdu9QgS9fNBJXTR7smZ67f1f/4LfZ40/ylanovWK9cXpmfS1CSqoJNdA8qm9n3xw3ukUmT+Xmi7dUKeS2/7xrakPn+S8EMhsnyoR9Krm6j4go0OrIFX250yg47qPmtPueRDx2/MCu+NsXzsblP/2PZ7Am4Lpyc2nCpnhWsWjTU2cLIZ5Rvi4B8BH782wA84UQjQA2EVEdAGlT1Qkh3gEAIppv71sUpeCuphYWUwDS2mA2x3cdVSlEcB8FDboxWwpkTEk1pZzKEZb9u1Z7Zl29afoINKUy+OWLm3COwb9sQhfO4SOaCx9oNqGOaDYe63smdqDcPk7eQ0vdR1EPl6uyFRM5rUrQMpJBkOJAagvIWWFNrtQoTBjUDXV3zvStuOa6FKOfS2YFngil0J4CzZ8G8JT9+SQAW5Vt2+yyoHIfRHQdES0nouV79+7Nq0L6OgoS3X2kNw53PqJo19HnoEkm/O6jIO1fYYopxMiYO6+e0xnwpRynNqaKRAy3Xjoam+++NPLiPHoVwxpnIXszYQpG6uuglzFonIL8SV2lkF/d3MnJSkeQSkuhKmStahNOunHL57U7IcgxBM1pkXM6rcS0BGc8z5TPeMw/VXkxyGWK/Hxp0RWIaBERrTH8zVb2uRVACsAfWlpZiRDiQSFErRCitlev/IbQi8CYgtd9pDcO2f6i9oZ97qNYzLcWQ5CANVkKyXgs6+A1RynEvcqkJejusjClUNhpLoJ743JKEv1llFlSQb+d/O0HdrPcFr3zCIADboCxlJKPpKWQLRNGx1WYJXQzIcjf/PrzhuGCUVaMQF/BMB+cmUpzfA6JGOX8zPPBb/0WnhY9RSHEhWHbiegaAO8HMF24eXvbAag5WQPsMoSUFxwn+0iPKWjZR3rj0GfazIY+KjIZJ/987AFCVDYyddKuZFBMIRZzsk9MwbJCm52h7qMTFVOwfxo9E+aRz07GQy9t9s14qbt7rpo8GP26VGP6qb2RDyfClM+VhjwthfNs37pcAa8tIGMazekMbph2SqTpWrLhuHJztRSIcs7IuuKMAdh/rCmnY9p0TIGIZgC4GcD5Qoh6ZdMCAI8Q0Y9gBZqHA1gKqyM3nIiGwlIGcwB8vFj1kx6YbNlHujntWgrRrqNbCvFYzO8+CtD+rvvIu68xppCIIU6ElDLaWFUmuaTYRSFoXiHTtpYQxSLTLYVRfTvju/YCON5zWf+F851w4ej8/fyFDKgXisY8Ywr9ulQXLHB8oknGY3kFq03EHYspNzpVJXKeDfb7V4zL8SptP/vofgCVAJ61BdISIcTnhBBriehRWAHkFIAbhBBpACCiGwEsBBAHME8IYV4AuQC4g9e85aqS0FdIU4naG9ZjCvEY/O6jgHNVxqX7SLEUArSRtXqYXTcnVTI/V8AT/30OjjeFT/sdJhAL2W6jCN6oU0iQFmhuKe5zLh2mjeqFdTsPe1KQmejIBZhyHZD48GcmO9NUFJMT0REpZvaRf5UJd9udAO40lD8J4Mli1UklHaAU1MYghMFf7ASa83MfEfzuo6BevFQeuqVg3Dcew8Vj+uKJ1TtbbGKO6d8l6z5hPZZCWiVRThU22Z/KmUO64Yk3dmJw9+D5jHKhFC2FL180EtecPTTSYjGMH/mb5pp8kM8YnHyIxQin9O6Iz5/f8vEhQfCIZq2fpyb2mHqUUgxENeN0M57IL8SCFEzMiQ14Ywom4jHCjz46DrfOOhXfX7gBf3tte+ikaC1FV07FDraazj//uil4es2uyEro6rOHYNqo3hjco0BKwRn4UJDTFYR4jFghtIATMTV1S1n05fOLev4ynvvI+q/3CDzuI2HIPspxxKPffUS+eYpUOT9VmRdJKgNV5ukK5b+mDFIGz8TRv2s17vrQWCz68vkFCbwFcaJ6yWFXmTKsB775gTHRz0VUMIUAqCOaS1mEMLnguARLKaXsBFO2SsHNPgpOSRVC+KwFJ9AcUSjqlkKM/OMU1J7uQ9ec6XzWZ/ME/HnK/z19ON65yxsgrErGPQuDF4NSzLw50WRbo5lpe8TzSz5qV5SvUghYT0F9wa2UVC/u4LVoQtGvAPy9ffVUqoKIGawS/yRvrSOcWSmU3oR4TMvJNyW1PVG2MYWgqbPVeXDSGeHbbpqiOQxT/CBo7WAd9xrKaOSIxxabbPf/wyvGOQvXtFdKbUI8puW4YaLy/VXLVikETp2tracQZClElcW6EB/Rp5NvwEqQfHVXanLLgqaDPtFkUwofPmNAQa9Xii9pKWYfMS0j1/ms2iNlqxQyTqDZ++urX40pqTbRLQVXKcjBQfqRQb19WUdPTEG3FHLQCg9fOwm9OxUmf/2EuY9KWO6egGlomBOMOxVKq1ajVSljpZB98Nq7++rRp7M3vY8MaaImOlYmcLQx5WQaDevlZr3ohwalVEpLQd2sy+Jc3EfnDs9vnigTei+5HDvN7nq5ZSxB2hltbQ6oYlC2fR136mxvufr9hkdWBjaNbMLYXXeZsGTudCy48RxlazQXkDMdgzrbqb58YBkK41KBY5LtkfwGr7UnythSsP77As0h7iRATUkNP39NRQIH6puREUDfLl6Xja5PghTMucN7YuxJXfD/Lhru7huw7nCxWPTl8yNN2lWOneWwGVyZtoljKZTxb1q+SiFgllRdSfiyj+xGk82n/ttPT8Kfl2/1uZ8AU0zBfI5OVUn884vneMpOtNsmaLxDxyqr6XysdiD+tHyrcZ9CXv9jIYudtxayDcwa27eVa8IUCvd1Kl+tULZKwZn7SPvxM5rdGJx9FC6NT+ndEXNnnWrcph+by1xBujJqrZTUykQcm+++FFv21RdVKfTuVFWys3cSEZb+z/SiL7HJnDg4+6iMYwp3fvA0DOpeExpTsL63bJyCCf3InNaDLRGl4Fy/bFuQRe/OVSdkxS3mxKCvzleOlG1rnjysB8YO6BI6zQUAX+tw3EeaMP7kWdEXJ4kaUzBRaoFmHtnMtCfcmEL5qoWyVQqA1WN/e+8xXPLjf2PPkQYAfqWwcc9R3zGAX7B/6cIReO0bF+VVj1wGQemWQqEXz8kVHsDFtCeIs4/KWynIHvqG3Udw7W+WI50RvsZw+wJtnR8yu49iFL3H7x+nEL3OpSaEcxk8xzAlD7uPyjfQDHiF8RvbD+H5N/dkXZUraJZUIorsyiGE9/YvGdMHe480Go8tNRlcakqKYVqCO6K5fNVCWSsFXbB3qko42Ue3zByFu596M/BYfa2bXCwFPdKsC/pffKI28NBS65m3dqCbYQpJa7tjS4Gydh/pP388Ro77yDS+AAhOSY0RRQ66mq4blVIL7JZ79hHTvqi0M8k6VpZvf7l87xx+wS6EG2iuTpofjTzCnwVEkWMDvnEK0Q4zXre1KTUlxTAtYfLQ7rhl5ijMKcHBkieKMlcK1v9OVQkcaUjZK61lO8Y8IR5RdIGt79U5h2Uz2X3EMMWDiPC5809u7Wq0KmVt/Ev5qi7WLYRAjIDGVNp4TFBKKuWZfXTH7DG47PT+keusLqcwfmDXyMcVC1YKDNO+KG9LQRudLIQ1e2qMCJ2rvL13Imt7kAyMEUXuxavZR1dOGpRT71911zx87aTIxxWLZJwwoFs1vnzRiNauCsMwBaCslYIMkrrLKlruo1iMMHVkL/TsWIH3jlozhMaJkBLCl07qnEvRFpOHdg+9rmdN5pzr7B5RCr10IsJLX7+gtavBMEyBKGulIEWyEwsQrvuIiDDztH54eMm7AGxhnBGBUlzK6sVfnYrencyZS96r5ocatygFpcAwTPuirJWCFOQxJaaQEcIRtqqrJhEjNCFYoMsA9NCeHQL2UHf2Hxe5zkqdWCcwDFNoih5oJqKvEJEgop72dyKi+4iojohWE9FEZd+riWij/Xd18etm/VeFfzrj9sDDVjxr0XUVrZCz+4gtBYZhikhRLQUiGgjgYgBblOKZAIbbf5MB/AzAZCLqDuB2ALWwOu0riGiBEOJAsernWATKHOoZ230EeEctu4PWrP+9sriIwvDEFHKU66Y6MQzDFIpiWwo/BnAzvPNLzQbwO2GxBEBXIuoH4BIAzwoh9tuK4FkAM4pZOWceIyXQLIRwvquuGmlNyF5+VTKOP352Souumw9sKTAMU0yKphSIaDaA7UKIVdqmkwCoS3Vts8uCyk3nvo6IlhPR8r1797akjgCUcQrCiiXrFgSgKAVFDuc7jkyNI+QaU4h7so/yuz7DMEwQLXIfEdEiAKYFam8F8D+wXEcFRwjxIIAHAaC2tjbv6QylPHaW4IO1TKfrPgpXCvlOntWSDn68BQqFYRgmGy1SCkKIC03lRDQWwFAAq2zBNQDASiKaBGA7AHVikQF22XYAU7Xyf7WkftmQyiBhDxPed7QRj7y6xbcdcIWxGiTO21LI7zDrmmweMAxTRIriPhJCvCGE6C2EGCKEGALLFTRRCLELwAIAn7SzkKYAOCSE2AlgIYCLiagbEXWDZWUsLEb9JE5MwRb4f1y6xbPd46oxWgp5XrdAlgLDMEyhaY1xCk8CmAWgDkA9gE8BgBBiPxF9C8Aye787hBD7i1kRKeil8E9qiyTEs4wezt99k79g51lJGYYpJidEKdjWgvwsANwQsN88APNORJ0AZRrsAKUQMwaa8x9j4BzXArnO7iOGYYpJWc+SSlqWUTLuFbiqjnBTUl3yTQltWUpqCw5mGIbJQpkrBeu/FPiJWIilYFAAeSuFFpgKHFNgGKaYlLVS0FNPEz5LoUiB5vwO89SDYRimGJS1UtDXU6gICTTLTapIbpXsI1YKDMMUkfJWCtrgNb0Xbhyn4Ak05xtTyF+w89QWDMMUk7JWCsIeCy2tgIzwDo42uo+U7bE8nx5bCgzDlCplrRQksvedzmhKwWgpuNtb0uPPFw40MwxTTMpaKQh78tYgpRAzWAooxDQXLRqnkP+xDMMw2ShrESO9RVJI+91HymeTpZD3OAUe0cwwTGlS3krB/h9oKZhGNCvbWyP7iAPNDMMUk/JWCrYOkJ3vdMa7Xe2Vm2Rx/oPX8jqsRddkGIaJQnkrBV9MwasVsi2yk/90eOw+YhimNClrpeD4j6SlYH+/fIK14JtnOU7jegqtt8iOPtCOYRimEJS1ZNFjChk7pvC1S0YCMK9y1vrTXFj/q5Jl/dMxDFMkWLJAjSl43UnGaS4KoBRagqxbdUX8xF+cYZh2T1krBSG0mILz3drucR/F/O6j1lijuTFlxT2qkqwUGIYpPGWuFKz/pKWkOiuykSF+oAj0/GO++WuFXp0qAQA3TD0l73MwDMME0RrLcZYMTpxZG7xmch/FnEAz/GU5yviWWAodKxPYfPel+Z+AYRgmBLYU4Pb4Mxmv+8irFKz/puU4c81C4qRShmFKlfJWCvo4BVtLOMt0Kk/HtLiNvpxnVFqy8hrDMEwxKW+lIGMK9nc5otkJNJtmSVWOd9ZjyPEpskpgGKZUKWulIHEDzZZWkG6juCn7yBNoztdSyLuqDMMwRYWVAvwT4jkrsZkGrynH5R9TYK3AMExpUtZKQWjjEjLaVNrZAs3OthxzU9lSYBimVClvpWD/l0I9fESz31Jwp8koZi0ZhmFOHGWtFCRuoDnYfWQavCb3z3XmUrYUGIYpVYqqFIjoi0T0JhGtJaLvKeVziaiOiDYQ0SVK+Qy7rI6Ibilm3YCQEc0G95Fpmosu1UnEY4RbZp6a03U5JZVhmFKlaCOaiWgagNkAxgkhGomot10+GsAcAGMA9AewiIhG2Ic9AOAiANsALCOiBUKIdcWqoztOwfqeFgJE5vEHbkzBPb4iEcPb35mV83VZJTAMU6oUc5qLzwO4WwjRCABCiD12+WwA8+3yTURUB2CSva1OCPEOABDRfHvf4ikFZ0SzO3W2x2VkGLxWCIHOhgLDMKVKMd1HIwCcS0SvEtELRHSmXX4SgK3KftvssqByH0R0HREtJ6Lle/fuzbuCeqA4LYQnaGya+6gQcEoqwzClSossBSJaBKCvYdOt9rm7A5gC4EwAjxLRsJZcTyKEeBDAgwBQW1srsuwech77gxJTIMMoZvVzIXQDWwoMw5QqLVIKQogLg7YR0ecB/FVYgwGWElEGQE8A2wEMVHYdYJchpLxIaOMUMsIbR1AsBTf5qOUSnXUCwzClSjHdR38HMA0A7EByBYD3ACwAMIeIKoloKIDhAJYCWAZgOBENJaIKWMHoBUWsny+m4HMfZVmOM29YKzAMU6IUM9A8D8A8IloDoAnA1bbVsJaIHoUVQE4BuEEIkQYAIroRwEIAcQDzhBBri1g/BymjMxkgmQiwFOT/QriPWCswDFOiFE0pCCGaAPxXwLY7AdxpKH8SwJPFqpP/etZ/Z0SznZIqMQWaRd4RDBeOKTAMU6qU9YhmOU5BCul0RnjXZSZ/TKEQsE5gGKZUKW+loMUUAG3AmvJ0CinIeUQzwzClSlkrhapkHABQoSyxFpSSKi2IgriPWn4KhmGYolDWSuHmGSNx47RT8IHx/Z2yoMFrEoGWawU2FBiGKVXKWil0qkriq5eMRFLxE+050uh8JipSoJltBYZhSpSyVgoOEWR0QXv3rBMYhilRWCkgmsCXnqQCGArsPmIYpmRhpYBoHfdCunxYJzAMU6qwUkC0FNGCjlNgU4FhmBKFlUJEqKCBZoZhmNKElQKiuo8KeD3WCgzDlCisFJBroLkA4xTYVmAYpkRhpYBoQtqJA/CEeAzDtGNYKSCakGZBzjBMOcBKISJOoLmV68EwDFNMWCkgoqVwgq/HMAzTGrBSQLSYgjv3EQeaGYZpv7BSwImPKbClwDBMqcJKAdFcQwWd+6gA52AYhikGrBQQcZqLQs59xKYCwzAlCisFROy58xrNDMOUAawU4PXxf+fyscZ9CrrIDmsFhmFKFFYK8Lpzzh3e07yP/b8w6ymwVmAYpjRhpRARuWJnIVJSGYZhShVWChqxmLkXz2MLGIYpB1gpaASJ/gLOh8cwDFOyFE0pENF4IlpCRK8T0XIimmSXExHdR0R1RLSaiCYqx1xNRBvtv6uLVbcwYgH+fo4DMAxTDiSKeO7vAfg/IcRTRDTL/j4VwEwAw+2/yQB+BmAyEXUHcDuAWlgd8hVEtEAIcaCIdfQR4D1i5xHDMGVBMd1HAkBn+3MXADvsz7MB/E5YLAHQlYj6AbgEwLNCiP22IngWwIwi1s9MNunP/iOGYdoxxbQUvgRgIRH9AJbyOdsuPwnAVmW/bXZZULkPIroOwHUAMGjQoIJWOth9VNDLMAzDlCQtUgpEtAhAX8OmWwFMB/D/hBB/IaKPAvg1gAtbcj2JEOJBAA8CQG1tbUH77tkNBTYVGIZpv7RIKQghAoU8Ef0OwE321z8D+JX9eTuAgcquA+yy7bBiDmr5v1pSv3zQLYXuHSqw/1iTk5LKwxQYhmnPFDOmsAPA+fbnCwBstD8vAPBJOwtpCoBDQoidABYCuJiIuhFRNwAX22UnFN1N9OLN07DqtovdlFRWCgzDtGOKGVP4LIB7iSgBoAF2DADAkwBmAagDUA/gUwAghNhPRN8CsMze7w4hxP4i1s+InnraodJ6RBxSYBimHCiaUhBCvATgDEO5AHBDwDHzAMwrVp2iEJSSyjAMUw7wiGaNbIPUONDMMEx7hpWCRuDgNbYgGIYpA1gpaGSb+I4DzQzDtGdYKWgEWwRsKjAM0/5hpaARNKKZYRimHGCloBGkE3jqbIZhygFWChqBcx/Z/zmmwDBMe4aVgkZ25xFrBYZh2i+sFDSC3Ucca2AYpv3DSkEj6+A1NhQYhmnHsFKICNsJDMOUA6wUGIZhGAdWCjnC3iOGYdozrBQiwnFmhmHKAVYKEXEX2WFbgWGY9gsrhYg4y3G2cj0YhmGKCSuFHGFDgWGY9gwrhahwTIFhmDKAlQLDMAzjwEohR9h7xDBMe4aVQkTYe8QwTDnASiFHOCWVYZj2DCuFiPAsqQzDlAOsFBiGYRgHVgoR4ZXXGIYpB1qkFIjoCiJaS0QZIqrVts0lojoi2kBElyjlM+yyOiK6RSkfSkSv2uV/IqKKltSt0LD3iGGYcqCllsIaAB8C8G+1kIhGA5gDYAyAGQB+SkRxIooDeADATACjAVxp7wsA3wXwYyHEKQAOALi2hXUrCoKTUhmG7sGlIAAABv5JREFUace0SCkIIdYLITYYNs0GMF8I0SiE2ASgDsAk+69OCPGOEKIJwHwAs8mK4l4A4DH7+N8C+GBL6lZoiJNSGYYpA4oVUzgJwFbl+za7LKi8B4CDQoiUVl5ycEyBYZj2TCLbDkS0CEBfw6ZbhRD/KHyVskNE1wG4DgAGDRp0Qq7Zq1MlAGB0v84n5HoMwzCtQValIIS4MI/zbgcwUPk+wC5DQPk+AF2JKGFbC+r+pjo9COBBAKitrT0hffeRfTvh7ze8D2P6s1JgGKb9Uiz30QIAc4iokoiGAhgOYCmAZQCG25lGFbCC0QuENUx4MYCP2MdfDaBVrJAwxg/simScs3gZhmm/tDQl9XIi2gbgLABPENFCABBCrAXwKIB1AJ4GcIMQIm1bATcCWAhgPYBH7X0B4OsAvkxEdbBiDL9uSd0YhmGY3MnqPgpDCPE3AH8L2HYngDsN5U8CeNJQ/g6s7CSGYRimlWiRUmDy54WvTUVVMt7a1WAYhvHASqGVGNyjQ2tXgWEYxgdHTRmGYRgHVgoMwzCMAysFhmEYxoGVAsMwDOPASoFhGIZxYKXAMAzDOLBSYBiGYRxYKTAMwzAOrBQYhmEYB1YKDMMwjAMrBYZhGMaBlQLDMAzjwEqBYRiGcWClwDAMwziwUmAYhmEcWCkwDMMwDrzIjs3P/+sMJOPU2tVgGIZpVVgp2Mw4rW9rV4FhGKbVYfcRwzAM48BKgWEYhnFgpcAwDMM4sFJgGIZhHFgpMAzDMA6sFBiGYRgHVgoMwzCMAysFhmEYxoGEEK1dhxZBRHsBvJvn4T0BvFfA6rQmfC+lR3u5D4DvpVRpyb0MFkL00gvbvFJoCUS0XAhR29r1KAR8L6VHe7kPgO+lVCnGvbD7iGEYhnFgpcAwDMM4lLtSeLC1K1BA+F5Kj/ZyHwDfS6lS8Hsp65gCwzAM46XcLQWGYRhGgZUCwzAM41CWSoGIZhDRBiKqI6JbWrs+2SCieUS0h4jWKGXdiehZItpo/+9mlxMR3Wff22oimth6NfdDRAOJaDERrSOitUR0k13e5u6HiKqIaCkRrbLv5f/s8qFE9Kpd5z8RUYVdXml/r7O3D2nN+usQUZyIXiOix+3vbfU+NhPRG0T0OhEtt8vaXPsCACLqSkSPEdGbRLSeiM4q9r2UnVIgojiABwDMBDAawJVENLp1a5WV3wCYoZXdAuA5IcRwAM/Z3wHrvobbf9cB+NkJqmNUUgC+IoQYDWAKgBvs598W76cRwAVCiHEAxgOYQURTAHwXwI+FEKcAOADgWnv/awEcsMt/bO9XStwEYL3yva3eBwBME0KMV3L422L7AoB7ATwthBgFYBys36e49yKEKKs/AGcBWKh8nwtgbmvXK0K9hwBYo3zfAKCf/bkfgA32518AuNK0Xyn+AfgHgIva+v0AqAGwEsBkWCNME3p7A7AQwFn254S9H7V23e36DLAFzAUAHgdAbfE+7DptBtBTK2tz7QtAFwCb9Gdb7HspO0sBwEkAtirft9llbY0+Qoid9uddAPrYn9vM/dluhwkAXkUbvR/b5fI6gD0AngXwNoCDQoiUvYtaX+de7O2HAPQ4sTUO5B4ANwPI2N97oG3eBwAIAM8Q0Qoius4ua4vtayiAvQAest16vyKiDijyvZSjUmh3CKtb0KZyi4moI4C/APiSEOKwuq0t3Y8QIi2EGA+rpz0JwKhWrlLOENH7AewRQqxo7boUiHOEEBNhuVNuIKLz1I1tqH0lAEwE8DMhxAQAx+C6igAU517KUSlsBzBQ+T7ALmtr7CaifgBg/99jl5f8/RFREpZC+IMQ4q92cZu9HwAQQhwEsBiWm6UrESXsTWp9nXuxt3cBsO8EV9XE+wB8gIg2A5gPy4V0L9refQAAhBDb7f97APwNlrJui+1rG4BtQohX7e+PwVISRb2XclQKywAMtzMrKgDMAbCgleuUDwsAXG1/vhqWb16Wf9LORJgC4JBiarY6REQAfg1gvRDiR8qmNnc/RNSLiLran6thxUbWw1IOH7F30+9F3uNHADxv9/RaFSHEXCHEACHEEFjvw/NCiKvQxu4DAIioAxF1kp8BXAxgDdpg+xJC7AKwlYhG2kXTAaxDse+ltYMprRTAmQXgLVj+31tbuz4R6vtHADsBNMPqPVwLy4f7HICNABYB6G7vS7Cyq94G8AaA2tauv3Yv58Ayd1cDeN3+m9UW7wfA6QBes+9lDYDb7PJhAJYCqAPwZwCVdnmV/b3O3j6ste/BcE9TATzeVu/DrvMq+2+tfL/bYvuy6zcewHK7jf0dQLdi3wtPc8EwDMM4lKP7iGEYhgmAlQLDMAzjwEqBYRiGcWClwDAMwziwUmAYhmEcWCkwDMMwDqwUGIZhGIf/D274kw4xnMN0AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.plot(episode_rewards)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "irl",
   "language": "python",
   "name": "irl"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
