{"metadata":{"accelerator":"GPU","colab":{"provenance":[]},"gpuClass":"standard","kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30587,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install gymnasium[classic-control]","metadata":{"id":"LXAtkG4EiC4r","outputId":"42026478-ffb2-41d2-86b4-75274c228366","execution":{"iopub.status.busy":"2023-12-08T16:55:38.173917Z","iopub.execute_input":"2023-12-08T16:55:38.174287Z","iopub.status.idle":"2023-12-08T16:55:54.015973Z","shell.execute_reply.started":"2023-12-08T16:55:38.174258Z","shell.execute_reply":"2023-12-08T16:55:54.014626Z"},"trusted":true},"execution_count":1,"outputs":[{"name":"stdout","text":"Requirement already satisfied: gymnasium[classic-control] in /opt/conda/lib/python3.10/site-packages (0.26.3)\n\u001b[33mWARNING: gymnasium 0.26.3 does not provide the extra 'classic-control'\u001b[0m\u001b[33m\n\u001b[0mRequirement already satisfied: numpy>=1.18.0 in /opt/conda/lib/python3.10/site-packages (from gymnasium[classic-control]) (1.24.3)\nRequirement already satisfied: cloudpickle>=1.2.0 in /opt/conda/lib/python3.10/site-packages (from gymnasium[classic-control]) (2.2.1)\nRequirement already satisfied: gymnasium-notices>=0.0.1 in /opt/conda/lib/python3.10/site-packages (from gymnasium[classic-control]) (0.0.1)\n","output_type":"stream"}]},{"cell_type":"code","source":"import gymnasium as gym\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport os\nimport random\n\nimport torch\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F","metadata":{"id":"MXfLkUeefmds","execution":{"iopub.status.busy":"2023-12-08T16:59:03.463193Z","iopub.execute_input":"2023-12-08T16:59:03.463828Z","iopub.status.idle":"2023-12-08T16:59:03.472368Z","shell.execute_reply.started":"2023-12-08T16:59:03.463782Z","shell.execute_reply":"2023-12-08T16:59:03.469661Z"},"trusted":true},"execution_count":3,"outputs":[]},{"cell_type":"code","source":"# Define the Policy Network\nclass PolicyNetwork(nn.Module):\n    def __init__(self, n_inputs, n_actions):\n        super(PolicyNetwork, self).__init__()\n        self.fc = nn.Linear(n_inputs, 32)\n        self.out = nn.Linear(32, n_actions)\n        self.logsoftmax=torch.nn.LogSoftmax(dim=-1)\n\n    def forward(self, x):\n        x = torch.relu(self.fc(x))\n        logits=self.out(x)\n        return torch.softmax(logits, dim=-1), self.logsoftmax(logits)\n\n# Function to choose an action based on the policy\ndef choose_action(policy_net, state):\n    probabilities, log_probabilities = policy_net(torch.from_numpy(state).float().unsqueeze(0))\n    action = torch.multinomial(probabilities, 1).item()\n    return action, log_probabilities[:, action]\n\n# Training function\ndef train(env, policy_net, optimizer, episodes, gamma=0.99):\n    hist=[]\n    for episode in range(episodes):\n        state = env.reset()[0]\n        done = False\n        log_probs = []\n        rewards = []\n        count=0\n        while not done and count<1000:\n            action, log_prob = choose_action(policy_net, state)\n            # print(\"action:\", action, \"log_prob:\", log_prob)\n            state, reward, done, _, _= env.step(action)\n            log_probs.append(log_prob)\n            rewards.append(reward) #reward = state[0]\n            count+=1\n        env.close()\n        hist.append(sum(rewards))\n        # Update policy\n        discounted_rewards = []\n        R = 0\n        for r in rewards[::-1]:\n            R = r + gamma * R\n            discounted_rewards.insert(0, R)\n        discounted_rewards = torch.tensor(discounted_rewards)\n        policy_loss = []\n        for log_prob, reward in zip(log_probs, discounted_rewards):\n            policy_loss.append(-log_prob * reward)\n        optimizer.zero_grad()\n        policy_loss = torch.cat(policy_loss).sum()\n        policy_loss.backward()\n        optimizer.step()\n\n        if episode % 1000 ==0:\n            print(f'Episode {episode}: total reward -> {sum(rewards)}')\n            \n    return hist","metadata":{"id":"-czOCfcwgyOi"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Main experiment\n# env.close()\n# envV.close()\nhist_vanila=[]\nfor i in range(20):\n    env = gym.make('MountainCar-v0')\n    n_inputs = env.observation_space.shape[0]\n    n_actions = env.action_space.n\n    policy_net = PolicyNetwork(n_inputs, n_actions)\n    optimizer = optim.Adam(policy_net.parameters(), lr=0.01)\n\n    # Train the model\n    hist_vanila.append(train(env, policy_net, optimizer, episodes=500))\n\nenv.close()\n\nimport numpy as np\nhist_vanila = np.array(hist_vanila)","metadata":{"id":"VAV7CNGjhVX-","outputId":"104b2a44-b7e5-4df6-81fe-0e508a20aabe"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class SimpleMLP(nn.Module):\n    def __init__(self, layers_size, activation):\n        super(SimpleMLP, self).__init__()\n        layers = []\n\n        # Hidden layers\n        for i in range(len(layers_size)-1):\n            layers.append(nn.Linear(layers_size[i], layers_size[i+1]))\n            layers.append(nn.ReLU())\n\n        # Sequential container for all layers\n        self.layers = nn.Sequential(*layers)\n        self.layers_size=layers_size\n        self.activation=activation\n\n    def forward(self, x):\n        x = self.layers(x)\n        x = self.activation(x)\n        return x\n\nclass basedOnLibrary:\n\n    def __init__(self):\n        self.all_models = {}\n        self.eval_mode=False\n        self.log_probs = [torch.tensor([0], dtype=torch.float32)]\n        self.rewards = []\n\n        _ = torch.randn(1, 1)\n        _.requires_grad = True\n        self.optimizer=optim.Adam([_], lr=0.01)\n\n    def episode_ends(self, gamma=0.999):\n        if not self.eval_mode:\n            # Update policy\n            discounted_rewards = []\n            R = 0\n            for r in self.rewards[::-1]:\n                R = r + gamma * R\n                discounted_rewards.insert(0, R)\n            discounted_rewards = torch.tensor(discounted_rewards)\n            policy_loss = []\n            for log_prob, reward in zip(self.log_probs, discounted_rewards):\n                policy_loss.append(-log_prob * reward)\n            self.optimizer.zero_grad()\n            policy_loss = torch.cat(policy_loss).sum()\n            policy_loss.backward()\n            self.optimizer.step()\n\n            self.optimizer.zero_grad()\n\n            self.log_probs = [torch.tensor([0], dtype=torch.float32)]\n            self.rewards = []\n\n\n    def eval(self):\n        self.eval_mode = True\n\n    def train(self):\n        self.eval_mode = False\n\n    def get_model(self, model_name, input_size=None, hidden_layers=[32], activation=None, output_size=None):\n        if model_name not in self.all_models:\n            assert input_size is not None\n            assert output_size is not None\n            assert activation is not None\n            self.all_models[model_name] = SimpleMLP([input_size]+hidden_layers+[output_size], activation=activation)\n            self.optimizer.add_param_group({'params':list(self.all_models[model_name].parameters())})\n\n        return self.all_models[model_name]\n\n    @staticmethod\n    def list_to_tensor(list):\n        a=[]\n        for x in list:\n            if torch.is_tensor(x):\n                a.append(x)\n            else:\n                a.append(torch.tensor(x))\n        return a\n\n    def if_based_on(self, line_name, *input, **kwargs):  # we could attach a module on top of more than one argument\n        input=basedOnLibrary.list_to_tensor(input)\n        x = torch.cat(input, dim=0)\n        log_p = self.get_model(line_name, input_size=x.shape[0], output_size=1, activation=F.logsigmoid ,**kwargs)(x)\n        p = torch.exp(log_p)\n        if (not self.eval_mode and random.random() < p) or (self.eval_mode and 0.5 <= p):\n            decision = True\n            log_probability_of_decision = log_p\n        else:\n            decision = False\n            log_probability_of_decision = torch.log(1 - p)\n        if not self.eval_mode:\n            self.log_probs[-1] += log_probability_of_decision\n        return decision\n\n    def choose_based_on(self, line_name, count, *input, **kwargs):\n        input=basedOnLibrary.list_to_tensor(input)\n        x = torch.cat(input, dim=0)\n        log_p = self.get_model(line_name, input_size=x.shape[0], output_size=count, activation=F.log_softmax, **kwargs)(x)\n        if not self.eval_mode:\n            decision=torch.multinomial(torch.exp(log_p), 1).item()\n            self.log_probs[-1] += log_p[decision]\n        else:\n            decision=torch.argmax(log_p).item()\n\n        return decision\n\n    def choose_based_on_continues(self, line_name, count, *input, **kwargs):\n        input=basedOnLibrary.list_to_tensor(input)\n        x = torch.cat(input, dim=0)\n        mean = self.get_model(line_name, input_size=x.shape[0], output_size=1, activation=F.sigmoid , **kwargs)(x)\n        mean=mean*count\n        # Create a normal distribution\n        normal_dist = torch.distributions.Normal(mean, 2)\n\n        # Create an array for numbers 1 to 10\n        numbers = torch.arange(0, count).float()\n\n        # Calculate probabilities for each number\n        probabilities = torch.exp(normal_dist.log_prob(numbers))\n        probabilities_sum = torch.sum(probabilities)\n\n        # Normalize the probabilities\n        probabilities /=probabilities_sum\n\n        if not self.eval_mode:\n            decision=torch.multinomial(probabilities, 1).item()\n            self.log_probs[-1] += normal_dist.log_prob(decision)-torch.log(probabilities_sum)\n        else:\n            decision=torch.argmax(probabilities).item()\n\n        return decision\n\n\n    def recored_reward(self, reward):\n        if not self.eval_mode:\n            self.log_probs.append(torch.tensor([0], dtype=torch.float32))\n            self.rewards.append(reward)","metadata":{"id":"4h3DJtV1doiR","execution":{"iopub.status.busy":"2023-12-08T16:59:12.536386Z","iopub.execute_input":"2023-12-08T16:59:12.536775Z","iopub.status.idle":"2023-12-08T16:59:12.570609Z","shell.execute_reply.started":"2023-12-08T16:59:12.536736Z","shell.execute_reply":"2023-12-08T16:59:12.569476Z"},"trusted":true},"execution_count":4,"outputs":[]},{"cell_type":"code","source":"left, right=0, 2\nvalley_threshold = -0.5\n\ndef car(state, f):\n    x, v = state\n    if x > valley_threshold and v < 0:\n        return left\n    if x < valley_threshold and v > 0:\n        return right\n    if x > valley_threshold:\n        if f.if_based_on(\"right\", state):\n            return left\n        else:\n            return right\n    if x < valley_threshold:\n        if f.if_based_on(\"left\", state):\n            return left\n        else:\n            return right\n","metadata":{"id":"ZGkq-VjckIBG","execution":{"iopub.status.busy":"2023-12-08T16:59:17.921212Z","iopub.execute_input":"2023-12-08T16:59:17.921647Z","iopub.status.idle":"2023-12-08T16:59:17.928899Z","shell.execute_reply.started":"2023-12-08T16:59:17.921611Z","shell.execute_reply":"2023-12-08T16:59:17.927547Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"# Training function\ndef train_basedOn(env, car, f, episodes, gamma=0.9999):\n    hist=[]\n    for episode in range(episodes):\n        state = env.reset()[0]\n        done = False\n        rewards = []\n        count=0\n        while not done and count<1000:\n            action = car(state, f)\n            state, reward, done, _, _= env.step(action)\n            f.recored_reward(reward)\n            rewards.append(reward)\n            count+=1\n        env.close()\n        f.episode_ends()\n        hist.append(sum(rewards))\n\n        if episode % 100 ==0:\n            print(f'Episode {episode}: total reward -> {sum(rewards)}')\n    return hist","metadata":{"id":"ywS4OBZA7c0n","execution":{"iopub.status.busy":"2023-12-08T17:03:49.166269Z","iopub.execute_input":"2023-12-08T17:03:49.166729Z","iopub.status.idle":"2023-12-08T17:03:49.176004Z","shell.execute_reply.started":"2023-12-08T17:03:49.166694Z","shell.execute_reply":"2023-12-08T17:03:49.174901Z"},"trusted":true},"execution_count":10,"outputs":[]},{"cell_type":"code","source":"env = gym.make('MountainCar-v0')\nhist_basedOn=[]\nfor i in range(20):\n    env.close()\n    env = gym.make('MountainCar-v0')\n    f=basedOnLibrary()\n\n    # Train the model\n    hist_basedOn.append(train_basedOn(env, car, f, episodes=2000))\n\nenv.close()\nhist_basedOn = np.array(hist_basedOn)","metadata":{"execution":{"iopub.status.busy":"2023-12-08T17:03:49.481056Z","iopub.execute_input":"2023-12-08T17:03:49.481443Z","iopub.status.idle":"2023-12-08T17:55:01.377329Z","shell.execute_reply.started":"2023-12-08T17:03:49.481415Z","shell.execute_reply":"2023-12-08T17:55:01.376160Z"},"trusted":true},"execution_count":11,"outputs":[{"name":"stdout","text":"Episode 0: total reward -> -229.0\nEpisode 100: total reward -> -237.0\nEpisode 200: total reward -> -167.0\nEpisode 300: total reward -> -266.0\nEpisode 400: total reward -> -154.0\nEpisode 500: total reward -> -163.0\nEpisode 600: total reward -> -268.0\nEpisode 700: total reward -> -176.0\nEpisode 800: total reward -> -160.0\nEpisode 900: total reward -> -150.0\nEpisode 1000: total reward -> -151.0\nEpisode 1100: total reward -> -158.0\nEpisode 1200: total reward -> -167.0\nEpisode 1300: total reward -> -158.0\nEpisode 1400: total reward -> -237.0\nEpisode 1500: total reward -> -155.0\nEpisode 1600: total reward -> -153.0\nEpisode 1700: total reward -> -151.0\nEpisode 1800: total reward -> -159.0\nEpisode 1900: total reward -> -150.0\nEpisode 0: total reward -> -266.0\nEpisode 100: total reward -> -339.0\nEpisode 200: total reward -> -257.0\nEpisode 300: total reward -> -200.0\nEpisode 400: total reward -> -224.0\nEpisode 500: total reward -> -207.0\nEpisode 600: total reward -> -282.0\nEpisode 700: total reward -> -210.0\nEpisode 800: total reward -> -229.0\nEpisode 900: total reward -> -331.0\nEpisode 1000: total reward -> -209.0\nEpisode 1100: total reward -> -295.0\nEpisode 1200: total reward -> -219.0\nEpisode 1300: total reward -> -220.0\nEpisode 1400: total reward -> -222.0\nEpisode 1500: total reward -> -307.0\nEpisode 1600: total reward -> -216.0\nEpisode 1700: total reward -> -255.0\nEpisode 1800: total reward -> -248.0\nEpisode 1900: total reward -> -259.0\nEpisode 0: total reward -> -250.0\nEpisode 100: total reward -> -269.0\nEpisode 200: total reward -> -218.0\nEpisode 300: total reward -> -242.0\nEpisode 400: total reward -> -268.0\nEpisode 500: total reward -> -267.0\nEpisode 600: total reward -> -249.0\nEpisode 700: total reward -> -285.0\nEpisode 800: total reward -> -320.0\nEpisode 900: total reward -> -262.0\nEpisode 1000: total reward -> -225.0\nEpisode 1100: total reward -> -234.0\nEpisode 1200: total reward -> -224.0\nEpisode 1300: total reward -> -280.0\nEpisode 1400: total reward -> -311.0\nEpisode 1500: total reward -> -246.0\nEpisode 1600: total reward -> -179.0\nEpisode 1700: total reward -> -253.0\nEpisode 1800: total reward -> -314.0\nEpisode 1900: total reward -> -314.0\nEpisode 0: total reward -> -245.0\nEpisode 100: total reward -> -179.0\nEpisode 200: total reward -> -179.0\nEpisode 300: total reward -> -234.0\nEpisode 400: total reward -> -260.0\nEpisode 500: total reward -> -287.0\nEpisode 600: total reward -> -244.0\nEpisode 700: total reward -> -267.0\nEpisode 800: total reward -> -209.0\nEpisode 900: total reward -> -232.0\nEpisode 1000: total reward -> -226.0\nEpisode 1100: total reward -> -319.0\nEpisode 1200: total reward -> -214.0\nEpisode 1300: total reward -> -258.0\nEpisode 1400: total reward -> -208.0\nEpisode 1500: total reward -> -228.0\nEpisode 1600: total reward -> -261.0\nEpisode 1700: total reward -> -255.0\nEpisode 1800: total reward -> -298.0\nEpisode 1900: total reward -> -264.0\nEpisode 0: total reward -> -207.0\nEpisode 100: total reward -> -261.0\nEpisode 200: total reward -> -232.0\nEpisode 300: total reward -> -244.0\nEpisode 400: total reward -> -213.0\nEpisode 500: total reward -> -178.0\nEpisode 600: total reward -> -233.0\nEpisode 700: total reward -> -322.0\nEpisode 800: total reward -> -285.0\nEpisode 900: total reward -> -294.0\nEpisode 1000: total reward -> -185.0\nEpisode 1100: total reward -> -331.0\nEpisode 1200: total reward -> -234.0\nEpisode 1300: total reward -> -229.0\nEpisode 1400: total reward -> -256.0\nEpisode 1500: total reward -> -203.0\nEpisode 1600: total reward -> -259.0\nEpisode 1700: total reward -> -223.0\nEpisode 1800: total reward -> -259.0\nEpisode 1900: total reward -> -234.0\nEpisode 0: total reward -> -280.0\nEpisode 100: total reward -> -268.0\nEpisode 200: total reward -> -301.0\nEpisode 300: total reward -> -211.0\nEpisode 400: total reward -> -224.0\nEpisode 500: total reward -> -277.0\nEpisode 600: total reward -> -220.0\nEpisode 700: total reward -> -225.0\nEpisode 800: total reward -> -192.0\nEpisode 900: total reward -> -236.0\nEpisode 1000: total reward -> -216.0\nEpisode 1100: total reward -> -275.0\nEpisode 1200: total reward -> -185.0\nEpisode 1300: total reward -> -209.0\nEpisode 1400: total reward -> -325.0\nEpisode 1500: total reward -> -209.0\nEpisode 1600: total reward -> -223.0\nEpisode 1700: total reward -> -311.0\nEpisode 1800: total reward -> -244.0\nEpisode 1900: total reward -> -189.0\nEpisode 0: total reward -> -387.0\nEpisode 100: total reward -> -213.0\nEpisode 200: total reward -> -176.0\nEpisode 300: total reward -> -205.0\nEpisode 400: total reward -> -280.0\nEpisode 500: total reward -> -216.0\nEpisode 600: total reward -> -284.0\nEpisode 700: total reward -> -215.0\nEpisode 800: total reward -> -194.0\nEpisode 900: total reward -> -311.0\nEpisode 1000: total reward -> -260.0\nEpisode 1100: total reward -> -217.0\nEpisode 1200: total reward -> -263.0\nEpisode 1300: total reward -> -214.0\nEpisode 1400: total reward -> -237.0\nEpisode 1500: total reward -> -318.0\nEpisode 1600: total reward -> -213.0\nEpisode 1700: total reward -> -296.0\nEpisode 1800: total reward -> -331.0\nEpisode 1900: total reward -> -365.0\nEpisode 0: total reward -> -190.0\nEpisode 100: total reward -> -219.0\nEpisode 200: total reward -> -180.0\nEpisode 300: total reward -> -283.0\nEpisode 400: total reward -> -384.0\nEpisode 500: total reward -> -270.0\nEpisode 600: total reward -> -206.0\nEpisode 700: total reward -> -284.0\nEpisode 800: total reward -> -220.0\nEpisode 900: total reward -> -310.0\nEpisode 1000: total reward -> -230.0\nEpisode 1100: total reward -> -190.0\nEpisode 1200: total reward -> -206.0\nEpisode 1300: total reward -> -238.0\nEpisode 1400: total reward -> -283.0\nEpisode 1500: total reward -> -324.0\nEpisode 1600: total reward -> -325.0\nEpisode 1700: total reward -> -231.0\nEpisode 1800: total reward -> -176.0\nEpisode 1900: total reward -> -217.0\nEpisode 0: total reward -> -313.0\nEpisode 100: total reward -> -277.0\nEpisode 200: total reward -> -185.0\nEpisode 300: total reward -> -243.0\nEpisode 400: total reward -> -160.0\nEpisode 500: total reward -> -210.0\nEpisode 600: total reward -> -208.0\nEpisode 700: total reward -> -256.0\nEpisode 800: total reward -> -236.0\nEpisode 900: total reward -> -247.0\nEpisode 1000: total reward -> -149.0\nEpisode 1100: total reward -> -353.0\nEpisode 1200: total reward -> -170.0\nEpisode 1300: total reward -> -298.0\nEpisode 1400: total reward -> -256.0\nEpisode 1500: total reward -> -416.0\nEpisode 1600: total reward -> -263.0\nEpisode 1700: total reward -> -230.0\nEpisode 1800: total reward -> -187.0\nEpisode 1900: total reward -> -228.0\nEpisode 0: total reward -> -152.0\nEpisode 100: total reward -> -244.0\nEpisode 200: total reward -> -158.0\nEpisode 300: total reward -> -160.0\nEpisode 400: total reward -> -179.0\nEpisode 500: total reward -> -153.0\nEpisode 600: total reward -> -160.0\nEpisode 700: total reward -> -230.0\nEpisode 800: total reward -> -156.0\nEpisode 900: total reward -> -156.0\nEpisode 1000: total reward -> -149.0\nEpisode 1100: total reward -> -157.0\nEpisode 1200: total reward -> -161.0\nEpisode 1300: total reward -> -151.0\nEpisode 1400: total reward -> -154.0\nEpisode 1500: total reward -> -162.0\nEpisode 1600: total reward -> -149.0\nEpisode 1700: total reward -> -151.0\nEpisode 1800: total reward -> -152.0\nEpisode 1900: total reward -> -233.0\nEpisode 0: total reward -> -207.0\nEpisode 100: total reward -> -148.0\nEpisode 200: total reward -> -183.0\nEpisode 300: total reward -> -250.0\nEpisode 400: total reward -> -153.0\nEpisode 500: total reward -> -140.0\nEpisode 600: total reward -> -156.0\nEpisode 700: total reward -> -163.0\nEpisode 800: total reward -> -149.0\nEpisode 900: total reward -> -246.0\nEpisode 1000: total reward -> -155.0\nEpisode 1100: total reward -> -155.0\nEpisode 1200: total reward -> -150.0\nEpisode 1300: total reward -> -162.0\nEpisode 1400: total reward -> -159.0\nEpisode 1500: total reward -> -157.0\nEpisode 1600: total reward -> -245.0\nEpisode 1700: total reward -> -153.0\nEpisode 1800: total reward -> -157.0\nEpisode 1900: total reward -> -158.0\nEpisode 0: total reward -> -235.0\nEpisode 100: total reward -> -317.0\nEpisode 200: total reward -> -229.0\nEpisode 300: total reward -> -263.0\nEpisode 400: total reward -> -207.0\nEpisode 500: total reward -> -221.0\nEpisode 600: total reward -> -224.0\nEpisode 700: total reward -> -240.0\nEpisode 800: total reward -> -301.0\nEpisode 900: total reward -> -211.0\nEpisode 1000: total reward -> -176.0\nEpisode 1100: total reward -> -211.0\nEpisode 1200: total reward -> -208.0\nEpisode 1300: total reward -> -270.0\nEpisode 1400: total reward -> -245.0\nEpisode 1500: total reward -> -262.0\nEpisode 1600: total reward -> -186.0\nEpisode 1700: total reward -> -255.0\nEpisode 1800: total reward -> -277.0\nEpisode 1900: total reward -> -184.0\nEpisode 0: total reward -> -172.0\nEpisode 100: total reward -> -212.0\nEpisode 200: total reward -> -241.0\nEpisode 300: total reward -> -179.0\nEpisode 400: total reward -> -251.0\nEpisode 500: total reward -> -244.0\nEpisode 600: total reward -> -217.0\nEpisode 700: total reward -> -191.0\nEpisode 800: total reward -> -190.0\nEpisode 900: total reward -> -210.0\nEpisode 1000: total reward -> -209.0\nEpisode 1100: total reward -> -310.0\nEpisode 1200: total reward -> -211.0\nEpisode 1300: total reward -> -246.0\nEpisode 1400: total reward -> -233.0\nEpisode 1500: total reward -> -191.0\nEpisode 1600: total reward -> -211.0\nEpisode 1700: total reward -> -223.0\nEpisode 1800: total reward -> -465.0\nEpisode 1900: total reward -> -329.0\nEpisode 0: total reward -> -226.0\nEpisode 100: total reward -> -239.0\nEpisode 200: total reward -> -280.0\nEpisode 300: total reward -> -310.0\nEpisode 400: total reward -> -175.0\nEpisode 500: total reward -> -251.0\nEpisode 600: total reward -> -203.0\nEpisode 700: total reward -> -258.0\nEpisode 800: total reward -> -246.0\nEpisode 900: total reward -> -292.0\nEpisode 1000: total reward -> -225.0\nEpisode 1100: total reward -> -178.0\nEpisode 1200: total reward -> -266.0\nEpisode 1300: total reward -> -275.0\nEpisode 1400: total reward -> -226.0\nEpisode 1500: total reward -> -292.0\nEpisode 1600: total reward -> -358.0\nEpisode 1700: total reward -> -319.0\nEpisode 1800: total reward -> -221.0\nEpisode 1900: total reward -> -183.0\nEpisode 0: total reward -> -275.0\nEpisode 100: total reward -> -214.0\nEpisode 200: total reward -> -253.0\nEpisode 300: total reward -> -268.0\nEpisode 400: total reward -> -188.0\nEpisode 500: total reward -> -278.0\nEpisode 600: total reward -> -222.0\nEpisode 700: total reward -> -236.0\nEpisode 800: total reward -> -263.0\nEpisode 900: total reward -> -349.0\nEpisode 1000: total reward -> -255.0\nEpisode 1100: total reward -> -235.0\nEpisode 1200: total reward -> -201.0\nEpisode 1300: total reward -> -339.0\nEpisode 1400: total reward -> -178.0\nEpisode 1500: total reward -> -218.0\nEpisode 1600: total reward -> -298.0\nEpisode 1700: total reward -> -184.0\nEpisode 1800: total reward -> -268.0\nEpisode 1900: total reward -> -251.0\nEpisode 0: total reward -> -274.0\nEpisode 100: total reward -> -157.0\nEpisode 200: total reward -> -225.0\nEpisode 300: total reward -> -163.0\nEpisode 400: total reward -> -157.0\nEpisode 500: total reward -> -160.0\nEpisode 600: total reward -> -161.0\nEpisode 700: total reward -> -159.0\nEpisode 800: total reward -> -152.0\nEpisode 900: total reward -> -154.0\nEpisode 1000: total reward -> -164.0\nEpisode 1100: total reward -> -151.0\nEpisode 1200: total reward -> -159.0\nEpisode 1300: total reward -> -261.0\nEpisode 1400: total reward -> -157.0\nEpisode 1500: total reward -> -160.0\nEpisode 1600: total reward -> -150.0\nEpisode 1700: total reward -> -163.0\nEpisode 1800: total reward -> -159.0\nEpisode 1900: total reward -> -242.0\nEpisode 0: total reward -> -207.0\nEpisode 100: total reward -> -279.0\nEpisode 200: total reward -> -386.0\nEpisode 300: total reward -> -215.0\nEpisode 400: total reward -> -186.0\nEpisode 500: total reward -> -223.0\nEpisode 600: total reward -> -246.0\nEpisode 700: total reward -> -243.0\nEpisode 800: total reward -> -214.0\nEpisode 900: total reward -> -292.0\nEpisode 1000: total reward -> -215.0\nEpisode 1100: total reward -> -189.0\nEpisode 1200: total reward -> -225.0\nEpisode 1300: total reward -> -378.0\nEpisode 1400: total reward -> -216.0\nEpisode 1500: total reward -> -216.0\nEpisode 1600: total reward -> -230.0\nEpisode 1700: total reward -> -209.0\nEpisode 1800: total reward -> -280.0\nEpisode 1900: total reward -> -302.0\nEpisode 0: total reward -> -355.0\nEpisode 100: total reward -> -151.0\nEpisode 200: total reward -> -227.0\nEpisode 300: total reward -> -244.0\nEpisode 400: total reward -> -254.0\nEpisode 500: total reward -> -185.0\nEpisode 600: total reward -> -377.0\nEpisode 700: total reward -> -362.0\nEpisode 800: total reward -> -270.0\nEpisode 900: total reward -> -277.0\nEpisode 1000: total reward -> -224.0\nEpisode 1100: total reward -> -281.0\nEpisode 1200: total reward -> -288.0\nEpisode 1300: total reward -> -306.0\nEpisode 1400: total reward -> -275.0\nEpisode 1500: total reward -> -292.0\nEpisode 1600: total reward -> -287.0\nEpisode 1700: total reward -> -225.0\nEpisode 1800: total reward -> -216.0\nEpisode 1900: total reward -> -235.0\nEpisode 0: total reward -> -241.0\nEpisode 100: total reward -> -212.0\nEpisode 200: total reward -> -265.0\nEpisode 300: total reward -> -248.0\nEpisode 400: total reward -> -221.0\nEpisode 500: total reward -> -248.0\nEpisode 600: total reward -> -294.0\nEpisode 700: total reward -> -185.0\nEpisode 800: total reward -> -262.0\nEpisode 900: total reward -> -220.0\nEpisode 1000: total reward -> -316.0\nEpisode 1100: total reward -> -222.0\nEpisode 1200: total reward -> -210.0\nEpisode 1300: total reward -> -217.0\nEpisode 1400: total reward -> -216.0\nEpisode 1500: total reward -> -223.0\nEpisode 1600: total reward -> -372.0\nEpisode 1700: total reward -> -262.0\nEpisode 1800: total reward -> -235.0\nEpisode 1900: total reward -> -211.0\nEpisode 0: total reward -> -218.0\nEpisode 100: total reward -> -214.0\nEpisode 200: total reward -> -252.0\nEpisode 300: total reward -> -241.0\nEpisode 400: total reward -> -266.0\nEpisode 500: total reward -> -242.0\nEpisode 600: total reward -> -280.0\nEpisode 700: total reward -> -252.0\nEpisode 800: total reward -> -213.0\nEpisode 900: total reward -> -276.0\nEpisode 1000: total reward -> -215.0\nEpisode 1100: total reward -> -261.0\nEpisode 1200: total reward -> -215.0\nEpisode 1300: total reward -> -216.0\nEpisode 1400: total reward -> -243.0\nEpisode 1500: total reward -> -210.0\nEpisode 1600: total reward -> -213.0\nEpisode 1700: total reward -> -212.0\nEpisode 1800: total reward -> -240.0\nEpisode 1900: total reward -> -252.0\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}