"""
Reinforcement learning maze example.

Red rectangle:          explorer.
Black rectangles:       hells       [reward = -1].
Yellow circle:          paradise    [reward = +1].
All other states:       ground      [reward = 0].

This script is the environment part of this example. The RL is in RL_brain.py.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
'''
Imitates the OpenAI MPE Cooperative Navigation environment (similar to simple_spread).

Targets:  two ovals (purple and yellow).
Agents:   three rectangles (pink, aqua and medium sea green).
Obstacle: one black rectangle (currently disabled in the code).

Agents are rewarded based on the minimum agent distance to each landmark;
the original design also penalizes collisions and hitting the black rectangle.

Intended observation of each agent:
1. its own state;
2. the distance between each target's position and its own position;
3. the distance between the other agent's position and its own position.

That is, each agent has four observations:
obs1 = its own position,
obs2 = distance to target 1,
obs3 = distance to target 2,
obs4 = distance to another agent.

Note: the current implementation returns only obs1 (the agent's own normalized position).
'''
from collections import defaultdict

import numpy as np
import time
import sys
if sys.version_info.major == 2:
    import Tkinter as tk
else:
    import tkinter as tk


UNIT = 40   # side length of one grid cell, in pixels
MAZE_H = 6  # grid height, in cells
MAZE_W = 6  # grid width, in cells


class Maze(tk.Tk, object):
    """Tkinter grid world with three agents and two fixed oval targets.

    The window is a ``MAZE_W`` x ``MAZE_H`` grid of ``UNIT``-pixel cells.
    Two ovals (purple ``oval1`` and yellow ``oval2``) are the targets; three
    rectangles (``rect1`` pink, ``rect2`` aqua, ``rect3`` medium sea green)
    are the agents.  No obstacles ("hells") are currently created.

    Attributes set by ``__init__``:
        action_space: list of four action labels.
        n_actions: ``len(action_space)``.
        n_features: fixed at 2.
        canvas: the ``tk.Canvas`` holding every drawn item.
        location_count_1/2/3: per-agent visit counters keyed by
            ``str(observation)``, incremented once per ``step`` with the
            observation taken *before* the move.
    """

    # Fill colours of rect1, rect2 and rect3, in that order.
    _AGENT_FILLS = ('pink', 'aqua', 'mediumseagreen')
    # Pixel offset from a cell's top-left corner to the point items are centred on.
    _CELL_CENTER = 20
    # Half the side length of every drawn oval/rectangle, in pixels.
    _HALF_SIZE = 15

    def __init__(self):
        super(Maze, self).__init__()
        # Four labelled actions.  Note that step() subtracts 1 from every
        # incoming action code before decoding it, so code 0 (or any code
        # outside 1..4) leaves the agent where it is.
        self.action_space = ['u', 'd', 'l', 'r']
        self.n_actions = len(self.action_space)  # TODO: confirm against the learner
        self.n_features = 2  # TODO: confirm against the learner
        self.title('maze')
        # Tk geometry strings are "<width>x<height>", so the width must come
        # from MAZE_W (the original used MAZE_H for both dimensions).
        self.geometry('{0}x{1}'.format(MAZE_W * UNIT, MAZE_H * UNIT))
        self._build_maze()
        self.location_count_1 = defaultdict(int)
        self.location_count_2 = defaultdict(int)
        self.location_count_3 = defaultdict(int)

    def _build_maze(self):
        """Create the canvas, grid lines, both targets and the three agents."""
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)

        # Grid: vertical lines first, then horizontal ones.
        for x in range(0, MAZE_W * UNIT, UNIT):
            self.canvas.create_line(x, 0, x, MAZE_H * UNIT)
        for y in range(0, MAZE_H * UNIT, UNIT):
            self.canvas.create_line(0, y, MAZE_W * UNIT, y)

        # Fixed targets at cell (2, 2) and cell (3, 3).
        self.oval1 = self._create_target(2, 2, 'purple')
        self.oval2 = self._create_target(3, 3, 'yellow')

        self._spawn_agents()

        self.canvas.pack()

    def _create_target(self, column, row, fill):
        """Draw an oval target centred in grid cell (column, row); return its item id."""
        origin = np.array([self._CELL_CENTER, self._CELL_CENTER])
        center = origin + np.array([UNIT * column, UNIT * row])
        half = self._HALF_SIZE
        return self.canvas.create_oval(
            center[0] - half, center[1] - half,
            center[0] + half, center[1] + half,
            fill=fill)

    def _create_agent(self, fill):
        """Draw one agent rectangle in a random cell; return its item id.

        Column and row are each drawn from ``np.random.randint(1, 4)``
        (i.e. 1, 2 or 3), column first.
        """
        origin = np.array([self._CELL_CENTER, self._CELL_CENTER])
        center = origin + np.array([UNIT * np.random.randint(1, 4),
                                    UNIT * np.random.randint(1, 4)])
        half = self._HALF_SIZE
        return self.canvas.create_rectangle(
            center[0] - half, center[1] - half,
            center[0] + half, center[1] + half,
            fill=fill)

    def _spawn_agents(self):
        """Create rect1, rect2 and rect3 (in that order) at random cells."""
        self.rect1, self.rect2, self.rect3 = [
            self._create_agent(fill) for fill in self._AGENT_FILLS]

    def reset(self, reminding_step):
        """Respawn the three agents at random cells and return their observations.

        Args:
            reminding_step: forwarded to ``getObsOrObsNext``, which currently
                ignores it.

        Returns:
            ``[obs_1, obs_2, obs_3]`` as produced by ``getObsOrObsNext``.
        """
        self.update()
        time.sleep(0.1)
        for agent in (self.rect1, self.rect2, self.rect3):
            self.canvas.delete(agent)

        self._spawn_agents()

        obs_1, obs_2, obs_3 = self.getObsOrObsNext(reminding_step)
        return [obs_1, obs_2, obs_3]

    @staticmethod
    def _displacement(action, coords):
        """Translate one raw action code into a pixel offset for one agent.

        The code is decremented by one before decoding: 1 -> up, 0 -> down,
        2 -> right, 3 -> left (after the decrement).  Anything else, or a move
        that would leave the grid, yields a zero offset.

        Args:
            action: raw action code as supplied to ``step``.
            coords: the agent's current canvas bounding box.

        Returns:
            A two-element integer ``np.ndarray`` ``[dx, dy]``.
        """
        code = action - 1
        delta = np.array([0, 0])
        if code == 1:  # up
            if coords[1] > UNIT:
                delta[1] -= UNIT
        elif code == 0:  # down
            if coords[1] < (MAZE_H - 1) * UNIT:
                delta[1] += UNIT
        elif code == 2:  # right
            if coords[0] < (MAZE_W - 1) * UNIT:
                delta[0] += UNIT
        elif code == 3:  # left
            if coords[0] > UNIT:
                delta[0] -= UNIT
        return delta

    @staticmethod
    def _nearest_distance(target_box, agent_boxes):
        """Return the smallest normalized distance from any agent to a target.

        Distances are Euclidean, measured between the top-left corners of the
        bounding boxes, with both axes divided by ``MAZE_H * UNIT``.
        """
        scale = MAZE_H * UNIT
        target_xy = np.array(target_box[:2])
        distances = []
        for box in agent_boxes:
            offset = (np.array(box[:2]) - target_xy) / scale
            distances.append(np.sqrt(offset[0] ** 2 + offset[1] ** 2))
        return min(distances)

    def step(self, action_n, reminding_step):
        """Apply one action per agent and return the resulting transition.

        Args:
            action_n: indexable holding at least three action codes, one per
                agent (see ``_displacement`` for how codes are decoded).
            reminding_step: forwarded to ``getObsOrObsNext``, which currently
                ignores it.

        Returns:
            ``([obs_1, obs_2, obs_3], reward, done)`` where the observations
            are taken after the move.

        Reward:
            * all three agents on ``oval1``: shaping + 128, episode done;
            * all three agents on ``oval2``: shaping + 64, episode done;
            * any agent on either target: shaping + 10;
            * otherwise: 0.
            "Shaping" is ``-max`` over the two targets of the nearest-agent
            normalized distance.
        """
        agents = (self.rect1, self.rect2, self.rect3)

        # Decide every move from the pre-move positions.  Explicit indexing is
        # kept so a too-short action_n still raises IndexError.
        moves = [self._displacement(action_n[index], self.canvas.coords(agent))
                 for index, agent in enumerate(agents)]

        # Visit counters are keyed on the observation *before* the move.
        counters = (self.location_count_1,
                    self.location_count_2,
                    self.location_count_3)
        for counter, obs in zip(counters, self.getObsOrObsNext(reminding_step)):
            counter[str(obs)] += 1

        for agent, delta in zip(agents, moves):
            self.canvas.move(agent, delta[0], delta[1])

        next_coords = [self.canvas.coords(agent) for agent in agents]
        oval1_box = self.canvas.coords(self.oval1)
        oval2_box = self.canvas.coords(self.oval2)

        # Continuous shaping term: the worse-covered target dominates.
        reward = 0
        reward += -max(self._nearest_distance(oval1_box, next_coords),
                       self._nearest_distance(oval2_box, next_coords))

        if all(box == oval1_box for box in next_coords):
            reward += 128
            time.sleep(0.2)  # pause so the success is visible on screen
            done = True
        elif all(box == oval2_box for box in next_coords):
            reward += 64
            time.sleep(0.1)
            done = True
        elif any(box in (oval1_box, oval2_box) for box in next_coords):
            reward += 10
            done = False
        else:
            # NOTE(review): this discards the shaping term computed above, so
            # the distance signal only survives when some agent sits on a
            # target.  Kept as-is to preserve existing training behaviour;
            # confirm whether `reward += 0` was intended.
            reward = 0
            done = False

        obs_1_, obs_2_, obs_3_ = self.getObsOrObsNext(reminding_step)

        return [obs_1_, obs_2_, obs_3_], reward, done

    def render(self):
        """Sleep briefly and process pending Tk events so the window redraws."""
        time.sleep(0.01)
        self.update()

    def getObsOrObsNext(self, reminding_step):
        """Return the current observation of each agent.

        Each observation is a one-element list holding the agent's top-left
        bounding-box corner ``(x, y)`` as an ``np.ndarray``, with both
        components divided by ``MAZE_H * UNIT``.

        Args:
            reminding_step: accepted for interface compatibility; not used.

        Returns:
            A tuple ``(obs_1, obs_2, obs_3)`` for rect1, rect2 and rect3.
        """
        scale = MAZE_H * UNIT
        observations = []
        for agent in (self.rect1, self.rect2, self.rect3):
            own_position = np.array(self.canvas.coords(agent)[:2]) / scale
            observations.append([own_position])
        return tuple(observations)


def update(environment=None, episodes=10, max_steps=200):
    """Drive the environment with a fixed action triple as a smoke-test demo.

    Args:
        environment: object exposing ``reset(reminding_step)``, ``render()``
            and ``step(action_n, reminding_step)``.  Defaults to the
            module-level ``env`` created under ``__main__``.
        episodes: number of episodes to run.
        max_steps: upper bound on steps per episode, so the demo terminates
            even if the fixed actions never reach a terminal state.
    """
    if environment is None:
        environment = env  # module-level instance created under __main__

    for _ in range(episodes):
        # reset/step require a `reminding_step` argument; the number of steps
        # left in the episode is passed here (presumably what the name means;
        # verify against the training script).
        environment.reset(max_steps)
        for steps_left in range(max_steps - 1, -1, -1):
            environment.render()
            # One action code per agent; step() reads three entries.
            actions = [0, 1, 0]
            _, _, done = environment.step(actions, steps_left)
            if done:
                break

if __name__ == '__main__':
    # `env` is intentionally module-level: update() looks it up as a global.
    env = Maze()
    # Schedule the demo driver to start 100 ms after the event loop begins.
    env.after(100, update)
    env.mainloop()



