import numpy as np
import torch
import matplotlib.pyplot as plt
import random
class ENV:
    """Two-agent planar environment: a learner (index 0) and an expert (index 1).

    Each agent has a 2-D position and a fixed target in ``self.target``
    (row 0 for the learner, row 1 for the expert).  An action is a 2-D
    displacement that ``step`` adds directly to the agent's position.

    Args:
        theta_1: Weight vector applied to the learner's reward features when
            ``feedback`` is falsy.
        theta_2: Base weight vector for the expert's reward features; the
            stored ``self.theta_2`` is ``theta_2 + delta_2`` and is used when
            ``physical`` is falsy.
        feedback: If truthy, the learner's feature weights are
            ``[1.0, 1.0] + delta_1`` instead of ``theta_1``.
        physical: If truthy, the expert's feature weights are ``[1.0, 1.0]``
            instead of ``self.theta_2``.
        delta_1: Offset added to ``[1.0, 1.0]`` for the learner (see
            ``feedback``).
        delta_2: Offset added element-wise to ``theta_2``.

    Weight arguments may be NumPy arrays or plain lists/tuples of numbers.
    """

    # An agent is flagged done when within this distance of its own target.
    _GOAL_RADIUS = 0.05
    # Agent separation at or below which the learner's first feature is
    # replaced by the proximity penalty.
    _LEARNER_PROXIMITY = 0.15
    # Agent separation at or below which the expert's second feature becomes
    # the proximity penalty (it is 0 otherwise).
    _EXPERT_PROXIMITY = 0.1

    def __init__(self, theta_1, theta_2, feedback, physical, delta_1, delta_2):
        self.state = [
            np.array([0, -0.5], dtype=float),
            np.array([0.5, 0], dtype=float),
        ]
        # NOTE: space_bound and action_bound are stored but not read by any
        # method of this class.
        self.space_bound = 1
        self.action_bound = np.array([0.1, 0.1])
        self.target = np.array([[0, 0.5], [-0.5, 0]])
        self.theta_1 = theta_1
        # Convert list/tuple inputs first: `list + list` would concatenate
        # instead of adding element-wise.
        self.theta_2 = self._as_weights(theta_2) + self._as_weights(delta_2)
        self.feedback = feedback
        self.physical = physical
        self.delta_1 = delta_1

    @staticmethod
    def _as_weights(value):
        """Return list/tuple ``value`` as a float array; pass anything else through."""
        if isinstance(value, (list, tuple)):
            return np.asarray(value, dtype=float)
        return value

    @staticmethod
    def _distance(p, q):
        """Euclidean distance between the first two components of ``p`` and ``q``."""
        return ((p[0] - q[0]) ** 2 + (p[1] - q[1]) ** 2) ** 0.5

    @staticmethod
    def _norm(v):
        """Euclidean norm of the first two components of ``v``."""
        return (v[0] ** 2 + v[1] ** 2) ** 0.5

    def learner_reward(self, state, action):
        """Return the learner's reward for ``state`` and ``action``.

        Two features are built from the learner-expert separation, the
        difference between the expert's and the learner's action magnitudes,
        and the learner's distance to its target.  The weighted feature sum
        is returned minus the learner's distance to its target.

        Args:
            state: ``[learner_position, expert_position]``.
            action: ``[learner_action, expert_action]``.
        """
        gap = self._distance(state[0], state[1])
        speed_diff = self._norm(action[1]) - self._norm(action[0])
        goal_dist = self._distance(state[0], self.target[0])

        feature = np.array(
            [-np.exp(-gap) * goal_dist, -np.exp(-speed_diff - 0.5) * goal_dist]
        )
        if gap <= self._LEARNER_PROXIMITY:
            # Agents are close: swap in the proximity penalty.
            feature[0] = -10 * np.exp(5 * -gap)

        if self.feedback:
            weights = np.array([1.0, 1.0]) + self.delta_1
        else:
            weights = self._as_weights(self.theta_1)
        return weights.T @ feature - goal_dist

    def expert_reward(self, state, action):
        """Return the expert's reward for ``state``.

        The first feature is the negated distance from the expert to its
        target; the second is a proximity penalty that is non-zero only when
        the agents are within ``_EXPERT_PROXIMITY`` of each other.

        Args:
            state: ``[learner_position, expert_position]``.
            action: Accepted for symmetry with ``learner_reward``; not used.
        """
        gap = self._distance(state[0], state[1])
        feature = np.array([-self._distance(state[1], self.target[1]), 0])
        if gap <= self._EXPERT_PROXIMITY:
            feature[1] = -10 * np.exp(5 * -gap)

        if self.physical:
            weights = np.array([1.0, 1.0])
        else:
            weights = self._as_weights(self.theta_2)
        return weights.T @ feature

    def step(self, state, action):
        """Apply ``action`` to ``state`` and return ``(next_state, reward, done)``.

        Rewards and done flags are evaluated on the ``state`` passed in,
        i.e. before the displacement is applied.  ``self.state`` is replaced
        by the new state, which is also the first element returned.

        Args:
            state: ``[learner_position, expert_position]``.
            action: ``[learner_displacement, expert_displacement]``.

        Returns:
            A tuple ``(next_state, reward, done)`` where ``next_state`` is a
            list of two float arrays, and ``reward`` and ``done`` are
            two-element lists of floats (``done`` holds 0.0 or 1.0).
        """
        reward = np.zeros(2)
        reward[0] = self.learner_reward(state, action)
        reward[1] = self.expert_reward(state, action)

        done = np.zeros(2)
        for agent in range(2):
            if self._distance(state[agent], self.target[agent]) <= self._GOAL_RADIUS:
                done[agent] = 1

        # Positions are advanced by adding the action components directly.
        self.state = [
            np.array(
                [state[agent][0] + action[agent][0], state[agent][1] + action[agent][1]],
                dtype=float,
            )
            for agent in range(2)
        ]
        return self.state, reward.tolist(), done.tolist()

    def reset(self):
        """Re-sample both start positions uniformly at random and return the state.

        The learner is drawn from x in [-0.1, 0.1], y in [-0.5, -0.4]; the
        expert from x in [0.4, 0.5], y in [-0.1, 0.1].
        """
        learner = np.array([random.uniform(-0.1, 0.1), random.uniform(-0.5, -0.4)])
        expert = np.array([random.uniform(0.4, 0.5), random.uniform(-0.1, 0.1)])
        self.state = [learner, expert]
        return self.state




