import numpy as np

from collections import defaultdict
from copy import deepcopy
from ast import literal_eval

#########################################################################################
    
def task_exp(tasks, task):
    """
    Encodes `task` as a sum-of-products expression string over the indices of `tasks`

    Arguments:
    tasks -- sequence of known tasks, each a mapping goal -> value
    task -- mapping goal -> value for the task to encode

    Returns:
    exp -- the string 'm' followed by one '+'-separated clause for every goal
           (taken from tasks[0]) with task[goal]>0. A clause lists, joined by '.',
           the index j of every known task, prefixed with '-' when tasks[j][goal]
           is not positive. With no known tasks the result is just 'm'.
    """
    if not len(tasks) > 0:
        return 'm'

    clauses = ['m']
    for goal in tasks[0].keys():
        if not task[goal] > 0:
            continue
        literals = [
            str(idx) if known[goal] > 0 else '-' + str(idx)
            for idx, known in enumerate(tasks)
        ]
        clauses.append('.'.join(literals))
    return '+'.join(clauses)

def exp_task(tasks, exp):
    """
    Evaluates a sum-of-products expression string back into a task

    Arguments:
    tasks -- sequence of known tasks, each a mapping goal -> value
    exp -- expression string as produced by task_exp ('+'-separated clauses after
           a leading term, each clause a '.'-separated list of task indices,
           with '-' marking a negated index)

    Returns:
    task -- defaultdict(int) with task[goal] = 1 for every goal (taken from tasks[0])
            for which at least one clause is satisfied
    """
    task = defaultdict(int)
    if not len(tasks) > 0:
        return task

    def literal_holds(literal, goal):
        # A plain index requires tasks[idx][goal] > 0; a '-' index requires the opposite.
        idx = abs(int(literal))
        positive = tasks[idx][goal] > 0
        return (not positive) if '-' in literal else positive

    for goal in tasks[0].keys():
        clauses = exp.split('+')[1:]  # the first term is the 'm' marker, not a clause
        satisfied = any(
            all(literal_holds(literal, goal) for literal in clause.split('.'))
            for clause in clauses
        )
        if satisfied:
            task[goal] = 1
    return task

def exp_value(values, exp, n_actions = 5):
    """
    Builds a value function from a sum-of-products expression over known value functions

    Arguments:
    values -- sequence of value functions, each indexed as values[j][state][goal]
    exp -- expression string ('+'-separated clauses after a leading term, each clause a
           '.'-separated list of indices into `values`, with '-' marking negation)
    n_actions -- length of the zero vector returned when `values` is empty

    Returns:
    Q_ -- function (state, goal) -> array. Each clause is the element-wise minimum of its
          literals (a negated literal j contributes values[0] - values[j]); the result is
          the element-wise maximum over clauses, bounded below by zero.
    """
    def Q_(state,goal):
        # Check for emptiness BEFORE indexing values[0]; the original indexed first,
        # so an empty `values` raised IndexError and the guard was never effective.
        if len(values) == 0:
            return np.zeros(n_actions)

        base = values[0][state][goal]
        Q = base*0
        for e1 in exp.split('+')[1:]:
            b = base
            for e2 in e1.split('.'):
                j = abs(int(e2))
                if '-' not in e2:
                    b = np.min([b,values[j][state][goal]],axis=0)
                else:
                    b = np.min([b, (base-values[j][state][goal])], axis=0)
            Q = np.max([Q,b], axis=0)
        return Q
    return Q_

#########################################################################################
def Q_equal(Q1,Q2,epsilon=1e-8):
    """
    Checks whether two action-value tables agree within a tolerance

    Arguments:
    Q1 -- mapping state -> sequence of action values (defines which entries are compared)
    Q2 -- mapping state -> sequence of action values
    epsilon -- largest absolute difference still counted as equal

    Returns:
    True when no compared entry differs by more than epsilon, False otherwise
    """
    return not any(
        abs(Q1[state][action] - Q2[state][action]) > epsilon
        for state in Q1
        for action in range(len(Q1[state]))
    )

def EQ_equal(EQ1,EQ2,epsilon=1e-8):
    """
    Checks whether two goal-conditioned action-value tables agree within a tolerance

    Arguments:
    EQ1 -- mapping state -> goal -> sequence of action values (defines which entries are compared)
    EQ2 -- mapping state -> goal -> sequence of action values
    epsilon -- largest absolute difference still counted as equal

    Returns:
    True when no compared entry differs by more than epsilon, False otherwise
    """
    return not any(
        abs(EQ1[state][goal][action] - EQ2[state][goal][action]) > epsilon
        for state in EQ1
        for goal in EQ1[state]
        for action in range(len(EQ1[state][goal]))
    )

def Q_copy(Q1,Q2):
    """
    Copies every entry of Q1 into Q2 in place

    Arguments:
    Q1 -- source mapping state -> object with a .copy() method
    Q2 -- destination mapping; entries for the states of Q1 are overwritten
    """
    Q2.update((state, q_values.copy()) for state, q_values in Q1.items())

def EQ_copy(EQ1,EQ2):
    """
    Copies every (state, goal) entry of EQ1 into EQ2 in place

    Arguments:
    EQ1 -- source mapping state -> goal -> object with a .copy() method
    EQ2 -- destination mapping; EQ2[state] must be retrievable for every copied state
    """
    for state, per_goal in EQ1.items():
        for goal, q_values in per_goal.items():
            EQ2[state][goal] = q_values.copy()

#########################################################################################
def epsilon_greedy_policy_improvement(env, Q, epsilon=1):
    """
    Builds an epsilon-greedy policy with respect to Q

    Arguments:
    env -- environment; only env.action_space.n (number of actions) is read
    Q -- mapping state -> array of action values
    epsilon -- default exploration probability of the returned policy

    Returns:
    policy_improved -- function (state, epsilon, Q) -> array of action probabilities
    """

    def policy_improved(state, epsilon = epsilon, Q=Q):
        n_actions = env.action_space.n
        # Spread the exploration mass uniformly over all actions.
        probs = np.full(n_actions, epsilon/n_actions, dtype=float)
        q_values = Q[state]
        # Ties for the best value are broken uniformly at random.
        greedy_candidates = np.flatnonzero(q_values == q_values.max())
        greedy_action = np.random.choice(greedy_candidates)
        probs[greedy_action] += 1.0 - epsilon
        return probs

    return policy_improved

def epsilon_greedy_generalised_policy_improvement(env, Q, epsilon = 1):
    """
    Implements generalised policy improvement by acting epsilon-greedily on Q

    Arguments:
    env -- environment; only env.action_space.n (number of actions) is read
    Q -- mapping state -> goal -> array of action values
    epsilon -- default exploration probability of the returned policy

    Returns:
    policy_improved -- function (state, goal, epsilon, Q) -> array of action probabilities.
                       With a goal, it is greedy on Q[state][goal]; with goal=None it is
                       greedy on the element-wise maximum over all goals known in Q[state].
    """

    def policy_improved(state, goal = None, epsilon = epsilon, Q=Q):
        probs = np.ones(env.action_space.n, dtype=float)*(epsilon/env.action_space.n)
        # Compare against None explicitly: a falsy goal (e.g. state 0 or '') is still
        # a real goal and must not fall through to the "all goals" branch.
        if goal is not None:
            values = [Q[state][goal]]
        else:
            values = [Q[state][g] for g in Q[state].keys()]
        if len(values)==0:
            # No goal known for this state yet: pick the "best" action at random.
            best_action = np.random.randint(env.action_space.n)
        else:
            values = np.max(values,axis=0)
            # Ties for the best value are broken uniformly at random.
            best_action = np.random.choice(np.flatnonzero(values == values.max()))
        probs[best_action] += 1.0 - epsilon
        return probs

    return policy_improved

def evaluateQ(env, Q, gamma=0.9):
    """
    Estimates the discounted return of the greedy policy on Q

    Runs 100 episodes of at most 100 steps each, acting with epsilon = 0.

    Arguments:
    env -- environment with reset() and step(action) -> (state, reward, done, info)
    Q -- mapping state -> array of action values
    gamma -- discount factor

    Returns:
    Sum of discounted episode returns divided by 100
    """
    greedy_policy = epsilon_greedy_policy_improvement(env, Q)
    total_return = 0
    for _ in range(100):
        state = env.reset()
        for step in range(100):
            action_probs = greedy_policy(state, epsilon = 0)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            total_return += (gamma**step)*reward
            state = next_state
            if done:
                break
    return total_return/100

def evaluate(env, Q, gamma=0.9):
    """
    Estimates the discounted return of the greedy generalised policy on Q

    Runs 100 episodes of at most 100 steps each, acting with epsilon = 0 and no
    explicit goal.

    Arguments:
    env -- environment with reset() and step(action) -> (state, reward, done, info)
    Q -- mapping state -> goal -> array of action values
    gamma -- discount factor

    Returns:
    Sum of discounted episode returns divided by 100
    """
    greedy_policy = epsilon_greedy_generalised_policy_improvement(env, Q)
    total_return = 0
    for _ in range(100):
        state = env.reset()
        for step in range(100):
            action_probs = greedy_policy(state, epsilon = 0)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            total_return += (gamma**step)*reward
            state = next_state
            if done:
                break
    return total_return/100

def evaluateT(env, goals, task, Q, learned=None, gamma=0.9):
    """
    Estimates the discounted return when acting with composed values where they match the task

    Runs 100 episodes of at most 100 steps each. At the start of every episode the goal
    with the largest composed value is selected. For that goal, the composed value function
    is followed greedily when the composed task agrees with `task`; otherwise the greedy
    generalised policy on Q is used.

    Arguments:
    env -- environment with reset() and step(action) -> (state, reward, done, info)
    goals -- indexable sequence of goals
    task -- mapping goal -> value for the current task
    Q -- mapping state -> goal -> array of action values
    learned -- pair (tasks, values) of previously learned tasks and their value functions;
               it is unpacked unconditionally, so the default None is not usable
    gamma -- discount factor

    Returns:
    Sum of discounted episode returns divided by 100
    """
    known_tasks, known_values = learned
    expression = task_exp(known_tasks, task)
    composed_task = exp_task(known_tasks, expression)
    composed_Q = exp_value(known_values, expression)
    fallback_policy = epsilon_greedy_generalised_policy_improvement(env, Q)

    total_return = 0
    for _ in range(100):
        state = env.reset()
        goal_scores = np.array([composed_Q(state, candidate).max() for candidate in goals])
        goal = goals[goal_scores.argmax()]
        for step in range(100):
            if task[goal] != composed_task[goal]:
                action_probs = fallback_policy(state, goal, epsilon = 0)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            else:
                action = composed_Q(state, goal).argmax()
            next_state, reward, done, _ = env.step(action)
            total_return += (gamma**step)*reward
            state = next_state
            if done:
                break
    return total_return/100

#########################################################################################
def Q_learning(env, Q_init=None, Q_optimal=None, gamma=1, epsilon=0.1, alpha=0.1, maxiter=100, maxstep=100):
    """
    Implements Q_learning

    Arguments:
    env -- environment with which agent interacts
    Q_init -- initial Q table; used (and updated in place) when truthy, otherwise a
              fresh zero-initialised table is created
    Q_optimal -- reference Q table; when truthy, learning runs until more than 1000
                 episodes have finished and Q_equal(Q_optimal, Q) holds
    gamma -- discount factor
    epsilon -- exploration probability of the behaviour policy
    alpha -- learning rate
    maxiter -- maximum number of episodes (used only when Q_optimal is not given)
    maxstep -- not used by this function

    Returns:
    Q -- New estimate of Q function
    stats -- dict with "E" (evaluateQ result recorded on every 10th finished episode),
             "R" (undiscounted sum of rewards per episode) and "T" (total number of steps)
    """
    Q = Q_init if Q_init else defaultdict(lambda: np.zeros(env.action_space.n))
    # NOTE(review): Q_target is never read in this function (only referenced by the
    # commented-out convergence check below).
    Q_target = defaultdict(lambda: np.zeros(env.action_space.n))
    behaviour_policy =  epsilon_greedy_policy_improvement(env, Q, epsilon = epsilon)
    
    stop_cond = lambda k: k > maxiter
    if Q_optimal:
        stop_cond = lambda k: k>1000 and Q_equal(Q_optimal,Q)
                
    stats = {"E":[], "R":[], "T":0}
    k=0  # number of finished episodes
    T=0  # total number of environment steps
    state = env.reset()
    stats["R"].append(0)
    while not stop_cond(k):
        probs = behaviour_policy(state, epsilon = epsilon)
        action = np.random.choice(np.arange(len(probs)), p=probs)            
        state_, reward, done, _ = env.step(action)
        
        stats["R"][k] += reward
        
        # One-step Q-learning update; a terminal transition bootstraps from 0.
        G = 0 if done else np.max(Q[state_])
        TD_target = reward + gamma*G
        TD_error = TD_target - Q[state][action]
        Q[state][action] = Q[state][action] + alpha*TD_error
        
        state = state_
        T+=1
        if done:            
            if k%10==0:
                # evaluateQ steps the same env; it is reset again right below.
                stats["E"].append(evaluateQ(env, Q, gamma))
                
            # # # print(k)
            # if Q_equal(Q,Q_target):
            #     break
            # Q_copy(Q,Q_target)
            
            state = env.reset()
            stats["R"].append(0)  # accumulator for the next episode
            k+=1
    stats["T"] = T
    
    return Q, stats

def Goal_Oriented_Q_learning(env, T_states=None, Q_optimal=None, gamma=1, epsilon=0.1, alpha=0.1, maxiter=100, maxstep=100):
    """
    Implements Goal Oriented Q_learning

    Arguments:
    env -- environment with which agent interacts; env.rmin is read as the reward
           substituted when an episode ends in a state other than the goal being updated
    T_states -- optional iterable of states; str(state) of each seeds the goal memory
    Q_optimal -- only its truthiness is used: when truthy, learning runs until more than
                 1000 episodes have finished and Q equals its snapshot from the end of
                 the previous episode (EQ_equal(Q, Q_target))
    gamma -- discount factor
    epsilon -- exploration probability of the behaviour policy
    alpha -- learning rate
    maxiter -- maximum number of episodes (used only when Q_optimal is not given)
    maxstep -- not used by this function

    Returns:
    Q -- New estimate of Q function, indexed as Q[state][goal][action]
    stats -- dict with "R" (discounted return per episode) and "T" (total number of steps)
    """
    N = env.rmin#min(env.rmin, (env.rmin-env.rmax)*env.diameter)
    Q = defaultdict(lambda: defaultdict(lambda: np.zeros(env.action_space.n)))
    Q_target = defaultdict(lambda: defaultdict(lambda: np.zeros(env.action_space.n)))
    behaviour_policy =  epsilon_greedy_generalised_policy_improvement(env, Q, epsilon = epsilon)
    
    sMem={} # Goals memory
    if T_states:
        for state in T_states:
            # NOTE(review): seeded keys are str(state) while keys added during learning
            # are the raw state; these only coincide if states are already strings.
            sMem[str(state)]=0
    
    stop_cond = lambda k: k > maxiter
    if Q_optimal:
        stop_cond = lambda k: k>1000 and EQ_equal(Q,Q_target)
                
    stats = {"R":[], "T":0}
    k=0  # number of finished episodes
    T=0  # step counter within the current episode
    state = env.reset()
    stats["R"].append(0)
    while not stop_cond(k):
        probs = behaviour_policy(state, epsilon = epsilon)
        action = np.random.choice(np.arange(len(probs)), p=probs)            
        state_, reward, done, _ = env.step(action)
        
        stats["R"][k] += (gamma**T)*reward
        
        if done:
            # The state in which the terminating action was taken becomes a known goal.
            sMem[state] = 0
        
        # Update the value of (state, action) with respect to every known goal.
        for goal in sMem.keys():
            if state != goal and done:  
                # Episode ended somewhere other than this goal: use the penalty N.
                reward_ = N
            else:
                reward_ = reward
            
            G = 0 if done else np.max(Q[state_][goal])
            TD_target = reward_ + gamma*G
            TD_error = TD_target - Q[state][goal][action]
            Q[state][goal][action] = Q[state][goal][action] + alpha*TD_error
                
        state = state_
        T+=1
        if done:
            # print(k)
            # Snapshot Q so the convergence test compares against the previous episode.
            EQ_copy(Q,Q_target)
            
            state = env.reset()
            stats["R"].append(0)  # accumulator for the next episode
            stats["T"] += T
            k+=1
            T=0

    return Q, stats

def GOAL(env, T_states=None, Q_optimal=None, gamma=1, epsilon=0.1, alpha=0.1, maxiter=100, maxstep=100):
    """
    Implements Goal Oriented Q_learning with a per-episode goal and a learned task estimate

    Arguments:
    env -- environment with which agent interacts; env.rmin is read as the reward
           substituted when an episode ends in a state other than the goal being updated
    T_states -- optional iterable of states; str(state) of each seeds the goal memory
    Q_optimal -- only its truthiness is used: when truthy, learning runs until more than
                 1000 episodes have finished and Q equals its snapshot from the end of
                 the previous episode (EQ_equal(Q, Q_target))
    gamma -- discount factor
    epsilon -- exploration probability of the behaviour policy
    alpha -- learning rate (also the step size of the task estimate)
    maxiter -- maximum number of episodes (used only when Q_optimal is not given)
    maxstep -- not used by this function

    Returns:
    task -- defaultdict mapping terminal state -> running average of int(reward>0)
    Q -- New estimate of Q function, indexed as Q[state][goal][action]
    stats -- dict with "E" (evaluate result recorded on every 10th finished episode),
             "R" (discounted return per episode) and "T" (total number of steps)
    """
    N = env.rmin#min(env.rmin, (env.rmin-env.rmax)*env.diameter)
    task = defaultdict(int)
    Q = defaultdict(lambda: defaultdict(lambda: np.zeros(env.action_space.n)))
    Q_target = defaultdict(lambda: defaultdict(lambda: np.zeros(env.action_space.n)))
    behaviour_policy =  epsilon_greedy_generalised_policy_improvement(env, Q)
    
    sMem={} # Goals memory
    if T_states:
        for state in T_states:
            # NOTE(review): seeded keys are str(state) while keys added during learning
            # are the raw state; these only coincide if states are already strings.
            sMem[str(state)]=0
    
    stop_cond = lambda k: k > maxiter
    if Q_optimal:
        stop_cond = lambda k: k>1000 and EQ_equal(Q,Q_target)
                
    stats = {"E":[], "R":[], "T":0}
    k=0  # number of finished episodes
    T=0  # step counter within the current episode
    state = env.reset()
    stats["R"].append(0)
    # `goals` stays empty (so no updates happen) until the first episode terminates;
    # goals seeded from T_states only enter `goals` at that point too.
    goals = []
    goal = None  # None makes the behaviour policy maximise over all known goals
    while not stop_cond(k):
        probs = behaviour_policy(state, goal = goal, epsilon = epsilon)
        action = np.random.choice(np.arange(len(probs)), p=probs)            
        state_, reward, done, _ = env.step(action)  
        
        stats["R"][k] += (gamma**T)*reward
        if done:
            # The state in which the terminating action was taken becomes a known goal.
            sMem[state] = 0
            goals = list(sMem.keys())
                               
        # Update the value of (state, action) with respect to every known goal.
        for goal_ in goals:
            if state != goal_ and done:  
                # Episode ended somewhere other than this goal: use the penalty N.
                reward_ = N
            else:
                reward_ = reward
            
            G = 0 if done else np.max(Q[state_][goal_])
            TD_target = reward_ + gamma*G
            TD_error = TD_target - Q[state][goal_][action]
            Q[state][goal_][action] = Q[state][goal_][action] + alpha*TD_error
        
        T+=1
        if done:
            if k%10==0:
                # evaluate steps the same env; it is reset again below.
                stats["E"].append(evaluate(env, Q, gamma))
                
            # Running average of whether terminating in `state` gave a positive reward.
            task[state] = task[state] + alpha*(int(reward>0) - task[state])
            
            # print(k)
            # Snapshot Q so the convergence test compares against the previous episode.
            EQ_copy(Q,Q_target)
                        
            # sMem[state] = 0
            # goals = list(sMem.keys())
            
            state = env.reset()
            
            # probs = np.ones(len(goals), dtype=float)*(epsilon/len(goals))
            # values = [Q[state][goal].max() for goal in goals]
            # values = np.array(values)
            # best_goal = np.random.choice(np.flatnonzero(values == values.max()))
            # probs[best_goal] += 1.0 - epsilon
            # goal = np.random.choice(goals, p=probs) 
            # Pick the goal to pursue in the next episode uniformly at random.
            goal = np.random.choice(goals)
            
            stats["R"].append(0)  # accumulator for the next episode
            stats["T"] += T
            k+=1
            T=0
        else:
            state = state_

    return task, Q, stats

def GOALT(env, learned = None, T_states=None, Q_optimal=None, gamma=1, epsilon=0.1, alpha=0.1, maxiter=100, maxstep=100):
    """
    Implements Goal Oriented Q_learning that reuses previously learned tasks

    While learning, the current task estimate is expressed as a sum-of-products over the
    learned tasks (task_exp / exp_task / exp_value). For the pursued goal, the composed
    value function is followed when the composed task agrees with the estimate; otherwise
    the greedy generalised policy on the Q being learned is used.

    Arguments:
    env -- environment with which agent interacts; env.rmin is read as the reward
           substituted when an episode ends in a state other than the goal being updated
    learned -- pair (tasks, values) of previously learned tasks and their value functions
    T_states -- optional iterable of states; str(state) of each seeds the goal memory
    Q_optimal -- only its truthiness is used: when truthy, learning runs until more than
                 1000 episodes have finished and Q equals its snapshot from the end of
                 the previous episode (EQ_equal(Q, Q_target))
    gamma -- discount factor
    epsilon -- probability of taking a uniformly random action
    alpha -- learning rate (also the step size of the task estimate)
    maxiter -- maximum number of episodes (used only when Q_optimal is not given)
    maxstep -- not used by this function

    Returns:
    task -- defaultdict mapping terminal state -> running average of int(reward>0)
    Q -- New estimate of Q function, indexed as Q[state][goal][action]
    stats -- dict with "E" (evaluateT result recorded on every 10th finished episode),
             "R" (discounted return per episode) and "T" (total number of steps)
    """
    N = env.rmin#min(env.rmin, (env.rmin-env.rmax)*env.diameter)
    task = defaultdict(int)
    Q = defaultdict(lambda: defaultdict(lambda: np.zeros(env.action_space.n)))
    Q_target = defaultdict(lambda: defaultdict(lambda: np.zeros(env.action_space.n)))
    behaviour_policy =  epsilon_greedy_generalised_policy_improvement(env, Q)
    
    # NOTE(review): `learned` defaults to None, which cannot be unpacked here; callers
    # must always pass a (tasks, values) pair.
    tasks_SOP, values_SOP = learned
    
    # NOTE(review): the memory starts with a placeholder goal 'w' so that a goal can be
    # sampled before any episode has terminated — confirm 'w' is never a real state.
    sMem={'w':0} # Goals memory
    if T_states:
        for state in T_states:
            # NOTE(review): seeded keys are str(state) while keys added during learning
            # are the raw state; these only coincide if states are already strings.
            sMem[str(state)]=0
    
    stop_cond = lambda k: k > maxiter
    if Q_optimal:
        stop_cond = lambda k: k>1000 and EQ_equal(Q,Q_target)
                
    stats = {"E":[], "R":[], "T":0}
    k=0  # number of finished episodes
    T=0  # step counter within the current episode
    state = env.reset()
    stats["R"].append(0)
    goals = list(sMem.keys())  
    goal = np.random.choice(goals)
    
    # Express the current task estimate over the learned tasks and compose their values.
    exp = task_exp(tasks_SOP, task)
    task_ = exp_task(tasks_SOP, exp)
    Q_ = exp_value(values_SOP, exp)      
    
    while not stop_cond(k):           
        if np.random.random()>epsilon:            
            if task[goal]==task_[goal]:
                # The composition reproduces the estimate for this goal: reuse it.
                action = Q_(state,goal).argmax()
            else:
                probs = behaviour_policy(state, goal = goal, epsilon = 0)
                action = np.random.choice(np.arange(len(probs)), p=probs) 
        else:
            # Exploration step: uniformly random action.
            action = np.random.randint(env.action_space.n)
        state_, reward, done, _ = env.step(action)  
        
        stats["R"][k] += (gamma**T)*reward
        if done:
            # The state in which the terminating action was taken becomes a known goal.
            sMem[state] = 0
            goals = list(sMem.keys())
                               
        # Update the value of (state, action) with respect to every known goal.
        for goal_ in goals:
            if state != goal_ and done:  
                # Episode ended somewhere other than this goal: use the penalty N.
                reward_ = N
            else:
                reward_ = reward
            
            G = 0 if done else np.max(Q[state_][goal_])
            TD_target = reward_ + gamma*G
            TD_error = TD_target - Q[state][goal_][action]
            Q[state][goal_][action] = Q[state][goal_][action] + alpha*TD_error
        
        T+=1
        if done:
            if k%10==0:
                # evaluateT steps the same env; it is reset again below.
                stats["E"].append(evaluateT(env, goals, task, Q, learned, gamma))
                
            # Running average of whether terminating in `state` gave a positive reward.
            task[state] = task[state] + alpha*(int(reward>0) - task[state])
            
            # print(k)
            # Snapshot Q so the convergence test compares against the previous episode.
            EQ_copy(Q,Q_target)
                        
            # sMem[state] = 0
            # goals = list(sMem.keys())
            
            state = env.reset()
            
            # probs = np.ones(len(goals), dtype=float)*(epsilon/len(goals))
            # values = [Q[state][goal].max() for goal in goals]
            # values = np.array(values)
            # best_goal = np.random.choice(np.flatnonzero(values == values.max()))
            # probs[best_goal] += 1.0 - epsilon
            # goal = np.random.choice(goals, p=probs) 
            # Pick the goal to pursue in the next episode uniformly at random.
            goal = np.random.choice(goals)
                    
            # The task estimate changed, so rebuild the expression and composed values.
            exp = task_exp(tasks_SOP, task)
            task_ = exp_task(tasks_SOP, exp)
            Q_ = exp_value(values_SOP, exp)
            
            stats["R"].append(0)  # accumulator for the next episode
            stats["T"] += T
            k+=1
            T=0
        else:
            state = state_

    return task, Q, stats


def SFGPIB(envs, T_states=None, Q_optimal=None, gamma=1, epsilon=0.1, alpha=0.1, maxiter=100, maxstep=100):
    """
    Implements SFGPI

    Arguments:
    envs -- list of environments (one per task) with which agent interacts
    T_states -- not used by this function
    Q_optimal -- not used by this function
    gamma -- discount factor
    epsilon -- exploration probability of the behaviour policies
    alpha -- learning rate
    maxiter -- maximum number of episodes
    maxstep -- not used by this function (episodes are cut after 100 steps)

    Returns:
    rs -- per-environment running average of the reward at the last step of each episode,
          indexed as rs[t][state][action]
    Qs -- per-environment value tables, indexed as Qs[t][state][i][action]
    stats -- dict with "R" (discounted return per episode) and "T" (total number of steps)
    """
    
    # Bind `env` as a lambda default so each table uses ITS OWN environment's action
    # count. A plain closure over the comprehension variable is late-binding, so every
    # default factory would read the last environment in `envs`.
    rs = [defaultdict(lambda env=env: np.zeros(env.action_space.n)) for env in envs]
    Qs = [defaultdict(lambda env=env: defaultdict(lambda: np.zeros(env.action_space.n))) for env in envs]
    behaviour_policies =  [epsilon_greedy_generalised_policy_improvement(envs[t], Qs[t]) for t in range(len(envs))]
    
    stop_cond = lambda k: k > maxiter
                
    stats = {"R":[], "T":0}
    k=0  # number of finished episodes
    T=0  # step counter within the current episode
    t = np.random.randint(len(envs))  # index of the environment used this episode
    state = envs[t].reset()
    stats["R"].append(0)
    while not stop_cond(k):
        probs = behaviour_policies[t](state, epsilon = epsilon)
        action = np.random.choice(np.arange(len(probs)), p=probs)            
        state_, reward, done, _ = envs[t].step(action)  
        
        stats["R"][k] += (gamma**T)*reward
                               
        # Update every component i of environment t's table; the bootstrap action is
        # the greedy action of table i on its own component.
        for i in range(len(envs)):      
            a_ = np.argmax(Qs[i][state_][i])
            G = 0 if done else Qs[t][state_][i][a_]
            TD_target = reward + gamma*G
            TD_error = TD_target - Qs[t][state][i][action]
            Qs[t][state][i][action] = Qs[t][state][i][action] + alpha*TD_error
        
        T+=1
        if done or T>100:
            rs[t][state][action] = rs[t][state][action] + alpha*(reward - rs[t][state][action])
            
            # Start the next episode in a randomly chosen environment.
            t = np.random.randint(len(envs))
            state = envs[t].reset()
                        
            stats["R"].append(0)  # accumulator for the next episode
            stats["T"] += T
            k+=1
            T=0
        else:
            state = state_

    return rs, Qs, stats

def SFGPIT(envs, T_states=None, Q_optimal=None, gamma=1, epsilon=0.1, alpha=0.1, maxiter=100, maxstep=100):
    """
    Implements SFGPI

    Arguments:
    envs -- list of environments (one per task) with which agent interacts
    T_states -- not used by this function
    Q_optimal -- not used by this function
    gamma -- discount factor
    epsilon -- exploration probability of the behaviour policies
    alpha -- learning rate
    maxiter -- maximum number of episodes
    maxstep -- not used by this function (episodes are cut after 100 steps)

    Returns:
    rs -- per-environment running average of the reward at the last step of each episode,
          indexed as rs[t][state][action]
    Qs -- per-environment value tables, indexed as Qs[t][state][i][action]
    stats -- dict with "R" (discounted return per episode) and "T" (total number of steps)
    """
    
    # Bind `env` as a lambda default so each table uses ITS OWN environment's action
    # count. A plain closure over the comprehension variable is late-binding, so every
    # default factory would read the last environment in `envs`.
    rs = [defaultdict(lambda env=env: np.zeros(env.action_space.n)) for env in envs]
    Qs = [defaultdict(lambda env=env: defaultdict(lambda: np.zeros(env.action_space.n))) for env in envs]
    behaviour_policies =  [epsilon_greedy_generalised_policy_improvement(envs[t], Qs[t]) for t in range(len(envs))]
    
    stop_cond = lambda k: k > maxiter
                
    stats = {"R":[], "T":0}
    k=0  # number of finished episodes
    T=0  # step counter within the current episode
    t = np.random.randint(len(envs))  # index of the environment used this episode
    state = envs[t].reset()
    stats["R"].append(0)
    while not stop_cond(k):
        probs = behaviour_policies[t](state, epsilon = epsilon)
        action = np.random.choice(np.arange(len(probs)), p=probs)            
        state_, reward, done, _ = envs[t].step(action)  
        
        stats["R"][k] += (gamma**T)*reward
                               
        # Update every component i of environment t's table; the bootstrap action is
        # the greedy action of table i on its own component.
        for i in range(len(envs)):      
            a_ = np.argmax(Qs[i][state_][i])
            G = 0 if done else Qs[t][state_][i][a_]
            TD_target = reward + gamma*G
            TD_error = TD_target - Qs[t][state][i][action]
            Qs[t][state][i][action] = Qs[t][state][i][action] + alpha*TD_error
        
        T+=1
        if done or T>100:
            rs[t][state][action] = rs[t][state][action] + alpha*(reward - rs[t][state][action])
            
            # Start the next episode in a randomly chosen environment.
            t = np.random.randint(len(envs))
            state = envs[t].reset()
                        
            stats["R"].append(0)  # accumulator for the next episode
            stats["T"] += T
            k+=1
            T=0
        else:
            state = state_

    return rs, Qs, stats

#########################################################################################
def EQ_NP(EQ):
    """
    Extracts the greedy action for every (state, goal) pair

    Arguments:
    EQ -- mapping state -> goal -> array of action values

    Returns:
    P -- nested defaultdict with P[state][goal] = argmax of EQ[state][goal]; missing
         entries default to 0
    """
    P = defaultdict(lambda: defaultdict(lambda: 0))
    for state, per_goal in EQ.items():
        for goal, action_values in per_goal.items():
            P[state][goal] = np.argmax(action_values)
    return P
def EQ_P(EQ, goal=None):
    """
    Extracts a greedy policy from a goal-conditioned value table

    Arguments:
    EQ -- mapping state -> goal -> array of action values
    goal -- goal to act for; None means the element-wise maximum over all goals

    Returns:
    P -- defaultdict with P[state] = greedy action (first maximiser); missing states
         default to 0
    """
    P = defaultdict(lambda: 0)
    for state in EQ:
        # Compare against None explicitly: a falsy goal (e.g. state 0 or '') is a
        # real goal and must not be treated as "no goal given".
        if goal is not None:
            P[state] = np.argmax(EQ[state][goal])
        else:
            Vs = [EQ[state][g] for g in EQ[state].keys()]
            P[state] = np.argmax(np.max(Vs,axis=0))
    return P
def Q_P(Q):
    """
    Extracts the greedy policy from an action-value table

    Arguments:
    Q -- mapping state -> array of action values

    Returns:
    P -- defaultdict with P[state] = argmax of Q[state]; missing states default to 0
    """
    P = defaultdict(lambda: 0)
    P.update((state, np.argmax(action_values)) for state, action_values in Q.items())
    return P

def EQ_NV(EQ):
    """
    Extracts the state value for every (state, goal) pair

    Arguments:
    EQ -- mapping state -> goal -> array of action values

    Returns:
    V -- nested defaultdict with V[state][goal] = max of EQ[state][goal]; missing
         entries default to 0
    """
    V = defaultdict(lambda: defaultdict(lambda: 0))
    for state, per_goal in EQ.items():
        for goal, action_values in per_goal.items():
            V[state][goal] = np.max(action_values)
    return V
def EQ_V(EQ, goal=None):
    """
    Extracts state values from a goal-conditioned value table

    Arguments:
    EQ -- mapping state -> goal -> array of action values
    goal -- goal to evaluate; None means the maximum over all goals

    Returns:
    V -- defaultdict with V[state] = largest action value; missing states default to 0
    """
    V = defaultdict(lambda: 0)
    for state in EQ:
        # Compare against None explicitly: a falsy goal (e.g. state 0 or '') is a
        # real goal and must not be treated as "no goal given".
        if goal is not None:
            V[state] = np.max(EQ[state][goal])
        else:
            Vs = [EQ[state][g] for g in EQ[state].keys()]
            V[state] = np.max(np.max(Vs,axis=0))
    return V
def NV_V(NV, goal=None):
    """
    Reduces goal-conditioned state values to plain state values

    Arguments:
    NV -- mapping state -> goal -> value
    goal -- goal to select; None means the maximum over all goals

    Returns:
    V -- defaultdict with V[state] = selected value; missing states default to 0
    """
    V = defaultdict(lambda: 0)
    for state in NV:
        # Compare against None explicitly: a falsy goal (e.g. state 0 or '') is a
        # real goal and must not be treated as "no goal given".
        if goal is not None:
            V[state] = NV[state][goal]
        else:
            Vs = [NV[state][g] for g in NV[state].keys()]
            V[state] = np.max(Vs)
    return V
def Q_V(Q):
    """
    Extracts state values from an action-value table

    Arguments:
    Q -- mapping state -> array of action values

    Returns:
    V -- defaultdict with V[state] = max of Q[state]; missing states default to 0
    """
    V = defaultdict(lambda: 0)
    V.update((state, np.max(action_values)) for state, action_values in Q.items())
    return V

def EQ_Q(EQ, goal=None):
    """
    Reduces a goal-conditioned value table to a plain action-value table

    Arguments:
    EQ -- mapping state -> goal -> array of action values
    goal -- goal to select; None means the element-wise maximum over all goals

    Returns:
    Q -- defaultdict with Q[state] = selected action values (the same array object as
         EQ[state][goal] when a goal is given); missing states default to np.zeros(5)
    """
    Q = defaultdict(lambda: np.zeros(5))
    for state in EQ:
        # Compare against None explicitly: a falsy goal (e.g. state 0 or '') is a
        # real goal and must not be treated as "no goal given".
        if goal is not None:
            Q[state] = EQ[state][goal]
        else:
            Vs = [EQ[state][g] for g in EQ[state].keys()]
            Q[state] = np.max(Vs,axis=0)
    return Q

#########################################################################################
def MAX(Q1, Q2):
    """
    Element-wise maximum of two value tables over the states they share

    Arguments:
    Q1 -- mapping state -> values
    Q2 -- mapping state -> values

    Returns:
    Q -- defaultdict holding the maximum for every state present in both inputs;
         missing states default to 0
    """
    Q = defaultdict(lambda: 0)
    shared_states = set(Q1.keys()) & set(Q2.keys())
    Q.update((s, np.max([Q1[s], Q2[s]], axis=0)) for s in shared_states)
    return Q

def MIN(Q1, Q2):
    """
    Element-wise minimum of two value tables over the states they share

    Arguments:
    Q1 -- mapping state -> values
    Q2 -- mapping state -> values

    Returns:
    Q -- defaultdict holding the minimum for every state present in both inputs;
         missing states default to 0
    """
    Q = defaultdict(lambda: 0)
    shared_states = set(Q1.keys()) & set(Q2.keys())
    Q.update((s, np.min([Q1[s], Q2[s]], axis=0)) for s in shared_states)
    return Q

def AVG(Q1, Q2):
    """
    Average of two value tables over the states they share

    Arguments:
    Q1 -- mapping state -> values
    Q2 -- mapping state -> values

    Returns:
    Q -- defaultdict holding (Q1[s]+Q2[s])/2 for every state present in both inputs;
         missing states default to 0
    """
    Q = defaultdict(lambda: 0)
    shared_states = set(Q1.keys()) & set(Q2.keys())
    Q.update((s, (Q1[s] + Q2[s])/2) for s in shared_states)
    return Q

#########################################################################################
def EQMAX(EQ,rmax=2): #Estimating EQ_max
    """
    Estimates the upper-bound value table EQ_max from EQ

    Arguments:
    EQ -- mapping state -> goal -> array of action values; EQ[g][g] must be retrievable
          for every goal g that appears
    rmax -- value assigned to every action where state and goal coincide

    Returns:
    EQ_max -- nested defaultdict. For s == g every action value is rmax; otherwise
              EQ[s][g] is shifted by rmax - max(EQ[g][g]). Missing entries default
              to np.zeros(5).
    """
    EQ_max = defaultdict(lambda: defaultdict(lambda: np.zeros(5)))
    for s in tuple(EQ.keys()):
        for g in tuple(EQ[s].keys()):
            # Offset that lifts the goal's own best value to rmax.
            shift = rmax - max(EQ[g][g])
            EQ_max[s][g] = EQ[s][g]*0 + rmax if s == g else EQ[s][g] + shift
    return EQ_max

def EQMIN(EQ,rmin=-0.1): #Estimating EQ_min
    """
    Estimates the lower-bound value table EQ_min from EQ

    Arguments:
    EQ -- mapping state -> goal -> array of action values; EQ[g][g] must be retrievable
          for every goal g that appears
    rmin -- value assigned to every action where state and goal coincide

    Returns:
    EQ_min -- nested defaultdict. For s == g every action value is rmin; otherwise
              EQ[s][g] is shifted by rmin - max(EQ[g][g]). Missing entries default
              to np.zeros(5).
    """
    EQ_min = defaultdict(lambda: defaultdict(lambda: np.zeros(5)))
    for s in tuple(EQ.keys()):
        for g in tuple(EQ[s].keys()):
            # Offset that lowers the goal's own best value to rmin.
            shift = rmin - max(EQ[g][g])
            EQ_min[s][g] = EQ[s][g]*0 + rmin if s == g else EQ[s][g] + shift
    return EQ_min

def NOTD(EQ, EQ_max):
    """
    Negates a value table relative to an upper bound

    Arguments:
    EQ -- mapping state -> goal -> array of action values
    EQ_max -- upper-bound table; its (state, goal) pairs define the output entries

    Returns:
    EQ_not -- nested defaultdict with EQ_not[s][g] = EQ_max[s][g] - EQ[s][g]; missing
              entries default to np.zeros(5)
    """
    EQ_not = defaultdict(lambda: defaultdict(lambda: np.zeros(5)))
    for s, upper_row in list(EQ_max.items()):
        for g, upper in list(upper_row.items()):
            EQ_not[s][g] = upper - EQ[s][g]
    return EQ_not

def NOT(EQ, EQ_max=None, EQ_min=None):
    """
    Negates a value table relative to its upper and lower bounds

    Arguments:
    EQ -- mapping state -> goal -> array of action values
    EQ_max -- upper-bound table; estimated with EQMAX(EQ) when missing or empty
    EQ_min -- lower-bound table; estimated with EQMIN(EQ) when missing or empty

    Returns:
    EQ_not -- nested defaultdict with EQ_not[s][g] = (EQ_max[s][g]+EQ_min[s][g]) - EQ[s][g]
              for every (s, g) in EQ_max; missing entries default to np.zeros(5)
    """
    if not EQ_max:
        EQ_max = EQMAX(EQ)
    if not EQ_min:
        EQ_min = EQMIN(EQ)

    EQ_not = defaultdict(lambda: defaultdict(lambda: np.zeros(5)))
    for s in list(EQ_max.keys()):
        upper_row = EQ_max[s]
        for g in list(upper_row.keys()):
            bounds_sum = upper_row[g] + EQ_min[s][g]
            EQ_not[s][g] = bounds_sum - EQ[s][g]
    return EQ_not

def OR(EQ1, EQ2):
    """
    Disjunction of two value tables: element-wise maximum per (state, goal)

    Arguments:
    EQ1 -- mapping state -> goal -> array of action values
    EQ2 -- mapping state -> goal -> array of action values

    Both inputs are indexed for every state and goal found in either one, so each
    must be able to supply entries the other has.

    Returns:
    EQ -- nested defaultdict with the element-wise maximum; missing entries default
          to np.zeros(5)
    """
    EQ = defaultdict(lambda: defaultdict(lambda: np.zeros(5)))
    all_states = set(EQ1.keys()) | set(EQ2.keys())
    for s in all_states:
        all_goals = set(EQ1[s].keys()) | set(EQ2[s].keys())
        for g in all_goals:
            EQ[s][g] = np.max((EQ1[s][g], EQ2[s][g]), axis=0)
    return EQ

def AND(EQ1, EQ2):
    """
    Conjunction of two value tables: element-wise minimum per (state, goal)

    Arguments:
    EQ1 -- mapping state -> goal -> array of action values
    EQ2 -- mapping state -> goal -> array of action values

    Both inputs are indexed for every state and goal found in either one, so each
    must be able to supply entries the other has.

    Returns:
    EQ -- nested defaultdict with the element-wise minimum; missing entries default
          to np.zeros(5)
    """
    EQ = defaultdict(lambda: defaultdict(lambda: np.zeros(5)))
    all_states = set(EQ1.keys()) | set(EQ2.keys())
    for s in all_states:
        all_goals = set(EQ1[s].keys()) | set(EQ2[s].keys())
        for g in all_goals:
            EQ[s][g] = np.min((EQ1[s][g], EQ2[s][g]), axis=0)
    return EQ
#########################################################################################
