

from __future__ import division
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
from random import random
import pandas as pd
import random
import heapq
import math
from tqdm import tqdm
import scipy.stats as st
import seaborn as sns
from typing import Any




# ---------------------------------------------------------------------------
# Initial module-level configuration and bookkeeping containers.
# Several of these names are assigned again further down the script before
# the simulation starts; the values here are only the starting defaults.
# ---------------------------------------------------------------------------
N = 2
M = 1
beta = 1  # base of the per-step discount applied when rewards are accumulated
T_max = 2000

# One slot per index in range(N).
state = [0 for _ in range(N)]
belief = [0 for _ in range(N)]
reward = [0 for _ in range(N)]
action = [0 for _ in range(N)]
total_reward = []

l = list(range(N))
# print(l)
A = set()
B = set()
C = []

value_max = 0

# Uniform 2x2 transition defaults, kept both as arrays and as plain lists
# (the list form is paired with the transition labels below).
transition_active = np.full((2, 2), 0.5)
transition_passive = np.full((2, 2), 0.5)
transitionName_active = [["00", "01"], ["10", "11"]]
transitionMatrix_active = [[0.5, 0.5], [0.5, 0.5]]
transitionName_passive = [["00", "01"], ["10", "11"]]
transitionMatrix_passive = [[0.5, 0.5], [0.5, 0.5]]

q1 = np.full((2, 2), 0.5)
q2 = np.full((2, 2), 0.5)

r2 = np.array([[0.8, 0.2]])  # active
r1 = np.array([[0.3, 0.7]])  # passive

value_list_active = [10, 20]
probability_active = [0.8, 0.2]
value_list_passive = [10, 20]
probability_passive = [0.3, 0.7]

R1 = []
R2 = []
x = y = z = w = 0

# Per-index histories; every inner list is a distinct object.
lists_action = [[] for _ in range(N)]
lists_state = [[0] for _ in range(N)]
lists_belief = [[0] for _ in range(N)]
lists_action_state = [[0] * 4 for _ in range(N)]

optimal = []

# float32 zero tables: one 2x2 table per index, plus a single 1x2 row.
Q_value = [np.zeros((2, 2), dtype='float32') for _ in range(N)]
value_function = np.zeros((1, 2), dtype='float32')

alpha_value = [[0] * 4 for _ in range(N)]

# T_max + 1 entries: the first holds N zeros, the remaining ones start empty.
whittle_index = [[0] * N] + [[] for _ in range(T_max)]


def calculation_value_function(q1,q2,r1,r2,value):
    """Run one value-iteration sweep for a two-state, two-choice model.

    Args:
        q1: 2x2 weights used for the first column of the Q-table.
        q2: 2x2 weights used for the second column of the Q-table.
        r1: 1x2 reward probabilities paired with payoffs -10 (``r1[0][0]``)
            and 10 (``r1[0][1]``).
        r2: 1x2 reward probabilities paired with payoffs 10 (``r2[0][0]``)
            and 20 (``r2[0][1]``).
        value: 1x2 array of current state values. It is UPDATED IN PLACE
            with the row-wise maxima of the new Q-table.

    Returns:
        ``(q_value, value)``: the fresh 2x2 float32 Q-table and the very same
        ``value`` object that was passed in (now mutated).
    """
    discount = 0.9

    # Snapshot both state values before anything is written back.
    v_zero = value[0][0]
    v_one = value[0][1]

    # Expected payoff plus discounted continuation for each of the two
    # outcomes; these two terms are shared by all four Q-table entries.
    first_term = (10 * r1[0][1] + r1[0][1] * discount * v_zero
                  + (-10) * r1[0][0] + r1[0][0] * discount * v_zero)
    second_term = (10 * r2[0][0] + r2[0][0] * discount * v_one
                   + (20) * r2[0][1] + r2[0][1] * discount * v_one)

    q_value = np.zeros((2, 2), dtype='float32')
    for row in range(2):
        q_value[row][0] = q1[row][0] * first_term + q1[row][1] * second_term
        q_value[row][1] = q2[row][0] * first_term + q2[row][1] * second_term
    # print(q_value)

    # Greedy backup: each state's value becomes the larger entry of its row
    # (ties resolve to the first column).
    for row in range(2):
        if q_value[row][0] >= q_value[row][1]:
            value[0][row] = q_value[row][0]
        else:
            value[0][row] = q_value[row][1]

    return q_value, value


def activity_forecast(state,events,transition,days):
    """Simulate a two-state Markov chain and return the state after ``days`` steps.

    Args:
        state: starting state; ``0`` selects row 0, anything else row 1.
        events: 2x2 nested list of transition labels. The labels ``"00"``
            (row 0) and ``"10"`` (row 1) mean "move to state 0"; any other
            drawn label means "move to state 1".
        transition: 2x2 nested sequence of probabilities; row ``s`` is passed
            as ``p=`` to ``np.random.choice`` when the chain is in state ``s``.
        days: number of steps to simulate. ``0`` (or a negative value)
            returns ``state`` unchanged.

    Returns:
        The state (0 or 1) reached after the last step.

    Exactly one ``np.random.choice`` draw is made per simulated step.
    """
    current = state
    step = 0
    # `<` rather than `!=` so a negative `days` cannot loop forever.
    while step < days:
        row = 0 if current == 0 else 1
        change = np.random.choice(events[row], replace=True, p=transition[row])
        to_zero_label = "00" if row == 0 else "10"
        # Advance the chain so the next draw starts from the new state;
        # previously every step was sampled from the initial state.
        current = 0 if change == to_zero_label else 1
        step += 1
    return current



def calculate_transitions(b):
    """Estimate a 2x2 transition matrix from consecutive entries of ``b``.

    Every adjacent pair ``(cur, nxt)`` contributes the soft counts
    ``(1-cur)*(1-nxt)``, ``(1-cur)*nxt``, ``cur*(1-nxt)`` and ``cur*nxt`` to
    the 0->0, 0->1, 1->0 and 1->1 cells. Each row is then normalised; a row
    whose counts sum to zero falls back to ``[0.5, 0.5]``.

    Args:
        b: sequence of numbers (indexable, with ``len``).

    Returns:
        A 2x2 ``numpy`` array. With fewer than two entries in ``b`` the
        uniform matrix ``[[0.5, 0.5], [0.5, 0.5]]`` is returned.
    """
    series = list(b)
    if len(series) < 2:
        return np.array([[0.5, 0.5], [0.5, 0.5]])

    steps = list(zip(series, series[1:]))
    from_zero = (
        sum((1 - cur) * (1 - nxt) for cur, nxt in steps),
        sum((1 - cur) * nxt for cur, nxt in steps),
    )
    from_one = (
        sum(cur * (1 - nxt) for cur, nxt in steps),
        sum(cur * nxt for cur, nxt in steps),
    )

    estimate = np.zeros(shape=(2, 2))
    for row, (to_zero, to_one) in enumerate((from_zero, from_one)):
        if (to_zero + to_one) != 0:
            estimate[row][0] = to_zero / (to_zero + to_one)
            estimate[row][1] = to_one / (to_zero + to_one)
        else:
            # No evidence for this row: use the uniform prior.
            estimate[row][0] = 0.5
            estimate[row][1] = 0.5
    return estimate

def count_state_action(state, action):
    """Count occurrences of each (state, action) combination.

    Args:
        state: sequence indexed in step with ``action``.
        action: sequence whose length determines how many positions are read.

    Returns:
        A list of four counts ordered as
        ``[(0, 0), (0, 1), (1, 0), (1, 1)]`` in (state, action) terms.
        Positions whose pair is none of these four are ignored.
    """
    slot_of = {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}
    tallies = [0, 0, 0, 0]
    for pos, act in enumerate(action):
        slot = slot_of.get((state[pos], act))
        if slot is not None:
            tallies[slot] += 1
    return tallies

def belief_count_state(belief,action):
    """Accumulate belief-weighted (state, action) counts.

    For position ``k`` the weight ``1 - belief[k]`` is credited to state 0 and
    ``belief[k]`` to state 1, in the column chosen by ``action[k]``.

    Args:
        belief: sequence of weights; its length determines how many
            positions are read.
        action: sequence indexed in step with ``belief``; only the values
            ``0`` and ``1`` contribute.

    Returns:
        Four totals ordered as ``[(0, 0), (0, 1), (1, 0), (1, 1)]`` in
        (state, action) terms.
    """
    totals = [0, 0, 0, 0]
    for pos, weight_one in enumerate(belief):
        chosen = action[pos]
        if chosen == 0:
            column = 0
        elif chosen == 1:
            column = 1
        else:
            continue
        totals[column] = totals[column] + (1 - weight_one)
        totals[column + 2] = totals[column + 2] + weight_one
    return totals




def number_of_certain_probability(sequence, probability):
    """Draw one item from ``sequence`` using the weights in ``probability``.

    A single uniform sample on [0, 1) is compared against the running sum of
    the weights; the first item whose cumulative weight exceeds the sample is
    returned. If the weights sum to less than the sample, the last paired
    item is returned.

    Args:
        sequence: candidate items.
        probability: weight for each item, paired positionally with
            ``sequence`` (extra entries on either side are ignored).

    Returns:
        The selected item.

    Raises:
        ValueError: if there is no (item, weight) pair to choose from.
            Previously this case surfaced as an ``UnboundLocalError``.
    """
    candidates = list(zip(sequence, probability))
    if not candidates:
        raise ValueError("sequence and probability must both be non-empty")

    draw = np.random.uniform(0, 1)
    cumulative_probability = 0.0
    for item, item_probability in candidates:
        cumulative_probability += item_probability
        if draw < cumulative_probability:
            return item
    # Weights did not cover the sample: fall back to the last candidate.
    return candidates[-1][0]

def b_update(reward_active,reward_passive,transition1,transition2,reward,b):
    """Return the updated belief ``b`` after observing ``reward``.

    Args:
        reward_active: 1x2 table; only ``[0][0]`` is read.
        reward_passive: 1x2 table; only ``[0][1]`` is read.
        transition1: 2x2 transition table used when ``reward == 10``.
        transition2: 2x2 transition table used when ``reward == 0``.
        reward: observed reward; the values 10, 20, -10 and 0 are handled.
        b: current belief.

    Returns:
        * ``reward == 10``: a ratio of two weighted sums (0 if the
          denominator is zero).
        * ``reward == 20``: ``1``.
        * ``reward == -10``: ``0``.
        * ``reward == 0``: ``b`` propagated through ``transition2``.
        * anything else: ``0``.
    """
    if reward == 10:
        like_active = reward_active[0][0]
        like_passive = reward_passive[0][1]
        numerator = like_active * transition1[1][1] * b + like_passive * transition1[0][1] * (1 - b)
        # print(numerator)
        denominator = numerator + like_passive * transition1[0][0] * (1 - b) + like_active * transition1[1][0] * b
        # print(denominator)
        return numerator / denominator if denominator != 0 else 0

    # NOTE(review): these two branches return a hard 1 / 0 without applying
    # any transition table, unlike the other two branches - confirm intended.
    if reward == 20:
        return 1
    if reward == -10:
        return 0

    if reward == 0:
        return b * transition2[1][1] + (1 - b) * transition2[0][1]

    return 0


# def b_update(reward_active,reward_passive,transition1,transition2,reward,b):
#     P= reward_passive
#     b_new=0
#     if reward== 1:
#         sum1 = Q[0][0] * transition1[1][1] * b + P[0][0] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][0] * transition1[0][0] * (1 - b) + Q[0][0] * transition1[1][0] * b
#         #print(sum2)
#         if sum2!=0:
#             b_new=sum1/sum2
#
#     if  reward== 2:
#         sum1 = Q[0][1] * transition1[1][1] * b + P[0][1] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][1] * transition1[0][0] * (1 - b) + Q[0][1] * transition1[1][0] * b
#         # print(sum2)
#         if sum2 != 0:
#             b_new = sum1 / sum2
#
#     if  reward==0:
#         b_new = b*transition2[1][1]+(1-b)*transition2[0][1]
#     return b_new





# def b_update(reward_active,reward_passive,transition1,transition2,reward,b):
#     Q=reward_active
#     P= reward_passive
#     b_new=0
#
#     if reward== 1:
#         sum1 = Q[0][0] * transition1[1][1] * b + P[0][0] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][0] * transition1[0][0] * (1 - b) + Q[0][0] * transition1[1][0] * b
#         #print(sum2)
#         if sum2!=0:
#             b_new=sum1/sum2
#
#     if  reward== 5:
#         sum1 = Q[0][1] * transition1[1][1] * b + P[0][1] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][1] * transition1[0][0] * (1 - b) + Q[0][1] * transition1[1][0] * b
#         # print(sum2)
#         if sum2 != 0:
#             b_new = sum1 / sum2
#
#     if  reward== 10:
#         sum1 = Q[0][2] * transition1[1][1] * b + P[0][2] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][2] * transition1[0][0] * (1 - b) + Q[0][2] * transition1[1][0] * b
#         # print(sum2)
#         if sum2 != 0:
#             b_new = sum1 / sum2
#
#     if  reward==0:
#         b_new = b*transition2[1][1]+(1-b)*transition2[0][1]
#     return b_new





def find_max_matrix(T1):
    """Return the largest of the four entries of a 2x2 table.

    Args:
        T1: anything indexable as ``T1[row][col]`` for rows/cols 0 and 1.

    Returns:
        The maximum entry; on ties the earlier entry (row-major) is kept.
    """
    def _larger(first, second):
        # `>=` so that a tie keeps the first argument.
        return first if first >= second else second

    top_row_best = _larger(T1[0][0], T1[0][1])
    bottom_row_best = _larger(T1[1][0], T1[1][1])
    return _larger(top_row_best, bottom_row_best)


# Zero-filled containers sized by N: one 2x2 table and one single-element
# list per index (every inner list is a distinct object).
res = [[[0] * 2 for _ in range(2)] for _ in range(N)]
value_max = [[0] for _ in range(N)]


def array_diff(a, b):
    """Return the distinct items of ``a`` that do not occur in ``b``.

    Both arguments are converted to sets, so items must be hashable,
    duplicates collapse, and the order of the returned list is whatever
    order the resulting set iterates in.
    """
    return list(set(a) - set(b))

def bellman(N,M,state,key):
    """Choose ``M`` of ``N`` indices to activate and return a 0/1 action vector.

    Selection rule:

    1. Collect every index whose ``state`` entry equals ``key``.
    2. If at least ``M`` indices matched, pick ``M`` of them uniformly at
       random (``random.sample``).
    3. Otherwise keep all matches and fill the shortfall from the remaining
       indices in ``range(N)``: those with the largest ``state`` values when
       ``key == 1``, the smallest when ``key == 0``. For any other ``key``
       no fill-in happens. Ties are broken in favour of the lower index.

    Args:
        N: length of the returned action vector.
        M: number of indices to activate.
        state: per-index values (the script passes beliefs).
        key: target value, normally 0 or 1.

    Returns:
        A list of ``N`` ints with ``1`` at the selected indices.

    Fix: the fill-in used to translate values back to positions with
    ``list.index``, which returns the first occurrence for every tied value,
    so ties could activate fewer than ``M`` indices. Indices are now ranked
    directly, which also removes the dependence on set iteration order.
    """
    action = [0] * N
    matched = [pos for pos, value in enumerate(state) if value == key]

    if len(matched) >= M:
        selected = random.sample(matched, M)
    else:
        selected = list(matched)
        already = set(matched)
        remaining = [pos for pos in range(N) if pos not in already]
        shortfall = M - len(matched)
        if key == 1:
            selected.extend(
                heapq.nlargest(shortfall, remaining, key=lambda pos: state[pos])
            )
        elif key == 0:
            selected.extend(
                heapq.nsmallest(shortfall, remaining, key=lambda pos: state[pos])
            )

    for pos in selected:
        action[pos] = 1
    return action


def beta_update(belief,a1,b1,a2,b2,a3,b3,a4,b4,action):
    """Randomly increment one counter in each of two (a, b) pairs.

    The last two entries of ``belief`` define two probabilities:

    * ``p1 = w1 / (w1 + w2)`` with ``w1 = prev*last``, ``w2 = prev*(1-last)``
    * ``p2 = w4 / (w3 + w4)`` with ``w3 = (1-prev)*last``,
      ``w4 = (1-prev)*(1-last)``

    (each is 0 when its denominator is 0). Two ``random.random()`` draws are
    then made, the first compared with ``p1`` and the second with ``p2``:

    * ``action == 1``: first draw updates ``a2``/``b2``, second ``a1``/``b1``.
    * ``action == 0``: first draw updates ``a4``/``b4``, second ``a3``/``b3``.
    * any other ``action``: no draw, nothing changes.

    In each pair the ``a`` counter grows when the draw is below the
    probability, otherwise the ``b`` counter grows.

    Returns:
        The tuple ``(a1, b1, a2, b2, a3, b3, a4, b4)``.
    """
    prev, last = belief[-2], belief[-1]
    w_both = prev * last
    w_prev_only = prev * (1 - last)
    w_last_only = (1 - prev) * last
    w_neither = (1 - prev) * (1 - last)

    p1 = w_both / (w_both + w_prev_only) if (w_both + w_prev_only) != 0 else 0
    p2 = w_neither / (w_last_only + w_neither) if (w_last_only + w_neither) != 0 else 0

    if action == 1:
        if random.random() < p1:
            a2 += 1
        else:
            b2 += 1
        if random.random() < p2:
            a1 += 1
        else:
            b1 += 1
    elif action == 0:
        if random.random() < p1:
            a4 += 1
        else:
            b4 += 1
        if random.random() < p2:
            a3 += 1
        else:
            b3 += 1

    return a1, b1, a2, b2, a3, b3, a4, b4



def find_optimal_action(q1,q2,r1,r2):
    """Run 100 value-iteration sweeps and report which state is worth more.

    Args:
        q1, q2, r1, r2: forwarded unchanged to
            ``calculation_value_function`` on every sweep.

    Returns:
        ``0`` if the best Q-value of state 0 (row 0 of the final Q-table) is
        greater than or equal to the best Q-value of state 1 (row 1),
        otherwise ``1``.
    """
    # Must be a floating-point array: calculation_value_function writes its
    # results into this array in place, and an integer array (as used
    # before) silently truncated every sweep's values.
    value = np.zeros((1, 2), dtype='float32')

    q_table = np.zeros((2, 2), dtype='float32')
    for _ in range(100):
        # `value` is mutated in place, so each sweep continues from the last.
        q_table, _ = calculation_value_function(q1, q2, r1, r2, value)

    best_state0 = q_table[0][0] if q_table[0][0] >= q_table[0][1] else q_table[0][1]
    best_state1 = q_table[1][0] if q_table[1][0] >= q_table[1][1] else q_table[1][1]

    return 0 if best_state0 >= best_state1 else 1


def  spectral_mattix(Y,):
    """Build lagged moment tensors and return the product ``U3 . pinv(theta) . lam``.

    Only ``len(Y)`` is taken from the argument: the observations themselves
    are replaced by all-zero rows right at the start of the body.

    NOTE(review): within this file the function is referenced only from a
    commented-out call, and several steps below look unfinished (see the
    inline notes) - confirm before relying on the result.

    Args:
        Y: sequence whose length sets the number of time steps.

    Returns:
        ``M3_eatimate``, the result of ``np.dot(np.dot(U3, pinv(theta)), lam)``.
    """

    n=len(Y)
    arm_total=2
    push_each_time=1  # not used anywhere below
    v1_estimate=[]
    v2_estimate=[]

    # The caller's data is discarded here: Y becomes n rows of 2*arm_total zeros.
    Y = [[] for _ in range(n)]
    for i in range(n):
        Y[i] = [0]*(2*arm_total)


    # Y=[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 1]]
    # Three views of the same series, shifted by one step each (Y1 earliest, Y3 latest).
    Y2=Y[1:n-1]
    # print(Y2)
    Y1=Y[0:n-2]
    # print(Y1)
    Y3=Y[2:n]
    # print(Y3)

    # Zero 4x4 seed shared by all accumulators; each `sumXY = sumXY + ...`
    # below builds a new array, so the shared seed itself is never modified.
    Inital=[0,0,0,0]
    sum12=np.tensordot(Inital, Inital, axes=0)
    sum32=sum12
    sum31=sum12
    sum21=sum12
    sum13=sum12
    sum123=sum12
    # print(np.tensordot(Y1[0], Y2[0], axes=0))
    # print(sum12)

    # Accumulate pairwise outer products and the three-way outer product.
    for t in range(len(Y2)):
        sum12=sum12+np.tensordot(Y1[t], Y2[t], axes=0)
        sum32=sum32+np.tensordot(Y3[t], Y2[t], axes=0)
        sum31=sum31+np.tensordot(Y3[t], Y1[t], axes=0)
        sum21=sum21+np.tensordot(Y2[t], Y1[t], axes=0)

        sum13=sum13+np.tensordot(Y1[t], Y3[t], axes=0)
        temp2=np.tensordot(Y1[t], Y2[t], axes=0)
        sum123=sum123+np.tensordot(temp2, Y3[t], axes=0)

    # Convert the sums to averages over the len(Y2) time steps.
    sum12=sum12/(len(Y2))
    sum32=sum32/(len(Y2))
    sum31=sum31/(len(Y2))
    sum21=sum21/(len(Y2))
    sum13=sum13/(len(Y2))
    sum123=sum123/(len(Y2))


    # NOTE(review): `*` on numpy arrays is element-wise; if a matrix product
    # was intended here, np.dot / `@` would be needed - confirm.
    for i in range(len(Y2)):
        v1_estimate.append(sum32*np.linalg.pinv(sum12)*Y1[i])
        v2_estimate.append(sum31*np.linalg.pinv(sum21)*Y2[i])
    # print(v1_estimate)
    # print(v2_estimate)


    sum_M2=np.tensordot(Inital, Inital, axes=0)
    sum_M3=np.tensordot(Inital, Inital, axes=0)

    # Averages of the outer products of the transformed views (and with Y3).
    for t in range(len(Y2)):
        sum_M2=sum_M2+np.tensordot(v1_estimate[t], v2_estimate[t], axes=0)
        temp1=np.tensordot(v1_estimate[t], v2_estimate[t], axes=0)
        temp2=np.tensordot(temp1, Y3[t], axes=0)
        sum_M3=sum_M3+temp2

    sum_M2=sum_M2/(len(Y2))
    sum_M3=sum_M3/(len(Y2))
    # print(sum_M2)
    # print(sum_M3)

    U1, S1, U2T = np.linalg.svd(sum_M2)
    U2=np.transpose(U2T)
    U4, S2, U3T = np.linalg.svd(sum13)
    U3=np.transpose(U3T)


    # NOTE(review): the angle is drawn from [0, 360] but math.cos / math.sin
    # below take radians - confirm the intended unit.
    a=random.uniform(0, 360)
    print(a)



    # Overwrites the U3 obtained from the SVD above with a 2x2 matrix of ones.
    U3=np.array([[1,1],[1,1]])


    # 2x2 rotation matrix for angle a.
    theta=np.array([[math.cos(a),-math.sin(a)],[math.sin(a),math.cos(a)]])
    print(theta[0])
    # print(type(theta[0]))

    lam=np.zeros((2,2))


    temp4=np.dot(np.transpose(U1),sum12)
    temp5=np.dot(temp4,U2)
    part2=np.linalg.pinv(temp5)


    temp7=np.dot(np.transpose(U1),sum123)

    # NOTE(review): sum123 is built from length-4 vectors while temp3 is
    # 2x2, so the np.dot calls below appear to have mismatched dimensions;
    # np.linalg.eig also returns a 1-D array of eigenvalues as its first
    # result, which np.diagonal may reject - verify this loop actually runs.
    for i in range(theta.shape[0]):
        temp3=U3*theta[i]
        temp8=np.dot(temp7,temp3)
        temp9=np.dot(temp8,U2)

        B123=np.dot(temp9,part2)

        D,V=np.linalg.eig(B123)

        lam[i]=np.diagonal(D)

    temp10=np.dot(U3,np.linalg.pinv(theta))


    M3_eatimate=np.dot(temp10,lam)

    return M3_eatimate





# ---------------------------------------------------------------------------
# Simulation setup. Several names assigned near the top of the file are
# assigned again here, and most are assigned once more at the start of every
# run of the simulation loop below.
# ---------------------------------------------------------------------------
access=[0]*N
optimal_solution=0

sum_final=[]

# One entry per simulation run: that run's list of per-step total rewards.
sum_duo=[]



state = [0] * N
belief = [0] * N
reward = [0] * T_max
action = [0] * N
total_reward = []

transition_active = np.array([[0.7, 0.3], [0.3, 0.7]])
transition_passive = np.array([[0.9, 0.1], [0.1, 0.9]])

transitionName_active = [["00", "01"], ["10", "11"]]
transitionMatrix_active = [[0.7, 0.3], [0.3, 0.7]]

transitionName_passive = [["00", "01"], ["10", "11"]]
transitionMatrix_passive = [[0.9, 0.1], [0.1, 0.9]]
q1 = np.array([[0.7, 0.3], [0.3, 0.7]])
q2 = np.array([[0.9, 0.1], [0.1, 0.9]])

value_list_active = [10, 20]
probability_active = [0.2, 0.8]
value_list_passive = [-10, 10]
probability_passive = [0.8, 0.2]

r1 = np.array([[0.8, 0.2]])
r2 = np.array([[0.2, 0.8]])


# Candidates and weights for the uniform index draw made in the first phase
# of the simulation loop (the drawn value is used as an index into `action`).
value_list = [0, 1]
probability = [0.5,0.5]

# Lower bounds applied to the two randomly drawn parameters below.
epison_1=0.1
epison_2=0.1


# Draw `a` from Beta(1, 1) and rescale it into [epison_1, 1]; the simulation
# loop uses it to build the transition matrices.
a = st.beta.rvs(1, 1, size=1)
a = (1-epison_1)*a[0]+epison_1

# Same construction for `b`, rescaled into [epison_2, 1]; the simulation loop
# uses it to build the reward probabilities.
b = st.beta.rvs(1, 1, size=1)
b = (1-epison_2)*b[0]+epison_2


# ---------------------------------------------------------------------------
# Main simulation: 100 independent runs, each appending its per-step total
# reward list to `sum_duo`.
#
# Changes relative to the previous version (simulation results unchanged):
#   * the per-step total no longer lives in a module-level variable named
#     `sum`, which shadowed the builtin that calculate_transitions() calls;
#   * inner loops use their own index names and no longer overwrite the run
#     counter `i`;
#   * find_optimal_action() depends only on q1/q2/r1/r2, which are fixed for
#     a whole run, so it is evaluated once per run, not once per step.
# ---------------------------------------------------------------------------
for i in tqdm(range(100)):

    # Fresh per-run bookkeeping.
    state = [0] * N
    belief = [0.5] * N
    reward = [0] * N
    action = [0] * N
    total_reward = []

    # Dynamics are built from the module-level draws `a` and `b`; active and
    # passive transitions are identical here.
    transition_active = np.array([[a, 1-a], [1-a, a]])
    transition_passive = np.array([[a, 1-a], [1-a, a]])

    transitionName_active = [["00", "01"], ["10", "11"]]
    transitionMatrix_active = [[a, 1-a], [1-a, a]]

    transitionName_passive = [["00", "01"], ["10", "11"]]
    transitionMatrix_passive = [[a, 1-a], [1-a, a]]
    q1 = np.array([[a, 1-a], [1-a, a]])
    q2 = q1

    value_list_active = [10, 20]
    probability_active = [1 - b, b]
    value_list_passive = [-10, 10]
    probability_passive = [b, 1 - b]
    r1 = np.array([[b, 1 - b]])
    r2 = np.array([[1 - b, b]])

    access = [0] * N

    # The first exploitation step of a run uses key 1; every later step uses
    # the value computed by find_optimal_action (evaluated once, see header).
    optimal_solution = 1
    planned_solution = find_optimal_action(q1, q2, r1, r2)

    belief_big = [0] * T_max
    tau1 = 100
    tau2 = 1000

    estimated_p00 = 0
    estimated_p11 = 0

    for k in range(20):
        # ---- Phase 1: tau1 steps with a uniformly drawn index ----
        for t in range(tau1):
            result = number_of_certain_probability(value_list, probability)
            # NOTE(review): `action` is not cleared inside this phase, so
            # entries set on earlier steps stay 1, and rewards of entries
            # with action 0 keep their previous value - confirm intended.
            action[result] = 1
            for arm in range(N):
                if action[arm] == 1 and state[arm] == 1:
                    reward[arm] = number_of_certain_probability(value_list_active, probability_active)
                if action[arm] == 1 and state[arm] == 0:
                    reward[arm] = number_of_certain_probability(value_list_passive, probability_passive)
            total_reward.append(sum(reward))

            for arm in range(N):
                if action[arm] == 1:
                    state[arm] = activity_forecast(state[arm], transitionName_active, transitionMatrix_active, 1)
                else:
                    state[arm] = activity_forecast(state[arm], transitionName_passive, transitionMatrix_passive, 1)
            for arm in range(len(reward)):
                belief[arm] = b_update(r2, r1, q1, q2, reward[arm], belief[arm])

        # temp=spectral_mattix(total_reward)
        # estimated_p00=temp[0]
        # estimated_p11=temp[1]

        # ---- Phase 2: belief-driven selection; empty when k == 0 ----
        for t in range(tau1, math.ceil(math.sqrt(k)) * tau2):

            key3 = optimal_solution
            access = bellman(N, 1, belief, key3)
            action = access

            for arm in range(N):
                if action[arm] == 1 and state[arm] == 1:
                    reward[arm] = number_of_certain_probability(value_list_active, probability_active)
                if action[arm] == 1 and state[arm] == 0:
                    reward[arm] = number_of_certain_probability(value_list_passive, probability_passive)
                if action[arm] == 0:
                    reward[arm] = 0
            total_reward.append(sum(reward))

            for arm in range(N):
                if action[arm] == 1:
                    state[arm] = activity_forecast(state[arm], transitionName_active, transitionMatrix_active, 1)
                else:
                    state[arm] = activity_forecast(state[arm], transitionName_passive, transitionMatrix_passive, 1)

            for arm in range(len(reward)):
                belief[arm] = b_update(r2, r1, q1, q2, reward[arm], belief[arm])

            optimal_solution = planned_solution
            # Only leaves this phase; the outer k loop continues.
            if t > T_max:
                break

    sum_duo.append(total_reward)



# Replace each run's per-step rewards, in place, with the running
# beta-discounted cumulative reward up to and including that step.
for run_rewards in sum_duo:
    running_total = 0
    for step, step_reward in enumerate(run_rewards):
        running_total = running_total + np.power(beta, step) * step_reward
        run_rewards[step] = running_total

# Average the cumulative reward over all runs for the first T_max steps.
# Explicit accumulation is used on purpose: it does not rely on the builtin
# `sum` being available under that name at module level.
final = [0] * T_max
for step in range(T_max):
    for run_rewards in sum_duo:
        final[step] = final[step] + run_rewards[step]
    final[step] = final[step] / len(sum_duo)

# print(final)

# One averaged value per line; the context manager guarantees the file is
# closed even if a write fails.
with open("seeu.txt", 'w') as file3:
    for averaged in final:
        file3.write(str(averaged) + '\n')





