

from __future__ import division

import heapq
import math
from random import random
import random
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import scipy.stats as st
import seaborn as sns
from tqdm import tqdm

# The star import stays last so the names it binds keep the same precedence
# over the imports above as in the original ordering.
from numpy import *



# Scalar settings shared by the rest of the script.
T_max = 50000
N = 2
M = 1
beta = 1

# Per-arm vectors (one entry per arm) and the per-slot reward log.
state = [0] * N
belief = [0] * N
reward = [0] * N
action = [0] * N
total_reward = []

# Arm indices 0..N-1.
l = list(range(N))

A = set()
B = set()
C = []

value_max = 0

R1 = []
R2 = []
x = 0
y = 0
z = 0
w = 0

# One independent inner list per arm in each of these tables.
lists_action = [[] for _ in range(N)]
lists_state = [[0] for _ in range(N)]
lists_action_state = [[0] * 4 for _ in range(N)]

optimal = []

# One zeroed 2x2 float32 table per arm, plus a single 1x2 float32 row.
Q_value = [np.zeros((2, 2), dtype='float32') for _ in range(N)]
value_function = np.zeros((1, 2), dtype='float32')

alpha_value = [[0] * 4 for _ in range(N)]

# T_max + 1 rows: row 0 holds N zeros, every later row starts out empty.
whittle_index = [[0] * N] + [[] for _ in range(T_max)]



def calculation_value_function(q1,q2,r1,r2,value):
    """Apply one backup step to a two-state, two-action value table.

    Args:
        q1, q2: 2x2 tables indexed ``[state][next_state]``; ``q1`` feeds
            column 0 of the result and ``q2`` feeds column 1.
        r1, r2: 1x2 rows of weights applied to the hard-coded payoffs
            (10 / -10 for the first term, 10 / 20 for the second).
        value: 1x2 row of current state values. It is updated IN PLACE with
            the row-wise maximum of the new table.

    Returns:
        ``(q_value, value)`` where ``q_value`` is a fresh 2x2 float32 array
        and ``value`` is the very object that was passed in.
    """
    beta = 0.9
    value_state = value  # alias on purpose: the caller's buffer is mutated
    v0 = value_state[0][0]
    v1 = value_state[0][1]

    # Both terms depend only on the current values, so they are evaluated
    # once and shared by all four table entries.
    term_state0 = (10 * r1[0][1] + r1[0][1] * beta * v0 + (-10) * r1[0][0] + r1[0][0] * beta * v0)
    term_state1 = (10 * r2[0][0] + r2[0][0] * beta * v1 + (20) * r2[0][1] + r2[0][1] * beta * v1)

    q_value = np.zeros((2, 2), dtype='float32')
    for s in (0, 1):
        for act, kernel in enumerate((q1, q2)):
            q_value[s][act] = kernel[s][0] * term_state0 + kernel[s][1] * term_state1

    # Row-wise maximum; on a tie the first column's entry is written back.
    for s in (0, 1):
        if q_value[s][0] >= q_value[s][1]:
            value_state[0][s] = q_value[s][0]
        else:
            value_state[0][s] = q_value[s][1]

    return q_value,value_state


def activity_forecast(state,events,transition,days):
    """Simulate a two-state chain for ``days`` steps and return the last state.

    Args:
        state: starting state; 0 selects row 0, any other value selects row 1.
        events: two rows of transition labels. Row 0 must use ``"00"`` for
            "stay in 0" and row 1 must use ``"10"`` for "move to 0"; every
            other label is treated as a move to state 1.
        transition: two rows of probabilities aligned with ``events``.
        days: number of steps to simulate. Zero or a negative count returns
            ``state`` unchanged.

    Returns:
        ``state`` itself when no step is taken, otherwise 0 or 1.
    """
    current = state
    for _ in range(days):
        row = 0 if current == 0 else 1
        change = np.random.choice(events[row], replace=True, p=transition[row])
        to_zero_label = "00" if row == 0 else "10"
        # Carry the sampled state into the next step so that multi-step
        # forecasts follow the chain rather than restarting from `state`.
        current = 0 if change == to_zero_label else 1
    return current



def number_of_certain_probability(sequence, probability):
    """Pick one item of ``sequence`` at random, weighted by ``probability``.

    A single uniform draw from [0, 1) is compared against the running sum of
    the weights; the first item whose running sum exceeds the draw wins. If
    no running sum exceeds the draw, the last paired item is returned.
    """
    draw = np.random.uniform(0, 1)
    running = 0.0
    for item, weight in zip(sequence, probability):
        running = running + weight
        if running > draw:
            return item
    return item





def array_diff(a, b):
    """Return the distinct items of ``a`` that do not occur in ``b``.

    Items come back in order of their first appearance in ``a``, so the
    result does not depend on set iteration order. Items must be hashable.
    """
    excluded = set(b)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return [item for item in dict.fromkeys(a) if item not in excluded]

def bellman(N,M,state,key):
    """Build a 0/1 action vector that activates up to M of the N arms.

    Arms whose state equals ``key`` are preferred:

    * If at least M arms match, M of them are drawn uniformly at random.
    * Otherwise every matching arm is taken and the open slots are filled
      from the remaining arms: largest state first when ``key == 1``,
      smallest state first when ``key == 0``. Ties go to the lower arm
      index, and each filler arm is distinct.
    * For any other ``key`` no filler arms are added.

    Args:
        N: number of arms (length of the returned list).
        M: number of arms to activate.
        state: per-arm state values; only the first N arms can be selected.
        key: the preferred state value.

    Returns:
        A list of N ints, 1 for activated arms and 0 otherwise.
    """
    # Local import: the module-level name `random` may have been rebound by
    # the file's `from numpy import *`; the standard-library sampler is
    # required here.
    import random

    chosen = [arm for arm, value in enumerate(state) if value == key]

    if len(chosen) >= M:
        chosen = random.sample(chosen, M)
    else:
        missing = M - len(chosen)
        taken = set(chosen)
        candidates = [arm for arm in range(N) if arm not in taken]
        # Rank arm indices directly (rather than values followed by an
        # index lookup) so that equal states never map to the same arm.
        if key == 1:
            chosen.extend(heapq.nlargest(missing, candidates, key=lambda arm: state[arm]))
        elif key == 0:
            chosen.extend(heapq.nsmallest(missing, candidates, key=lambda arm: state[arm]))

    action = [0] * N
    for arm in chosen:
        action[arm] = 1
    return action

def b_update(reward_active,reward_passive,transition1,transition2,reward,b):
    """Return the updated belief ``b`` after observing ``reward``.

    * ``reward == 20``  -> 1
    * ``reward == -10`` -> 0
    * ``reward == 0``   -> ``b`` pushed one step through ``transition2``
    * ``reward == 10``  -> a ratio built from ``reward_active[0][0]``,
      ``reward_passive[0][1]`` and ``transition1``; 0 when its denominator
      is zero
    * any other reward  -> 0
    """
    if reward == 20:
        return 1
    if reward == -10:
        return 0
    if reward == 0:
        return b*transition2[1][1]+(1-b)*transition2[0][1]
    if reward == 10:
        weight_a = reward_active[0][0]
        weight_p = reward_passive[0][1]
        numerator = weight_a * transition1[1][1] * b + weight_p * transition1[0][1] * (1 - b)
        denominator = numerator + weight_p * transition1[0][0] * (1 - b) + weight_a * transition1[1][0] * b
        if denominator != 0:
            return numerator/denominator
    return 0





def find_optimal_action(q1,q2,r1,r2):
    """Run 100 backup steps from zero values and compare the two table rows.

    Args:
        q1, q2, r1, r2: forwarded unchanged to ``calculation_value_function``.

    Returns:
        0 when the largest entry of row 0 of the final 2x2 table is at least
        the largest entry of row 1, otherwise 1.
    """
    # Float buffer: calculation_value_function writes the updated values
    # back into this array in place, and an integer array would truncate
    # every iterate.
    value = np.array([[0, 0]], dtype='float32')

    for _ in range(100):
        q_table, _values = calculation_value_function(q1, q2, r1, r2, value)

    if q_table[0][0] >= q_table[0][1]:
        best_row0 = q_table[0][0]
    else:
        best_row0 = q_table[0][1]

    if q_table[1][0] >= q_table[1][1]:
        best_row1 = q_table[1][0]
    else:
        best_row1 = q_table[1][1]

    return 0 if best_row0 >= best_row1 else 1


# Constants read by the simulation loop: L scales the square-root bonus of
# the arm indices and D scales the log(t) threshold on play_time_1.
L = 1000
D = 8000

sum_duo = []            # one reward trace per simulated run
access = [0] * N
optimal_solution = 0

play_time_1 = 0
play_time_2 = 0

sum_final = []

epison_1 = 0.1
epison_2 = 0.1

# a and b: one Beta(1, 1) draw each, rescaled affinely into [epison, 1].
a = (1 - epison_1) * st.beta.rvs(1, 1, size=1)[0] + epison_1
b = (1 - epison_2) * st.beta.rvs(1, 1, size=1)[0] + epison_2


def _play_slot(action, state, reward, reward_dists, chains):
    """Simulate one time slot in place and return the summed reward.

    Args:
        action: per-arm flags; 1 means the arm is played this slot, 0 idle.
        state: per-arm states; each entry is replaced by the state sampled
            for the next slot.
        reward: per-arm reward buffer; overwritten with this slot's rewards.
        reward_dists: ``(state0_dist, state1_dist)``; each item is a
            ``(values, probabilities)`` pair used for a played arm that is
            currently in that state.
        chains: ``(idle_chain, played_chain)``; each item is a
            ``(transition_names, transition_matrix)`` pair handed to
            ``activity_forecast``.

    Returns:
        The sum of all entries of ``reward`` for this slot.
    """
    arms = range(len(action))

    # Rewards for every arm are drawn before any state moves, which keeps
    # the order of random draws: all rewards first, then all transitions.
    for arm in arms:
        if action[arm] == 1 and state[arm] == 1:
            values, probabilities = reward_dists[1]
            reward[arm] = number_of_certain_probability(values, probabilities)
        elif action[arm] == 1 and state[arm] == 0:
            values, probabilities = reward_dists[0]
            reward[arm] = number_of_certain_probability(values, probabilities)
        elif action[arm] == 0:
            reward[arm] = 0

    slot_total = 0
    for value in reward:
        slot_total = slot_total + value

    for arm in arms:
        if action[arm] == 1:
            names, matrix = chains[1]
        elif action[arm] == 0:
            names, matrix = chains[0]
        else:
            continue
        state[arm] = activity_forecast(state[arm], names, matrix, 1)

    return slot_total


for run in tqdm(range(100)):

    # Fresh per-run bookkeeping.
    state = [0] * N
    belief = [0] * N
    reward = [0] * N
    action = [0] * N
    total_reward = []
    T1=50

    # The play counters feed the per-run indices below, so they restart
    # together with the per-run reward logs.
    play_time_1 = 0
    play_time_2 = 0

    # Transition model: identical for played and idle arms, built from `a`.
    transition_active = np.array([[a, 1-a], [1-a, a]])
    transition_passive = np.array([[a, 1-a], [1-a, a]])

    transitionName_active = [["00", "01"], ["10", "11"]]
    transitionMatrix_active = [[a, 1-a], [1-a, a]]

    transitionName_passive = [["00", "01"], ["10", "11"]]
    transitionMatrix_passive = [[a, 1-a], [1-a, a]]
    q1 = np.array([[a, 1-a], [1-a, a]])
    q2 = np.array([[a, 1-a], [1-a, a]])

    # Reward model of a played arm: the "active" lists apply when the arm is
    # in state 1, the "passive" lists when it is in state 0.
    value_list_active = [10, 20]
    probability_active = [1-b, b]
    value_list_passive = [-10, 10]
    probability_passive = [b, 1-b]

    r1 = np.array([[b, 1-b]])
    r2 = np.array([[1-b, b]])

    access = [0] * N

    # find_optimal_action(q1, q2, r1, r2) is not called; the value is fixed.
    optimal_solution = 1
    Reward_arm1 = []
    Reward_arm2 = []

    reward_dists = (
        (value_list_passive, probability_passive),
        (value_list_active, probability_active),
    )
    chains = (
        (transitionName_passive, transitionMatrix_passive),
        (transitionName_active, transitionMatrix_active),
    )
    arm_logs = (Reward_arm1, Reward_arm2)

    count_exploration=1
    count_exploitation=0
    temp_1=math.pow(4, count_exploration-1)

    # Phase 1: four epochs k = 1..4. Epoch k spans 2 * 4**(k-1) slots; its
    # first half plays arm 1 and its second half plays arm 2.
    for k in range(1,5):
        time_bound_00=int((2/3)*(math.pow(4,k-1)-1))
        time_bound_01=int((2/3)*(math.pow(4,k)-1))
        time_bound1=time_bound_00
        time_bound2=int((2/3)*(math.pow(4,k)-1)-math.pow(4,k-1))
        time_bound3=time_bound_01
        for t in range(time_bound_00,time_bound_01):
            # t always lies in [time_bound1, time_bound3), so the midpoint
            # alone decides which arm is played.
            chosen = 0 if t < time_bound2 else 1

            action = [1 if arm == chosen else 0 for arm in range(N)]
            if chosen == 0:
                play_time_1 = play_time_1 + 1
            else:
                play_time_2 = play_time_2 + 1
            total_reward.append(_play_slot(action, state, reward, reward_dists, chains))
            # Log the reward of the arm that was actually played.
            arm_logs[chosen].append(reward[chosen])

            # NOTE(review): this advances once per slot, not once per epoch,
            # so it is 171 after this phase. It presumably was meant to
            # count epochs -- confirm before relying on phase 2.
            count_exploration=count_exploration+1

    # Phase 2: alternate between index-based play and further exploration.
    # NOTE(review): the start slot evaluates to 699050, which is beyond the
    # file's T_max = 50000, so this range is empty and the loop body never
    # runs as configured. The start is left unchanged; confirm the intended
    # value (phase 1 ends at slot 170).
    exploitation_start = int((2/3)*(math.pow(4, 10)-1))
    for t in range(exploitation_start,T_max):
        if (play_time_1)>D*math.log(t):
            count_exploitation=count_exploitation+1
            # Each index uses its own arm's reward log and play count.
            index_1=np.mean(Reward_arm1)+np.sqrt(L*math.log(t)/play_time_1)
            index_2=np.mean(Reward_arm2)+np.sqrt(L*math.log(t)/play_time_2)
            chosen = 0 if index_1>index_2 else 1
        else:
            count_exploration=count_exploration+1
            time_bound4 = int((2 / 3) * (math.pow(4, count_exploration - 1) - 1))
            time_bound5 = int((2 / 3) * (math.pow(4, count_exploration) - 1) - math.pow(4, count_exploration - 1))
            time_bound6 = int((2 / 3) * (math.pow(4, count_exploration) - 1))
            if time_bound4 <= t < time_bound5:
                chosen = 0
            elif time_bound5 <= t < time_bound6:
                chosen = 1
            else:
                # Outside both windows no arm is played in this slot.
                continue

        action = [1 if arm == chosen else 0 for arm in range(N)]
        if chosen == 0:
            play_time_1 = play_time_1 + 1
        else:
            play_time_2 = play_time_2 + 1
        total_reward.append(_play_slot(action, state, reward, reward_dists, chains))
        arm_logs[chosen].append(reward[chosen])

    sum_duo.append(total_reward)

print(len(sum_duo))

# Replace every run's per-slot rewards, in place, by the running sum of
# beta**j-weighted rewards up to each slot.
for run_rewards in sum_duo:
    temp = 0
    for j, slot_reward in enumerate(run_rewards):
        temp = temp + np.power(beta, j) * slot_reward
        run_rewards[j] = temp

# Average the running sums across runs, slot by slot. The slot count is
# taken from the first run, so every run must be at least that long.
final = [0] * len(sum_duo[0])
for i in range(len(final)):
    for run_rewards in sum_duo:
        final[i] = final[i] + run_rewards[i]
    final[i] = final[i] / len(sum_duo)

# One value per line; the context manager closes the file even if a write
# fails part-way through.
with open("RUCB.txt", 'w') as file3:
    for value in final:
        file3.write(str(value) + '\n')
