

from __future__ import division

import heapq
import math
from random import random
import random
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import scipy.stats as st
import seaborn as sns
from tqdm import tqdm

# Kept last on purpose: the star import rebinds names such as `random`, `sum`
# and `mean`, and the rest of the file was written against that binding order.
from numpy import *



# --- Problem size -----------------------------------------------------------
T_max = 50000   # number of time steps in one simulation run
N = 2           # number of arms
M = 1           # number of arms to activate (consumed by bellman())
beta = 1        # discount base applied when accumulating rewards

# --- Per-arm bookkeeping (one slot per arm) ---------------------------------
state = [0] * N
belief = [0] * N
reward = [0] * N
action = [0] * N
total_reward = []

# Arm indices 0 .. N-1.
l = list(range(N))

# Scratch containers and counters; nothing in the visible part of the file
# reads them, they are kept so any other code relying on the names still works.
A = set()
B = set()
C = []

value_max = 0

R1 = []
R2 = []
x = 0
y = 0
z = 0
w = 0

# Per-arm histories: an empty action log, a state log seeded with 0 and a
# four-slot counter, each arm getting its own independent list object.
lists_action = [[] for _ in range(N)]
lists_state = [[0] for _ in range(N)]
lists_action_state = [[0 for _ in range(4)] for _ in range(N)]

optimal = []

# One zero-initialised 2x2 float32 Q-table per arm, plus a 1x2 value vector.
Q_value = [np.array([[0, 0], [0, 0]], dtype='float32') for _ in range(N)]
value_function = np.array([[0, 0]], dtype='float32')

alpha_value = [[0 for _ in range(4)] for _ in range(N)]

# T_max + 1 slots: slot 0 holds a zero per arm, every later slot starts empty.
whittle_index = [[0] * N] + [[] for _ in range(T_max)]



def calculation_value_function(q1,q2,r1,r2,value):
    """Run one value-iteration sweep for a two-state, two-action model.

    Two "outcome" terms are built from the 1x2 rows ``r1`` and ``r2`` and the
    current values, using a fixed discount of 0.9 (a local constant that is
    independent of the module-level ``beta``)::

        outcome0 = 10*r1[0][1] - 10*r1[0][0] + 0.9*(r1[0][0] + r1[0][1])*value[0][0]
        outcome1 = 10*r2[0][0] + 20*r2[0][1] + 0.9*(r2[0][0] + r2[0][1])*value[0][1]

    The Q-value of state ``s`` under action 0 weights them with row ``s`` of
    ``q1``; under action 1 it uses row ``s`` of ``q2``.

    Args:
        q1, q2: 2x2 weight tables indexed ``[state][outcome]``.
        r1, r2: 1x2 rows used inside the outcome terms.
        value: 1x2 array of current state values. It is overwritten IN PLACE
            with the per-state maximum Q-value.

    Returns:
        ``(q_table, value)``: a fresh 2x2 float32 array of Q-values and the
        very same ``value`` object that was passed in.
    """
    discount = 0.9
    values = value  # alias, not a copy: the caller's array is updated below

    # Both terms are read from the *old* values; nothing is written back
    # until every Q-value has been computed.
    outcome0 = (10 * r1[0][1] + r1[0][1] * discount * values[0][0]
                + (-10) * r1[0][0] + r1[0][0] * discount * values[0][0])
    outcome1 = (10 * r2[0][0] + r2[0][0] * discount * values[0][1]
                + (20) * r2[0][1] + r2[0][1] * discount * values[0][1])

    q_table = np.array([[0, 0], [0, 0]], dtype='float32')
    for s in range(2):
        for a, weights in enumerate((q1, q2)):
            q_table[s][a] = weights[s][0] * outcome0 + weights[s][1] * outcome1

    # Greedy backup; on a tie the action-0 entry is the one stored.
    for s in range(2):
        if q_table[s][0] >= q_table[s][1]:
            values[0][s] = q_table[s][0]
        else:
            values[0][s] = q_table[s][1]

    return q_table, values


def activity_forecast(state,events,transition,days):
    """Simulate a two-state Markov chain for ``days`` steps; return the last state.

    Args:
        state: Starting state. ``0`` selects row 0 of ``events``/``transition``;
            any other value is treated as state 1 and selects row 1.
        events: Two lists of transition labels, one per current state. Drawing
            ``"00"`` from row 0 or ``"10"`` from row 1 moves the chain to
            state 0; any other label moves it to state 1.
        transition: Two probability rows aligned with ``events``.
        days: Number of transitions to simulate. Zero or a negative number
            performs no transition.

    Returns:
        The state after ``days`` transitions (``state`` itself when no
        transition is performed, otherwise ``0`` or ``1``).
    """
    current = state
    for _ in range(days):
        row = 0 if current == 0 else 1
        label = np.random.choice(events[row], replace=True, p=transition[row])
        back_to_zero = "00" if row == 0 else "10"
        # Carry the sampled state into the next step so that multi-day
        # forecasts really walk the chain instead of re-sampling from the
        # starting state every day.
        current = 0 if label == back_to_zero else 1
    return current



def number_of_certain_probability(sequence, probability):
    """Pick one element of ``sequence`` at random, weighted by ``probability``.

    A single uniform draw from [0, 1) is compared against the running sum of
    the weights; the first element whose cumulative weight exceeds the draw
    wins. If no cumulative weight exceeds the draw (weights summing to less
    than the draw), the last paired element is returned.
    """
    draw = np.random.uniform(0, 1)
    running_total = 0.0
    for candidate, weight in zip(sequence, probability):
        running_total += weight
        if draw < running_total:
            return candidate
    # Fell off the end: hand back the final candidate that was examined.
    return candidate





def array_diff(a, b):
    """Return the distinct elements of ``a`` that are absent from ``b``.

    Duplicates collapse because the computation goes through sets; the order
    of the returned list is whatever the resulting set yields.
    """
    kept = set(a)
    excluded = set(b)
    return list(kept - excluded)

def bellman(N,M,state,key):
    """Choose which ``M`` of the ``N`` arms to activate, favouring ``key``.

    Arms whose entry in ``state`` equals ``key`` are preferred. When at least
    ``M`` arms match, ``M`` of them are drawn uniformly at random. Otherwise
    every matching arm is taken and the shortfall is filled from the remaining
    arms: those with the largest ``state`` entries when ``key == 1``, those
    with the smallest when ``key == 0``. Ties go to the lower arm index. For
    any other ``key`` nothing is filled in.

    Args:
        N: Number of arms; ``state`` is indexed for every arm in ``range(N)``.
        M: Number of arms to activate.
        state: Per-arm values (exact states or belief-like scores).
        key: Target value, normally 0 or 1.

    Returns:
        A list of length ``N`` holding 1 for activated arms and 0 otherwise.
    """
    # The file does `import random` followed by `from numpy import *`, which
    # rebinds the module-level name `random` to `numpy.random`; import the
    # standard-library module locally so `sample` is the one we mean.
    import random as stdlib_random

    matching = [arm for arm, value in enumerate(state) if value == key]

    if len(matching) >= M:
        chosen = stdlib_random.sample(matching, M)
    else:
        chosen = list(matching)
        taken = set(matching)
        remaining = [arm for arm in range(N) if arm not in taken]
        shortfall = M - len(chosen)
        # Rank arm indices directly instead of looking values up again with
        # list.index(): the lookup maps tied values to the same arm, which
        # used to activate fewer than M arms.
        if key == 1:
            chosen.extend(
                heapq.nlargest(shortfall, remaining, key=lambda arm: state[arm]))
        elif key == 0:
            chosen.extend(
                heapq.nsmallest(shortfall, remaining, key=lambda arm: state[arm]))

    action = [0] * N
    for arm in chosen:
        action[arm] = 1
    return action

def b_update(reward_active,reward_passive,transition1,transition2,reward,b):
    """Return the updated belief ``b`` after observing ``reward``.

    The observed reward selects the rule:

    * ``20``  -> 1
    * ``-10`` -> 0
    * ``0``   -> ``b`` pushed one step through ``transition2``:
      ``b*transition2[1][1] + (1-b)*transition2[0][1]``
    * ``10``  -> a ratio built from ``reward_active[0][0]``,
      ``reward_passive[0][1]`` and ``transition1`` (0 when the denominator
      is 0)
    * anything else -> 0

    NOTE(review): ``b`` looks like the probability of being in state 1;
    confirm against the caller.
    """
    if reward == 0:
        return b * transition2[1][1] + (1 - b) * transition2[0][1]
    if reward == 20:
        return 1
    if reward == -10:
        return 0
    if reward == 10:
        numerator = (reward_active[0][0] * transition1[1][1] * b
                     + reward_passive[0][1] * transition1[0][1] * (1 - b))
        denominator = (numerator
                       + reward_passive[0][1] * transition1[0][0] * (1 - b)
                       + reward_active[0][0] * transition1[1][0] * b)
        if denominator != 0:
            return numerator / denominator
    return 0





def find_optimal_action(q1,q2,r1,r2,iterations=100):
    """Iterate ``calculation_value_function`` and compare the two states' best Q-values.

    Starting from a zero value vector, ``iterations`` sweeps are applied; each
    sweep is fed the value vector returned by the previous one.

    Args:
        q1, q2, r1, r2: Model tables forwarded unchanged to
            ``calculation_value_function``.
        iterations: Number of sweeps to run (default 100).

    Returns:
        ``0`` when the largest Q-value of state 0 is at least the largest
        Q-value of state 1 after the final sweep, otherwise ``1``.
    """
    # Must be a float array: calculation_value_function writes the new values
    # back into it, and an integer array would silently truncate every update.
    value = np.array([[0, 0]], dtype='float32')
    q_table = np.array([[0, 0], [0, 0]], dtype='float32')

    for _ in range(iterations):
        q_table, value = calculation_value_function(q1, q2, r1, r2, value)

    best_state0 = q_table[0][0] if q_table[0][0] >= q_table[0][1] else q_table[0][1]
    best_state1 = q_table[1][0] if q_table[1][0] >= q_table[1][1] else q_table[1][1]

    return 0 if best_state0 >= best_state1 else 1



# One entry per simulation run: that run's per-step total rewards.
sum_duo = []
access = [0 for _ in range(N)]
optimal_solution = 0

sum_final = []
Reward_arm1, Reward_arm2 = [], []

# Lower bounds used when rescaling the two random model parameters below.
epison_1 = 0.1
epison_2 = 0.1

# Each parameter is a Beta(1, 1) draw u in [0, 1] mapped affinely onto
# [epison, 1]:  (1 - epison) * u + epison.  `a` is drawn before `b`.
a = epison_1 + (1 - epison_1) * st.beta.rvs(1, 1, size=1)[0]
b = epison_2 + (1 - epison_2) * st.beta.rvs(1, 1, size=1)[0]



# Number of independent simulation runs collected in sum_duo.
N_RUNS = 100


def _ucb_index(samples, horizon):
    """Return mean(samples) + sqrt(2 * log10(horizon) / len(samples))."""
    return np.mean(samples) + math.sqrt((2 * math.log(horizon, 10)) / len(samples))


def _collect_rewards(action, state, reward, state1_dist, state0_dist):
    """Fill ``reward`` in place for one round and return the round's total.

    An activated arm (action 1) draws from ``state1_dist`` when its state is 1
    and from ``state0_dist`` when its state is 0; an idle arm (action 0) earns
    0. Each ``*_dist`` is a ``(values, probabilities)`` pair.
    """
    for arm in range(len(reward)):
        if action[arm] == 0:
            reward[arm] = 0
        elif action[arm] == 1 and state[arm] == 1:
            reward[arm] = number_of_certain_probability(state1_dist[0], state1_dist[1])
        elif action[arm] == 1 and state[arm] == 0:
            reward[arm] = number_of_certain_probability(state0_dist[0], state0_dist[1])

    # Plain accumulation: the name `sum` is rebound by `from numpy import *`.
    round_total = 0
    for earned in reward:
        round_total += earned
    return round_total


def _advance_states(action, state, active_chain, passive_chain):
    """Move every arm one Markov step in place; a chain is ``(labels, matrix)``."""
    for arm in range(len(state)):
        if action[arm] == 1:
            state[arm] = activity_forecast(state[arm], active_chain[0], active_chain[1], 1)
        elif action[arm] == 0:
            state[arm] = activity_forecast(state[arm], passive_chain[0], passive_chain[1], 1)


for run in tqdm(range(N_RUNS)):
    estimation_1 = 0
    estimation_2 = 0
    state = [0] * N
    belief = [0] * N
    reward = [0] * N
    action = [0] * N
    total_reward = []
    T1 = 100   # forced pulls per arm during the warm-up phase
    T2 = 50    # sliding-window length used by the index

    # Environment model. `a` and `b` are drawn once before this loop, so every
    # run uses the same model; active and passive dynamics are identical.
    transition_active = np.array([[a, 1 - a], [1 - a, a]])
    transition_passive = np.array([[a, 1 - a], [1 - a, a]])

    transitionName_active = [["00", "01"], ["10", "11"]]
    transitionMatrix_active = [[a, 1 - a], [1 - a, a]]

    transitionName_passive = [["00", "01"], ["10", "11"]]
    transitionMatrix_passive = [[a, 1 - a], [1 - a, a]]

    # Inputs for find_optimal_action / b_update; not used by the loop below.
    q1 = np.array([[a, 1 - a], [1 - a, a]])
    q2 = np.array([[a, 1 - a], [1 - a, a]])

    # Reward of a pulled arm: state 1 pays 10 or 20, state 0 pays -10 or 10.
    value_list_active = [10, 20]
    probability_active = [1 - b, b]
    value_list_passive = [-10, 10]
    probability_passive = [b, 1 - b]

    r1 = np.array([[b, 1 - b]])
    r2 = np.array([[1 - b, b]])

    access = [0] * N

    # optimal_solution = find_optimal_action(q1, q2, r1, r2)
    optimal_solution = 1
    Reward_arm1 = []
    Reward_arm2 = []

    state1_dist = (value_list_active, probability_active)
    state0_dist = (value_list_passive, probability_passive)
    active_chain = (transitionName_active, transitionMatrix_active)
    passive_chain = (transitionName_passive, transitionMatrix_passive)

    # Warm-up: pull arm 1 for T1 rounds, then arm 2 for T1 rounds, recording
    # every reward of the pulled arm.
    for forced_action, history in (([1, 0], Reward_arm1), ([0, 1], Reward_arm2)):
        pulled_arm = forced_action.index(1)
        for _ in range(T1):
            action = forced_action
            total_reward.append(
                _collect_rewards(action, state, reward, state1_dist, state0_dist))
            history.append(reward[pulled_arm])
            _advance_states(action, state, active_chain, passive_chain)

    # Main phase: pull the arm with the larger index for the rest of the run.
    for t in range(2 * T1, T_max):
        if t < T2:
            # Fewer elapsed steps than the window length: use all samples.
            estimation_1 = _ucb_index(Reward_arm1, t)
            estimation_2 = _ucb_index(Reward_arm2, t)
        else:
            # Sliding window: only the most recent T2 rewards of each arm.
            estimation_1 = _ucb_index(Reward_arm1[-T2:], T2)
            estimation_2 = _ucb_index(Reward_arm2[-T2:], T2)

        # Ties go to the second arm.
        action = [1, 0] if estimation_1 > estimation_2 else [0, 1]

        round_total = _collect_rewards(action, state, reward, state1_dist, state0_dist)

        # A pulled arm never pays 0, so a non-zero reward marks the arm that
        # was actually played this round.
        if reward[0] != 0:
            Reward_arm1.append(reward[0])
        if reward[1] != 0:
            Reward_arm2.append(reward[1])

        _advance_states(action, state, active_chain, passive_chain)

        total_reward.append(round_total)

    sum_duo.append(total_reward)


# Replace each run's per-step rewards, in place, by the running discounted
# sum  sum_{k <= step} beta**k * reward_k.
for run_rewards in sum_duo:
    running = 0
    for step, earned in enumerate(run_rewards):
        running = running + np.power(beta, step) * earned
        run_rewards[step] = running


# Average the cumulative curves over all runs, step by step.
final = [0] * T_max
run_count = len(sum_duo)
for step in range(T_max):
    for run_rewards in sum_duo:
        final[step] = final[step] + run_rewards[step]
    final[step] = final[step] / run_count


# One averaged value per line; the context manager closes the file even if a
# write fails.
with open("slide_ucb.txt", 'w') as file3:
    file3.writelines(str(averaged) + '\n' for averaged in final)







