from __future__ import division
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
from random import random
import pandas as pd
import random
import heapq
import statistics
import scipy.stats as st
import seaborn as sns
from time import sleep
from tqdm import tqdm
from typing import Any
import statistics
from time import sleep
from tqdm import tqdm
from typing import Any
from numpy import *
from pomegranate import *
import sys
import itertools as it
sys.setrecursionlimit(1000000)





# ---------------------------------------------------------------------------
# Module-level defaults.  Most of these are placeholders: the simulation loop
# further down re-creates the per-run containers and model parameters.
# ---------------------------------------------------------------------------
N = 2          # number of arms; sizes every per-arm container below
M = 1          # NOTE(review): presumably the per-step activation budget -- not read by the visible code
beta = 1       # discount base used when rewards are accumulated at the end of the script
T_max = 50000  # number of leading time steps that are averaged across runs

# Per-arm scratch vectors.
state = [0] * N
belief = [0] * N
reward = [0] * N
action = [0] * N
total_reward = []

l = list(range(N))  # arm indices 0 .. N-1
A = set()
B = set()
C = []

value_max = 0

# Two-state transition model, in matrix form and in the label/probability
# form expected by activity_forecast.
transition_active = np.full((2, 2), 0.5)
transition_passive = np.full((2, 2), 0.5)
transitionName_active = [["00", "01"], ["10", "11"]]
transitionMatrix_active = [[0.5, 0.5], [0.5, 0.5]]
transitionName_passive = [["00", "01"], ["10", "11"]]
transitionMatrix_passive = [[0.5, 0.5], [0.5, 0.5]]

q1 = np.full((2, 2), 0.5)
q2 = np.full((2, 2), 0.5)

# Reward distributions: each probability row pairs with the value list of the
# same suffix below.
r2 = np.array([[0.8, 0.2]])  # active
r1 = np.array([[0.3, 0.7]])  # passive

value_list_active = [10, 20]
probability_active = [0.8, 0.2]
value_list_passive = [-10, 10]
probability_passive = [0.3, 0.7]

R1 = []
R2 = []
x = 0
y = 0
z = 0
w = 0

# Per-arm histories; every arm gets its own independent list.
lists_action = [[] for _ in range(N)]
lists_state = [[0] for _ in range(N)]
lists_belief = [[0] for _ in range(N)]
lists_action_state = [[0] * 4 for _ in range(N)]

optimal = []

# One 2x2 float32 Q-table per arm plus a single 1x2 float32 value vector.
Q_value = [np.zeros((2, 2), dtype='float32') for _ in range(N)]
value_function = np.zeros((1, 2), dtype='float32')

alpha_value = [[0] * 4 for _ in range(N)]

# T_max + 1 slots: slot 0 holds N zeros, every later slot starts empty.
whittle_index = [[0] * N] + [[] for _ in range(T_max)]


def calculation_value_function(q1, q2, r1, r2, value, beta=0.9):
    """Perform one Bellman backup for a single two-state arm.

    For each current state ``s`` two Q-values are computed, one per transition
    matrix: ``q_value[s][0]`` uses row ``s`` of ``q1`` and ``q_value[s][1]``
    uses row ``s`` of ``q2``.  Landing in state 0 is worth the rewards -10 / 10
    weighted by ``r1[0][0]`` / ``r1[0][1]``; landing in state 1 is worth the
    rewards 10 / 20 weighted by ``r2[0][0]`` / ``r2[0][1]``.  Both add the
    discounted value of the landing state.

    Args:
        q1: 2x2 transition matrix used for the first Q-value column.
        q2: 2x2 transition matrix used for the second Q-value column.
        r1: 1x2 reward-probability row for landing state 0.
        r2: 1x2 reward-probability row for landing state 1.
        value: 1x2 array of current state values.  It is overwritten IN PLACE
            with the new values; callers that iterate to convergence rely on
            this.  Pass a floating-point array: an integer array would
            truncate the stored values.
        beta: discount factor.  Defaults to 0.9, the value that used to be
            hard-coded inside the function.

    Returns:
        ``(q_value, value)``: a new 2x2 float32 Q-table and the same ``value``
        object that was passed in, now holding the row-wise maxima of the
        Q-table.
    """
    # Read both state values before anything is written back, so the whole
    # backup is computed from the old value vector.
    v0 = value[0][0]
    v1 = value[0][1]

    # Expected reward plus discounted continuation for each landing state.
    # The term order is kept exactly as in the original expression so that the
    # floating-point results do not change.
    landing_0 = (10 * r1[0][1] + r1[0][1] * beta * v0
                 + (-10) * r1[0][0] + r1[0][0] * beta * v0)
    landing_1 = (10 * r2[0][0] + r2[0][0] * beta * v1
                 + 20 * r2[0][1] + r2[0][1] * beta * v1)

    q_value = np.zeros((2, 2), dtype='float32')
    for s in range(2):
        q_value[s][0] = q1[s][0] * landing_0 + q1[s][1] * landing_1
        q_value[s][1] = q2[s][0] * landing_0 + q2[s][1] * landing_1

    # Greedy improvement: each state's value is its best Q-value.
    for s in range(2):
        if q_value[s][0] >= q_value[s][1]:
            value[0][s] = q_value[s][0]
        else:
            value[0][s] = q_value[s][1]

    return q_value, value


def activity_forecast(state, events, transition, days):
    """Simulate a two-state Markov chain and return the state after ``days`` steps.

    Args:
        state: starting state; 0 selects the first row, any other value the
            second row.
        events: transition labels per row, ``[["00", "01"], ["10", "11"]]``.
        transition: probabilities per row, aligned with ``events``.
        days: number of transitions to simulate (an integer).  Zero or a
            negative count returns ``state`` unchanged.

    Returns:
        The state (0 or 1) reached after the last transition, or ``state``
        itself when no transition is simulated.

    Each step is sampled from the row of the state reached by the previous
    step.  The earlier version kept sampling from the row of the *initial*
    state, which is only correct for ``days == 1``.
    """
    current = state
    for _ in range(days):
        row = 0 if current == 0 else 1
        change = np.random.choice(events[row], replace=True, p=transition[row])
        # The label that keeps or returns the chain to state 0 is "00" for
        # row 0 and "10" for row 1; every other label lands in state 1.
        lands_in_zero = "00" if row == 0 else "10"
        current = 0 if change == lands_in_zero else 1
    return current



def calculate_transitions(b):
    """Estimate a 2x2 transition matrix from a sequence of soft state values.

    Each entry of ``b`` is treated as the weight of "state 1" at that step
    (and one minus it as the weight of "state 0").  Consecutive entries are
    multiplied to obtain soft counts of the four transitions, and each row is
    normalised by its total.

    Args:
        b: sequence of numbers, one per time step.

    Returns:
        A 2x2 numpy array.  A row whose soft counts sum to zero, and the whole
        matrix when fewer than two entries are given, falls back to 0.5 / 0.5.
    """
    series = list(b)
    if len(series) < 2:
        return np.array([[0.5, 0.5], [0.5, 0.5]])

    steps = list(zip(series, series[1:]))
    n00 = sum((1 - prev) * (1 - nxt) for prev, nxt in steps)
    n01 = sum((1 - prev) * nxt for prev, nxt in steps)
    n10 = sum(prev * (1 - nxt) for prev, nxt in steps)
    n11 = sum(prev * nxt for prev, nxt in steps)

    T = np.zeros(shape=(2, 2))
    for row, (stay_low, go_high) in enumerate(((n00, n01), (n10, n11))):
        if (stay_low + go_high) != 0:
            T[row][0] = stay_low / (stay_low + go_high)
            T[row][1] = go_high / (stay_low + go_high)
        else:
            # No evidence for this row: use an uninformative split.
            T[row][0] = 0.5
            T[row][1] = 0.5
    return T

def count_state_action(state, action):
    """Count how often each (state, action) pair occurs.

    Args:
        state: sequence of states, read at every index of ``action``.
        action: sequence of actions; its length decides how many positions
            are examined.

    Returns:
        ``[n(0,0), n(0,1), n(1,0), n(1,1)]`` where ``n(s, a)`` is the number of
        positions with state ``s`` and action ``a``.  Pairs containing any
        other value are ignored.
    """
    slot_of = {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}
    tally = [0, 0, 0, 0]
    for pos in range(len(action)):
        slot = slot_of.get((state[pos], action[pos]))
        if slot is not None:
            tally[slot] += 1
    return tally

def belief_count_state(belief, action):
    """Accumulate soft (state, action) counts from per-step beliefs.

    For every position the belief ``p`` adds ``1 - p`` to the "state 0" slot
    and ``p`` to the "state 1" slot of the action taken there.

    Args:
        belief: sequence of numbers, one per step.
        action: sequence of actions, read at every index of ``belief``.

    Returns:
        ``[w(0,0), w(0,1), w(1,0), w(1,1)]`` indexed as (state, action).
        Positions whose action is neither 0 nor 1 contribute nothing.
    """
    column_of = {0: 0, 1: 1}
    totals = [0] * 4
    for pos, p_one in enumerate(belief):
        column = column_of.get(action[pos])
        if column is None:
            continue
        totals[column] = totals[column] + (1 - p_one)
        totals[column + 2] = totals[column + 2] + p_one
    return totals




def number_of_certain_probability(sequence, probability):
    """Draw one element of ``sequence`` according to ``probability``.

    A single uniform number in [0, 1) is drawn and compared against the
    running sum of the probabilities; the first element whose cumulative
    probability exceeds the draw is returned.  If no cumulative value exceeds
    the draw, the last paired element is returned.

    Args:
        sequence: candidate values.
        probability: weights aligned with ``sequence``.

    Returns:
        The selected element of ``sequence``.
    """
    draw = np.random.uniform(0, 1)
    running_total = 0.0
    for item, weight in zip(sequence, probability):
        running_total = running_total + weight
        if draw < running_total:
            return item
    # Fell off the end: hand back the last element that was examined.
    return item

def b_update(reward_active, reward_passive, transition1, transition2, reward, b):
    """Update the belief ``b`` from the reward that was just observed.

    Args:
        reward_active: 1x2 row; ``[0][0]`` weights the ``b`` terms when the
            reward is 10.
        reward_passive: 1x2 row; ``[0][1]`` weights the ``1 - b`` terms when
            the reward is 10.
        transition1: 2x2 matrix used when the reward is 10.
        transition2: 2x2 matrix used when the reward is 0.
        reward: observed reward; 10, 20, -10 and 0 are recognised.
        b: current belief.

    Returns:
        The new belief: a normalised posterior for reward 10 (0 if the
        normaliser is zero), 1 for reward 20, 0 for reward -10, the belief
        propagated through ``transition2`` for reward 0, and 0 for any other
        reward.
    """
    # NOTE(review): the 20 / -10 branches return a certain belief without
    # applying a transition, while the 10 branch does apply one -- confirm
    # that this asymmetry is intended.
    if reward == 10:
        like_high = reward_active[0][0]
        like_low = reward_passive[0][1]
        sum1 = like_high * transition1[1][1] * b + like_low * transition1[0][1] * (1 - b)
        sum2 = sum1 + like_low * transition1[0][0] * (1 - b) + like_high * transition1[1][0] * b
        if sum2 != 0:
            return sum1 / sum2
        return 0
    if reward == 20:
        return 1
    if reward == -10:
        return 0
    if reward == 0:
        return b * transition2[1][1] + (1 - b) * transition2[0][1]
    return 0


# def b_update(reward_active,reward_passive,transition1,transition2,reward,b):
#     Q=reward_active #active
#     P= reward_passive
#     b_new=0
#     if reward== 1:
#         sum1 = Q[0][0] * transition1[1][1] * b + P[0][0] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][0] * transition1[0][0] * (1 - b) + Q[0][0] * transition1[1][0] * b
#         #print(sum2)
#         if sum2!=0:
#             b_new=sum1/sum2
#
#     if  reward== 2:
#         sum1 = Q[0][1] * transition1[1][1] * b + P[0][1] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][1] * transition1[0][0] * (1 - b) + Q[0][1] * transition1[1][0] * b
#         # print(sum2)
#         if sum2 != 0:
#             b_new = sum1 / sum2
#
#     if  reward==0:
#         b_new = b*transition2[1][1]+(1-b)*transition2[0][1]
#     return b_new





# def b_update(reward_active,reward_passive,transition1,transition2,reward,b):
#     Q=reward_active
#     P= reward_passive
#     b_new=0
#     if reward== 1:
#         sum1 = Q[0][0] * transition1[1][1] * b + P[0][0] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][0] * transition1[0][0] * (1 - b) + Q[0][0] * transition1[1][0] * b
#         #print(sum2)
#         if sum2!=0:
#             b_new=sum1/sum2
#
#     if  reward== 5:
#         sum1 = Q[0][1] * transition1[1][1] * b + P[0][1] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][1] * transition1[0][0] * (1 - b) + Q[0][1] * transition1[1][0] * b
#         # print(sum2)
#         if sum2 != 0:
#             b_new = sum1 / sum2
#
#     if  reward== 10:
#         sum1 = Q[0][2] * transition1[1][1] * b + P[0][2] * transition1[0][1] * (1 - b)
#         # print(sum1)
#         sum2 = sum1 + P[0][2] * transition1[0][0] * (1 - b) + Q[0][2] * transition1[1][0] * b
#         # print(sum2)
#         if sum2 != 0:
#             b_new = sum1 / sum2
#
#     if  reward==0:
#         b_new = b*transition2[1][1]+(1-b)*transition2[0][1]
#     return b_new





def find_max_matrix(T1):
    """Return the largest of the four entries of a 2x2 matrix.

    Args:
        T1: anything indexable as ``T1[row][col]`` for rows and columns 0-1.

    Returns:
        The maximum entry.  On ties the earlier candidate is returned (first
        column before second, first row before second).
    """
    top_best = T1[0][0] if T1[0][0] >= T1[0][1] else T1[0][1]
    bottom_best = T1[1][0] if T1[1][0] >= T1[1][1] else T1[1][1]
    return top_best if top_best >= bottom_best else bottom_best


# Per-arm 2x2 table of zeros and per-arm single-element list of zeros; every
# inner list is a distinct object.
res = [[[0, 0] for _ in range(2)] for _ in range(N)]
value_max = [[0] for _ in range(N)]


def array_diff(a, b):
  """Return the distinct elements of ``a`` that do not occur in ``b``.

  The result is built from a set, so duplicates are dropped and the order of
  the returned list is whatever the set yields.
  """
  remaining = set(a).difference(set(b))
  return list(remaining)

def bellman(N, M, state, key):
    """Choose which arms to activate from their current beliefs.

    Every arm whose belief equals ``key`` exactly is activated.  If that yields
    fewer than ``M`` arms, the shortfall is filled from the remaining arms:
    those with the largest beliefs when ``key == 1``, the smallest when
    ``key == 0`` (any other ``key`` adds nothing).

    Args:
        N: number of arms; the length of the returned action vector.
        M: minimum number of arms to activate when enough arms exist.
        state: per-arm belief values.
        key: preferred belief extreme, 0 or 1.

    Returns:
        A list of ``N`` zeros and ones, with 1 marking an activated arm.  When
        more than ``M`` arms match ``key`` exactly, all of them are activated;
        no cap is applied.

    The fill step ranks arm *indices* by belief.  The earlier implementation
    ranked belief values and mapped them back with ``list.index``, which
    resolves tied beliefs to the same arm and therefore activated too few arms
    when ``M > 1``.
    """
    chosen = [arm for arm in range(len(state)) if state[arm] == key]

    if len(chosen) < M:
        already = set(chosen)
        candidates = [arm for arm in range(N) if arm not in already]
        shortfall = M - len(chosen)

        def belief_of(arm):
            return state[arm]

        if key == 1:
            chosen.extend(heapq.nlargest(shortfall, candidates, key=belief_of))
        elif key == 0:
            chosen.extend(heapq.nsmallest(shortfall, candidates, key=belief_of))

    action = [0] * N
    for arm in chosen:
        action[arm] = 1
    return action


def beta_update(belief, a1, b1, a2, b2, a3, b3, a4, b4, action):
    """Increment one of eight counters based on the last two beliefs.

    The last two entries of ``belief`` are combined into four weights, one per
    possible (previous, current) state pair.  The pair with the largest weight
    decides which counter grows by one; ``action`` decides whether the counter
    comes from the first four (``action == 1``) or the last four.

    Pair to counter mapping (first group / second group):
        high -> high : a2 / a4
        high -> low  : b2 / b4
        low  -> low  : a1 / a3
        low  -> high : b1 / b3

    Ties are resolved in the order listed above.

    Returns:
        The tuple ``(a1, b1, a2, b2, a3, b3, a4, b4)`` after the increment.
    """
    before, after = belief[-2], belief[-1]
    w1 = before * after
    w2 = before * (1 - after)
    w3 = (1 - before) * after
    w4 = (1 - before) * (1 - after)
    AA = [w1, w2, w3, w4]
    peak = max(AA)

    # Offset of the counter inside its group of four: a=0, b=1 for the
    # "low" row, a=2, b=3 for the "high" row.
    if peak == w1:
        offset = 2
    elif peak == w2:
        offset = 3
    elif peak == w4:
        offset = 0
    else:
        offset = 1

    counters = [a1, b1, a2, b2, a3, b3, a4, b4]
    group_start = 0 if action == 1 else 4
    counters[group_start + offset] = counters[group_start + offset] + 1
    return tuple(counters)



def find_optimal_action(q1, q2, r1, r2):
    """Run 100 value-iteration sweeps and report which state is worth more.

    Args:
        q1, q2: 2x2 transition matrices forwarded to
            ``calculation_value_function``.
        r1, r2: 1x2 reward-probability rows forwarded likewise.

    Returns:
        0 if the best Q-value of state 0 is at least the best Q-value of
        state 1 after the final sweep, otherwise 1.
    """
    # The value vector must be floating point: calculation_value_function
    # writes the backed-up values into it in place, and an integer array would
    # silently truncate them on every sweep.
    value = np.zeros((1, 2), dtype='float32')

    q_table = None
    for _ in range(100):
        # ``value`` is updated in place, so each sweep starts from the
        # previous sweep's result.
        q_table, value = calculation_value_function(q1, q2, r1, r2, value)

    best_in_state0 = q_table[0][0] if q_table[0][0] >= q_table[0][1] else q_table[0][1]
    best_in_state1 = q_table[1][0] if q_table[1][0] >= q_table[1][1] else q_table[1][1]

    return 0 if best_in_state0 >= best_in_state1 else 1

def generate_event_prob(T1, weight):
    """Enumerate all binary sequences of length ``T1`` with their probabilities.

    Position ``j`` of a sequence contributes ``weight[j]`` when it is 1 and
    ``1 - weight[j]`` when it is 0; a sequence's probability is the product of
    its positions' contributions.

    Args:
        T1: sequence length.
        weight: per-position probability of a 1; needs at least ``T1`` entries.

    Returns:
        ``(s, belief)``: the list of 0/1 tuples in ``itertools.product`` order
        and the list of their probabilities, aligned by index.
    """
    outcomes = list(it.product(range(2), repeat=T1))

    likelihoods = []
    for outcome in outcomes:
        likelihood = 1
        for position, bit in enumerate(outcome):
            factor = (1 - weight[position]) if bit == 0 else weight[position]
            likelihood = likelihood * factor
        likelihoods.append(likelihood)

    return outcomes, likelihoods


def bayesian_pomdp(seq, belief):
    """Sample two transition probabilities from Beta mixtures over candidate paths.

    For every candidate sequence in ``seq`` the four 0/1 transition types are
    counted.  Each sequence contributes one Beta component per mixture:
    ``Beta(1 + n00, 1 + n01)`` to the first and ``Beta(1 + n11, 1 + n10)`` to
    the second.  The components are weighted by ``belief`` and one sample is
    drawn from each mixture.

    Args:
        seq: list of 0/1 sequences.
        belief: mixture weights aligned with ``seq``.

    Returns:
        ``(P00, P11)``: one sample from each mixture.
    """
    slot_of = {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}

    # count[j] = [n00, n01, n10, n11] for sequence j.
    count = []
    for path in seq:
        tally = [0, 0, 0, 0]
        for pos in range(len(path) - 1):
            slot = slot_of.get((path[pos], path[pos + 1]))
            if slot is not None:
                tally[slot] += 1
        count.append(tally)

    Mixture1 = []
    Mixture2 = []
    for n00, n01, n10, n11 in count:
        Mixture1.append(BetaDistribution(1 + n00, 1 + n01))
        Mixture2.append(BetaDistribution(1 + n11, 1 + n10))

    model_1 = GeneralMixtureModel(Mixture1, weights=belief)
    model_2 = GeneralMixtureModel(Mixture2, weights=belief)

    P00 = model_1.sample()
    P11 = model_2.sample()
    return P00, P11

# def beta_update(belief,a1,b1,a2,b2,a3,b3,a4,b4,action):
#     w1=belief[-2]*belief[-1]
#     w2=belief[-2]*(1-belief[-1])
#     w3=(1-belief[-2])*belief[-1]
#     w4=(1-belief[-2])*(1-belief[-1])
#     AA=[w1,w2,w3,w4]
#     if action == 1:
#         if max(AA)==w1:
#             a2 = a2 + 1
#         elif max(AA)==w2:
#             b2 = b2 + 1
#         elif max(AA)==w4:
#             a1 = a1 + 1
#         else:
#             b1 = b1 + 1
#
#     else:
#         if max(AA)==w1:
#             a4 = a4 + 1
#         elif max(AA)==w2:
#             b4 = b4 + 1
#         elif max(AA)==w4:
#             a3 = a3 + 1
#         else:
#             b3 = b3 + 1
#
#     return a1,b1,a2,b2,a3,b3,a4,b4


def reward_distri_update(action, reward, belief, c1, d1, c2, d2):
    """Update the reward counters from the rewards of the activated arms.

    Only positions with a non-zero action are considered.  ``belief`` is a
    single number that selects the counter pair for all of them:

    * ``belief <= 0.5``: reward -10 increments ``c1``, reward 10 increments ``d1``.
    * ``belief > 0.5``:  reward 10 increments ``c2``, reward 20 increments ``d2``.

    Any other reward leaves the counters unchanged.

    Returns:
        The tuple ``(c1, d1, c2, d2)`` after the updates.
    """
    for pos in range(len(action)):
        if action[pos] == 0:
            continue
        observed = reward[pos]
        if belief <= 0.5:
            if observed == -10:
                c1 += 1
            elif observed == 10:
                d1 += 1
        if belief > 0.5:
            if observed == 10:
                c2 += 1
            elif observed == 20:
                d2 += 1
    return c1, d1, c2, d2





# Bookkeeping shared by all runs.
access = [0] * N
optimal_solution = 0
sum_final = []
sum_duo = []  # one list of per-step rewards per simulation run

# Lower bounds applied to sampled probabilities: epison_1 for transition
# probabilities, epison_2 for reward probabilities.
epison_1 = 0.1
epison_2 = 0.1

# Parameters of the simulated environment: one Beta(1, 1) draw each, rescaled
# from [0, 1] into [epison, 1].  ``a`` drives the transition matrices and ``b``
# the reward probabilities that the simulation loop builds.
a = (1 - epison_1) * st.beta.rvs(1, 1, size=1)[0] + epison_1
b = (1 - epison_2) * st.beta.rvs(1, 1, size=1)[0] + epison_2


def _simulate_step(action, state, reward, reward_log):
    """Play one time step of the simulated environment.

    Every activated arm draws a reward from the value list matching its
    current hidden state; passive arms earn 0.  A snapshot of the rewards is
    appended to ``reward_log`` and each arm's hidden state then advances by one
    transition.  ``state`` and ``reward`` are updated in place.  The reward
    value lists and transition tables are read from the module globals set up
    at the start of each run.
    """
    for i in range(N):
        if action[i] == 1 and state[i] == 1:
            reward[i] = number_of_certain_probability(value_list_active, probability_active)
        if action[i] == 1 and state[i] == 0:
            reward[i] = number_of_certain_probability(value_list_passive, probability_passive)
        if action[i] == 0:
            reward[i] = 0

    # Log a copy: ``reward`` is overwritten on the next step, so appending the
    # list object itself would make every log entry alias the latest rewards
    # and the belief replay would never see the real history.
    reward_log.append(list(reward))

    for i in range(N):
        if action[i] == 1:
            state[i] = activity_forecast(state[i], transitionName_active, transitionMatrix_active, 1)
        else:
            state[i] = activity_forecast(state[i], transitionName_passive, transitionMatrix_passive, 1)


def _update_beliefs(reward, belief):
    """Apply ``b_update`` to every arm using the current estimates r1, r2, q1, q2."""
    for i in range(len(reward)):
        belief[i] = b_update(r2, r1, q1, q2, reward[i], belief[i])


def _step_total(reward):
    """Return the sum of this step's rewards over all arms."""
    total = 0
    for value in reward:
        total = total + value
    return total


for j in tqdm(range(100)):

    # ---- per-run state ----
    state = [0] * N
    belief = [0.5] * N
    reward = [0] * N
    action = [0] * N
    total_reward = []
    seq = []
    jiaquan = []

    lists_state = [[0] for _ in range(N)]

    # ---- simulated environment (built from the module-level draws a and b) ----
    transition_active = np.array([[a, 1 - a], [1 - a, a]])
    transition_passive = np.array([[a, 1 - a], [1 - a, a]])

    transitionName_active = [["00", "01"], ["10", "11"]]
    transitionMatrix_active = [[a, 1 - a], [1 - a, a]]

    transitionName_passive = [["00", "01"], ["10", "11"]]
    transitionMatrix_passive = [[a, 1 - a], [1 - a, a]]
    q1 = np.array([[a, 1 - a], [1 - a, a]])
    q2 = np.array([[a, 1 - a], [1 - a, a]])

    value_list_active = [10, 20]
    probability_active = [1 - b, b]
    value_list_passive = [-10, 10]
    probability_passive = [b, 1 - b]

    r1 = np.array([[b, 1 - b]])
    r2 = np.array([[1 - b, b]])

    # ---- Beta pseudo-counts, all starting at 1 ----
    a1 = 1
    b1 = 1
    a2 = 1
    b2 = 1

    a3 = 1
    b3 = 1
    a4 = 1
    b4 = 1

    c1 = 1
    d1 = 1

    c2 = 1
    d2 = 1

    # ---- initial sampled estimates of the reward and transition models ----
    w00 = st.beta.rvs(c1, d1, size=1)
    w11 = st.beta.rvs(c2, d2, size=1)

    # NOTE(review): the first entry of each row is floored at epison_2 while
    # the second is 1 - w, so the rows do not sum to exactly 1 -- confirm
    # whether ``1 - r[0][0]`` was intended.
    r1[0][0] = w00[0] * (1 - epison_2) + epison_2
    r1[0][1] = 1 - w00[0]
    r2[0][0] = w11[0] * (1 - epison_2) + epison_2
    # This line used to write r1[0][1] a second time, clobbering the value set
    # two lines above and leaving r2[0][1] untouched.
    r2[0][1] = 1 - w11[0]

    a00 = st.beta.rvs(a1, b1, size=1)
    a11 = st.beta.rvs(a2, b2, size=1)
    p00 = st.beta.rvs(a3, b3, size=1)
    p11 = st.beta.rvs(a4, b4, size=1)

    q1[0][0] = a00[0] * (1 - epison_1) + epison_1
    q1[0][1] = 1 - q1[0][0]

    q1[1][1] = a11[0] * (1 - epison_1) + epison_1
    q1[1][0] = 1 - q1[1][1]

    q2[0][0] = p00[0] * (1 - epison_1) + epison_1
    q2[0][1] = 1 - q2[0][0]

    q2[1][1] = p11[0] * (1 - epison_1) + epison_1
    q2[1][0] = 1 - q2[1][1]

    access = [0] * N
    optimal_solution = 0
    reward_record_explo = []

    m = 250   # rounds per run

    T1 = 200  # nominal round length; round w exploits for T1 - T2 + w steps
    T2 = 50   # exploration steps per round, split evenly between the two arms

    for w in range(m):

        seq = []
        jiaquan = []

        # ---- exploration, first half: only arm 0 is activated ----
        for k in range(int(T2 / 2)):
            action = [1, 0]
            _simulate_step(action, state, reward, reward_record_explo)

            c1, d1, c2, d2 = reward_distri_update(action, reward, belief[0], c1, d1, c2, d2)

            _update_beliefs(reward, belief)
            for i in range(len(belief)):
                lists_state[i].append(belief[i])

            total_reward.append(_step_total(reward))

        # ---- exploration, second half: only arm 1 is activated ----
        for k in range(int(T2 / 2), T2):
            action = [0, 1]
            _simulate_step(action, state, reward, reward_record_explo)

            c1, d1, c2, d2 = reward_distri_update(action, reward, belief[1], c1, d1, c2, d2)

            _update_beliefs(reward, belief)
            for i in range(len(belief)):
                lists_state[i].append(belief[i])

            total_reward.append(_step_total(reward))

        # ---- re-estimate the transition model from arm 0's belief history ----
        # Beliefs are hardened to 0/1 in place, then consecutive pairs are
        # counted into the Beta pseudo-counts.
        # NOTE(review): the whole history is re-counted every round, so early
        # transitions are counted once per round -- confirm this is intended.
        for i in range(len(lists_state[0])):
            lists_state[0][i] = 1 if lists_state[0][i] >= 0.5 else 0

        for previous, current in zip(lists_state[0], lists_state[0][1:]):
            if previous == 0 and current == 0:
                a1 = a1 + 1
            elif previous == 0 and current == 1:
                b1 = b1 + 1
            elif previous == 1 and current == 1:
                a2 = a2 + 1
            else:
                b2 = b2 + 1

        a00 = st.beta.rvs(a1, b1, size=1)
        a11 = st.beta.rvs(a2, b2, size=1)
        # p00 / p11 are still drawn so the random stream is unchanged, even
        # though q2 is tied to q1 below.
        p00 = st.beta.rvs(a3, b3, size=1)
        p11 = st.beta.rvs(a4, b4, size=1)

        q1[0][0] = a00[0] * (1 - epison_1) + epison_1
        q1[0][1] = 1 - q1[0][0]

        q1[1][1] = a11[0] * (1 - epison_1) + epison_1
        q1[1][0] = 1 - q1[1][1]

        # From here on both names refer to the same array.
        q2 = q1

        # ---- re-estimate the reward model ----
        w00 = st.beta.rvs(c1, d1, size=1)
        w11 = st.beta.rvs(c2, d2, size=1)

        r1[0][0] = w00[0] * (1 - epison_2) + epison_2
        r1[0][1] = 1 - w00[0]
        r2[0][0] = w11[0] * (1 - epison_2) + epison_2
        r2[0][1] = 1 - w11[0]  # same fix as in the per-run initialisation above

        # ---- replay the whole reward history under the new estimates ----
        belief = [0.5] * N
        for past_reward in reward_record_explo:
            for arm in range(N):
                belief[arm] = b_update(r2, r1, q1, q2, past_reward[arm], belief[arm])

        optimal_solution = find_optimal_action(q1, q2, r1, r2)

        # ---- exploitation: activate one arm per step via bellman ----
        for t in range(T1 - T2 + w):
            action = bellman(N, 1, belief, optimal_solution)
            _simulate_step(action, state, reward, reward_record_explo)
            _update_beliefs(reward, belief)
            total_reward.append(_step_total(reward))

    sum_duo.append(total_reward)

# print(sum_duo)


# Turn each run's per-step rewards into a running discounted sum, in place:
# entry t becomes the sum over s <= t of beta**s * reward[s].
for run_rewards in sum_duo:
    running = 0
    for step in range(len(run_rewards)):
        running = running + np.power(beta, step) * run_rewards[step]
        run_rewards[step] = running

# Average the cumulative reward over all runs for the first T_max steps.
# Every run must therefore contain at least T_max steps.
final = [0] * T_max
for i in range(T_max):
    for run_rewards in sum_duo:
        final[i] = final[i] + run_rewards[i]
    final[i] = final[i] / len(sum_duo)

# One averaged cumulative reward per line.  The context manager closes the
# file even if a write fails.
with open("beta_mixture.txt", 'w') as file2:
    for value in final:
        file2.write(str(value) + '\n')

# Convert cumulative reward into average reward per step.
for i in range(len(final)):
    final[i] = final[i] / (i + 1)

# The slice stops before the last element, so four values are printed.
print(final[-5:-1])

# file3 = open("avg_copy.txt", 'w')
# for i in range(len(final)):
#     file3.write(str(final[-1]*(i+1)) + '\n')
# file3.close()






