import numpy as np
from sympy import *
import numpy as np
import time

def UCB_value(experienced_mu,count,t,CB_coefficient):
    """Return an upper confidence bound on an arm's mean, capped at 1.

    The uncapped bound is
    ``experienced_mu + CB_coefficient * sqrt(log(t + 1) / count)``.

    Args:
        experienced_mu: Current empirical mean of the arm.
        count: Number of observations backing ``experienced_mu``.
        t: Zero-based round index.
        CB_coefficient: Multiplier applied to the exploration bonus.

    Returns:
        ``1`` when the bound is at least 1 or when ``count`` is not
        positive; otherwise the bound itself. A NaN input propagates as NaN.
    """
    if count <= 0:
        # With no observations the bonus is unbounded, so the capped value
        # is 1. Returning early also avoids the 0/0 = NaN case at t == 0,
        # which previously left the result unassigned.
        return 1
    bound = experienced_mu + CB_coefficient * np.sqrt(np.log(t + 1) / count)
    if bound >= 1:
        return 1
    return bound

def LCB_value(experienced_cost,count,t,CB_coefficient):
    """Return a lower confidence bound on an arm's cost, floored at 0.

    The unfloored bound is
    ``experienced_cost - CB_coefficient * sqrt(log(t + 1) / count)``.

    Args:
        experienced_cost: Current empirical mean cost of the arm.
        count: Number of observations backing ``experienced_cost``.
        t: Zero-based round index.
        CB_coefficient: Multiplier applied to the confidence radius.

    Returns:
        ``0`` when the bound is at most 0 or when ``count`` is not
        positive; otherwise the bound itself. A NaN input propagates as NaN.
    """
    if count <= 0:
        # With no observations the radius is unbounded, so the floored value
        # is 0. Returning early also avoids the 0/0 = NaN case at t == 0,
        # which previously left the result unassigned.
        return 0
    bound = experienced_cost - CB_coefficient * np.sqrt(np.log(t + 1) / count)
    if bound <= 0:
        return 0
    return bound

def combination_list(l,n):
    """Return every ``n``-element combination of ``l`` as a list of lists.

    Combinations keep the relative order of the elements in ``l`` and are
    emitted in lexicographic order of their positions, e.g.
    ``combination_list([1, 2, 3], 2) == [[1, 2], [1, 3], [2, 3]]``.

    Args:
        l: Sequence of items to choose from.
        n: Size of each combination.

    Returns:
        A list of fresh lists (never ``l`` itself). ``[[]]`` when ``n == 0``,
        and ``[]`` when ``n`` is negative or larger than ``len(l)``.
    """
    # Function-scope import so this definition does not rely on a
    # module-level import of itertools.
    import itertools

    if n < 0:
        # itertools.combinations raises ValueError for a negative size;
        # there are simply no such combinations.
        return []
    return [list(chosen) for chosen in itertools.combinations(l, n)]

def combination(L,n):
    """Return all ``n``-element subsets of the indices ``0 .. L-1``.

    The indices are taken from ``np.arange(L)`` and the subsets are produced
    by ``combination_list``.
    """
    arm_indices = [index for index in np.arange(L)]
    return combination_list(arm_indices, n)

class C2MAB_V_direct(object):
    """Bandit learner that selects ``K`` of ``L`` arms each round under a budget.

    Every round it builds an upper confidence bound on each arm's reward and
    a lower confidence bound on each arm's cost, enumerates all ``K``-subsets
    of arms, and plays the subset with the largest summed UCB among those
    whose summed LCB cost does not exceed ``C``.

    The environment object ``env`` must expose ``L``, ``C``, ``price``,
    ``cost``, ``cost_lower``, ``cost_upper`` and a ``feedback(At)`` method
    returning four values.
    """

    def __init__(self, K, env, T,CB_coefficient,log_ind,LCB_coefficient):
        """Store the configuration and allocate per-round result buffers.

        Args:
            K: Number of arms selected per round.
            env: Environment object (see class docstring for the attributes
                that are read from it).
            T: Number of rounds to run.
            CB_coefficient: Exploration multiplier for the reward UCB.
            log_ind: Stored as ``self.log_ind``; not used in this class.
            LCB_coefficient: Confidence multiplier for the cost LCB.
        """
        super(C2MAB_V_direct, self).__init__()
        self.K = K
        self.env = env
        self.T = T
        self.L = self.env.L
        self.C = self.env.C
        self.price = self.env.price
        self.exp_rewards = np.zeros(self.T)
        self.violation = np.zeros(self.T)
        self.exp_violation = np.zeros(self.T)
        # Counts start at 1 so the confidence radius is finite in the first
        # round; the random initial estimates drawn in run() therefore weigh
        # as one observation in the running means.
        self.choosing_count = np.ones(self.L)
        self.cost = self.env.cost
        self.CB_coefficient = CB_coefficient
        self.log_ind = log_ind
        self.LCB_coefficient = LCB_coefficient

    def _build_indicators(self, combination_all):
        """Turn each index subset into a length-``L`` 0/1 indicator vector."""
        indicators = []
        for subset in combination_all:
            indicator = np.zeros(self.L)
            for arm in subset:
                indicator[arm] = 1
            indicators.append(indicator)
        return indicators

    def _select_action(self, UCB_mu, LCB_cost, indicators):
        """Return the feasible indicator with the largest summed UCB.

        A candidate is feasible when its summed LCB cost minus ``self.C`` is
        not positive. Ties keep the earliest candidate. An all-zero vector is
        returned when no feasible candidate has a strictly positive reward.
        """
        best_reward = 0
        At = np.zeros(self.L)
        for indicator in indicators:
            if np.dot(LCB_cost, indicator) - self.C <= 0:
                candidate_reward = np.dot(UCB_mu, indicator)
                if candidate_reward > best_reward:
                    best_reward = candidate_reward
                    # Copy so that nothing downstream can modify the cached
                    # indicator that is reused in later rounds.
                    At = indicator.copy()
        return At

    def _update_estimates(self, At, users_choosing_L, feedback_cost,
                          experienced_mu, experienced_cost):
        """Fold one round of feedback into the running means, in place.

        For every arm selected in ``At`` the pull count is incremented and
        the cost mean is updated with ``feedback_cost[arm]``. The reward mean
        is updated with 0 when ``users_choosing_L[arm] == 0`` and with
        ``self.price[arm]`` when it equals 1; any other value leaves the
        reward mean untouched.
        """
        for arm in np.flatnonzero(At):
            self.choosing_count[arm] += 1
            pulls = self.choosing_count[arm]
            experienced_cost[arm] = (
                experienced_cost[arm] * (pulls - 1) + feedback_cost[arm]
            ) / pulls

            if users_choosing_L[arm] == 0:
                observed = 0
            elif users_choosing_L[arm] == 1:
                observed = self.price[arm]
            else:
                continue
            experienced_mu[arm] = (
                experienced_mu[arm] * (pulls - 1) + observed
            ) / pulls

    def run(self):
        """Run ``T`` rounds of selection and learning.

        Returns:
            A tuple ``(exp_rewards, violation, starttime)``: the per-round
            expected rewards reported by the environment, the per-round
            positive part of ``dot(cost, At) - C``, and the ``time.time()``
            stamp taken when the run started (a start time, not an elapsed
            duration).
        """
        starttime = time.time()
        # The candidate set does not change across rounds, so the indicator
        # vectors are built once here instead of once per round.
        indicators = self._build_indicators(combination(self.L, self.K))
        experienced_mu = np.random.uniform(0, 1, self.L)
        experienced_cost = np.random.uniform(
            self.env.cost_lower, self.env.cost_upper, self.L
        )

        for t in range(self.T):
            UCB_mu = np.zeros(self.L)
            LCB_cost = np.zeros(self.L)
            for i in range(self.L):
                UCB_mu[i] = UCB_value(
                    experienced_mu[i], self.choosing_count[i], t,
                    self.CB_coefficient,
                )
                LCB_cost[i] = LCB_value(
                    experienced_cost[i], self.choosing_count[i], t,
                    self.LCB_coefficient,
                )

            At = self._select_action(UCB_mu, LCB_cost, indicators)

            # Only budget overshoot is recorded; slack counts as 0.
            tmp_violation = np.dot(self.cost, At) - self.C
            self.violation[t] = tmp_violation if tmp_violation > 0 else 0

            users_choosing_L, feedback_cost, _total_price, exp_reward_t = (
                self.env.feedback(At)
            )
            self.exp_rewards[t] = exp_reward_t
            self._update_estimates(
                At, users_choosing_L, feedback_cost,
                experienced_mu, experienced_cost,
            )

        return self.exp_rewards, self.violation, starttime
