import numpy as np
import xgboost as xgb

def inversegap(scorelist, gamma, K):
    """Sample an action index with inverse-gap weighting.

    Every non-best entry gets probability ``1 / (K + gamma * gap)``, where
    ``gap`` is its distance to the best score; the best entry receives
    whatever probability mass is left over so the weights sum to one.

    Args:
        scorelist: NumPy array of per-action scores. It is indexed with the
            flat ``argmax``, so it is presumably 1-D or a ``(K, 1)`` column
            -- confirm against callers.
        gamma: Multiplier applied to the gaps; larger values concentrate
            the distribution on the best-scoring action.
        K: Constant added in the denominator (callers in this file pass the
            number of actions).

    Returns:
        Index of the action drawn from the resulting distribution.
    """
    best = np.argmax(scorelist)
    gaps = scorelist[best] - scorelist
    probs = 1 / (K + gamma * gaps)

    # The best action absorbs the remaining mass (everything before and
    # after its position is subtracted from one).
    mass_before = np.sum(probs[:best])
    mass_after = np.sum(probs[best + 1:])
    probs[best] = 1.0 - mass_before - mass_after

    # A single multinomial trial yields a one-hot vector; its argmax is the
    # sampled index.
    one_hot = np.random.multinomial(1, probs.flatten())
    return np.argmax(one_hot)

class AUPD:
    """Action selector that trades predicted reward against predicted cost.

    A virtual queue ``Q`` accumulates the predicted cost of the chosen
    actions in excess of the per-round budget ``b``; the ratio ``Q / V``
    is the price applied to predicted cost when ranking actions.
    """

    def __init__(self, K=4, dim=5, T=5000, b=1):
        """Store the problem sizes and initialise the virtual queue.

        Args:
            K: Number of actions.
            dim: Stored as ``self.dim``; not read elsewhere in this class.
            T: Horizon; only used to scale ``V``.
            b: Per-round budget subtracted from the queue each round.
        """
        self.K = K
        self.dim = dim
        self.b = b
        # Virtual queue and its scaling constant.
        self.Q = 0
        self.V = 0.1 * b * np.sqrt(T)

    def take_action(self, context, rmodel, cmodel):
        """Pick the action maximising ``reward - (Q / V) * cost``.

        Both models must expose ``predict(context)`` returning ``K`` values.
        The queue is advanced using the *predicted* cost of the chosen
        action.

        Returns:
            Index of the selected action.
        """
        est_reward = rmodel.predict(context).reshape(self.K, 1)
        est_cost = cmodel.predict(context).reshape(self.K, 1)

        price = self.Q / self.V
        scores = est_reward - price * est_cost
        chosen = np.argmax(scores)

        # Queue update, clipped at zero.
        backlog = self.Q + est_cost[chosen] - self.b
        self.Q = max(backlog, 0)
        return chosen

    def update(self, context, action, rmodel, cmodel, reward, cost):
        """Feed the observed reward and cost back into the two models.

        The reward model is updated with the chosen action's context, the
        cost model with the action index itself.
        """
        chosen_context = context[action]
        rmodel.update(chosen_context, reward)
        cmodel.update(action, cost)

class PGD:
    """Primal-dual action selector with a step size that doubles per epoch.

    The dual variable ``Q`` is moved by ``gamma`` times the predicted
    constraint violation of the chosen action. Whenever the observed cost
    accumulated in the current epoch exceeds its budget by more than ``M``,
    a new epoch starts with a doubled step size.
    """

    def __init__(self, K=4, dim=5, T=5000, b=1):
        """Initialise step size, restart threshold and epoch bookkeeping.

        Args:
            K: Number of actions.
            dim: Context dimension; enters the restart threshold ``M``.
            T: Horizon; scales ``gamma`` and ``M``.
            b: Per-round budget.
        """
        self.K = K
        self.dim = dim
        self.T = T
        self.b = b
        # The budget is tightened by this margin wherever it is used.
        self.margin = 0.001
        # Dual variable and its step size.
        self.Q = 1
        self.gamma = 1 / np.sqrt(T)
        # Excess-cost threshold that triggers a new epoch.
        self.M = 4 * np.sqrt(T) + 2 * np.sqrt(dim * T * np.log(T))
        # Epoch state: index, first round of the epoch, costs seen in it.
        self.k = 1
        self.Tk = 1
        self.costlist = []
        # Number of take_action calls so far.
        self.round = 0

    def take_action(self, context, rmodel, cmodel):
        """Select ``argmax(reward - Q * (cost - (b - margin)))``.

        The dual variable is then stepped using the predicted cost of the
        chosen action and clipped at zero, and the round counter advances.

        Returns:
            Index of the selected action.
        """
        est_reward = rmodel.predict(context).reshape(self.K, 1)
        est_cost = cmodel.predict(context).reshape(self.K, 1)

        tightened_budget = self.b - self.margin
        scores = est_reward - self.Q * (est_cost - tightened_budget)
        chosen = np.argmax(scores)

        step = self.gamma * (est_cost[chosen] - tightened_budget)
        self.Q = max(self.Q + step, 0)
        self.round += 1
        return chosen

    def update(self, context, action, rmodel, cmodel, reward, cost):
        """Update both models and start a new epoch if cost overshoots.

        The reward model receives the chosen action's context, the cost
        model the action index. The observed cost is appended to the
        current epoch's list before the overshoot check.
        """
        rmodel.update(context[action], reward)
        cmodel.update(action, cost)
        self.costlist.append(cost)

        rounds_in_epoch = self.round - self.Tk + 1
        overshoot = np.sum(self.costlist) - rounds_in_epoch * (self.b - self.margin)
        if not (overshoot > self.M):
            return

        # New epoch: forget the epoch's costs and double the step size.
        self.k = self.k + 1
        self.costlist = []
        self.gamma = np.power(2, self.k) / np.sqrt(self.T)
        self.Tk = self.round + 1


class gb5:
    """Pre-trained gradient-boosting regressor that ignores online updates.

    Wraps ``xgb.XGBRegressor``; the model is fitted once via ``pretrain``
    and ``update`` leaves it untouched.
    """

    def __init__(self, depth = 5):
        """Create the regressor with ``max_depth=depth``."""
        regressor = xgb.XGBRegressor(max_depth=depth)
        self.model = regressor

    def pretrain(self, X_train, y_train):
        """Fit the wrapped regressor on the given training data."""
        self.model.fit(X_train, y_train)

    def predict(self, context):
        """Return the wrapped regressor's predictions for ``context``."""
        predictions = self.model.predict(context)
        return predictions

    def update(self, action, value):
        """No-op kept for interface compatibility; always returns 1."""
        return 1

class LinUCB:
    """Ridge-regression estimator that reports upper confidence bounds.

    Maintains the regularised Gram matrix ``A`` and the response vector
    ``b`` shared by all actions; ``predict`` returns, per action, the
    estimated mean plus ``beta`` times an exploration bonus.
    """

    def __init__(self, K = 4, dim = 5, lamb = 1, beta = 0.01):
        """Set up the estimator.

        Args:
            K: Number of actions scored by ``predict``.
            dim: Dimension of each context vector.
            lamb: Ridge regularisation; ``A`` starts as ``lamb * I``.
            beta: Weight of the exploration bonus.
        """
        self.K = K
        self.dim = dim
        self.lamb = lamb
        self.beta = beta
        self.round = 0
        self.A = self.lamb * np.identity(self.dim)
        self.b = np.zeros((self.dim, 1))

    def predict(self, context):
        """Return the upper confidence bound of each of the ``K`` actions.

        Args:
            context: Indexable of ``K`` NumPy context vectors; presumably
                each has length ``dim`` -- confirm against callers.

        Returns:
            Float array with one leading entry per action (shape ``(K, 1)``
            when each context is a 1-D vector).
        """
        A_inv = np.linalg.inv(self.A)
        theta_est = A_inv.dot(self.b)
        ucb = []
        for i in range(self.K):
            x = context[i]
            mu_est = theta_est.T.dot(x)
            bonus = np.sqrt((x.T.dot(A_inv)).dot(x))
            ucb.append(np.array(mu_est + self.beta * bonus))
        # The builtin ``float`` replaces the ``np.float`` alias, which was
        # removed from NumPy and raises AttributeError on current releases.
        return np.array(ucb, dtype=float)

    def update(self, context, reward):
        """Add one (context, reward) observation to ``A`` and ``b``."""
        self.A += np.outer(context, context)
        self.b += reward * context.reshape(-1, 1)

    def reset(self):
        """Discard all observations, restoring ``A = lamb * I`` and ``b = 0``."""
        self.A = self.lamb * np.identity(self.dim)
        self.b = np.zeros((self.dim, 1))



class gb5new:
    """Gradient-boosting estimator built on the native ``xgb.train`` API.

    Unlike a fit-once wrapper, ``update`` continues training the existing
    booster on each new observation.
    """

    def __init__(self, depth=5):
        """Record the tree depth and training parameters; no model yet."""
        self.K = depth
        self.model = None
        self.params = dict(max_depth=depth, objective='reg:squarederror')

    def pretrain(self, X_train, y_train):
        """Train a fresh booster on the given data, replacing any model."""
        training_set = xgb.DMatrix(X_train, label=y_train)
        self.model = xgb.train(self.params, training_set)

    def predict(self, context):
        """Return the booster's predictions for ``context``."""
        features = xgb.DMatrix(context)
        return self.model.predict(features)

    def update(self, context, value):
        """Continue training from the current booster on one observation.

        ``context`` is reshaped to a single row and ``value`` to a column,
        so both must provide ``reshape`` (e.g. NumPy arrays or scalars).
        """
        row = context.reshape(1, -1)
        label = value.reshape(-1, 1)
        observation = xgb.DMatrix(row, label)
        self.model = xgb.train(self.params, observation, xgb_model=self.model)


class meanvalue:
    """Context-free estimator holding the running mean value of each action."""

    def __init__(self, K):
        """Start with zero estimates for ``K`` actions."""
        self.estimate = np.zeros(K)
        # Number of observations folded into each action's mean.
        self.counts = np.zeros(K, dtype=int)
        # Total number of update() calls across all actions.
        self.round = 0

    def predict(self, context):
        """Return the current estimates; ``context`` is ignored."""
        return self.estimate

    def update(self, action, value):
        """Fold ``value`` into the running mean of ``action``.

        The mean is normalised by how many times this particular action has
        been updated. Dividing by the global round counter instead would
        shrink the estimate of any action that is not updated every round.
        """
        self.round += 1
        self.counts[action] += 1
        seen = self.counts[action]
        self.estimate[action] += (value - self.estimate[action]) / seen

    def reset(self):
        """Clear all estimates and counters in place."""
        self.estimate[:] = 0
        self.counts[:] = 0
        self.round = 0

class SquareCBwK:
    """Randomised action selector using inverse-gap weighting.

    Actions are scored by predicted reward minus predicted cost in excess
    of the budget ``b``; the action is then sampled by ``inversegap``.
    """

    def __init__(self, K=4, dim=5, T=5000, U=30, b=1):
        """Store the problem sizes and derive the exploration parameter.

        Args:
            K: Number of actions.
            dim: Stored as ``self.dim``; not read elsewhere in this class.
            T: Horizon; enters ``gamma``.
            U: Divisor inside ``gamma``.
            b: Per-round budget subtracted from predicted cost.
        """
        self.K = K
        self.dim = dim
        self.b = b
        # Q is initialised but not read by any method of this class.
        self.Q = 0
        # take_action passes gamma * beta to inversegap.
        self.beta = 1
        self.gamma = np.sqrt(K * T / U) / 10

    def take_action(self, context, rmodel, cmodel):
        """Sample an action from the inverse-gap distribution of the scores.

        Both models must expose ``predict(context)`` returning ``K`` values.
        NaN scores are replaced via ``np.nan_to_num`` before sampling.

        Returns:
            Index of the sampled action.
        """
        est_reward = rmodel.predict(context).reshape(self.K, 1)
        est_cost = cmodel.predict(context).reshape(self.K, 1)
        scores = est_reward - (est_cost - self.b)
        cleaned = np.nan_to_num(scores)
        return inversegap(cleaned, self.gamma * self.beta, self.K)

    def update(self, context, action, rmodel, cmodel, reward, cost):
        """Feed the observed reward and cost back into the two models.

        The reward model is updated with the chosen action's context, the
        cost model with the action index itself.
        """
        chosen_context = context[action]
        rmodel.update(chosen_context, reward)
        cmodel.update(action, cost)
