# -*- coding: utf-8 -*-
"""adaptiveDosageAllocation.ipynb

Automatically generated by Colab.
"""

import cvxpy as cp
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

"""In the high probability constraint setting, at each round the following protocol is followed:


*   The learner observes the context $X_t \in \mathbb{R}^d$
*   The learner selects its action $\alpha_t \in [0,1]$
*   The learner observes the reward and cost signals $r_t, c_t \in \mathbb{R}$

At each round the learner must satisfy the following constraint:
$\Pr(c_t \leq \tau) \geq 1-\delta$ for a known parameter $\tau > 0$.


"""

#function to compute the mu pessimistically so as it minimizes the length of the feasible set
#returns the maximum feasible dosage
def compute_feasible_set_mu(b_t,V_t,hat_mu,delta,sigma_tr,tau,X_t):
  """Return the largest dosage allowed under a pessimistic cost estimate.

  The pessimistic cost parameter is the point of the confidence ellipsoid
  ``{mu : (mu - hat_mu)^T V_t (mu - hat_mu) <= b_t**2}`` that maximises the
  expected cost ``X_t . mu``.  A linear function over an ellipsoid has a
  closed-form maximum (Cauchy-Schwarz in the ``V_t`` norm):

      max X_t . mu = X_t . hat_mu + |b_t| * sqrt(X_t^T V_t^{-1} X_t)

  so no numerical solver is needed.  This avoids solver failures and the
  slightly negative denominators (hence negative dosages) that an inexact
  solve can produce.

  Args:
    b_t: radius of the confidence ellipsoid.
    V_t: (d, d) regularised Gram matrix defining the ellipsoid.
    hat_mu: centre of the ellipsoid, shape (d,) or (d, 1).
    delta: confidence parameter of the constraint.
    sigma_tr: noise standard deviation.
    tau: cost threshold.
    X_t: current context, shape (d,).

  Returns:
    ``min(1, tau / (worst_case_cost + noise_margin))`` when the denominator
    is positive, otherwise 1 (the same convention as
    ``highProbConstRegime.compute_feasible_set``).
  """
  x = np.asarray(X_t, dtype=float).reshape(-1)
  centre = np.asarray(hat_mu, dtype=float).reshape(-1)

  # Squared V_t^{-1}-norm of the context; clipped at 0 to absorb round-off.
  x_inv_norm_sq = max(float(x @ np.linalg.solve(V_t, x)), 0.0)
  worst_case_cost = float(x @ centre) + abs(b_t) * np.sqrt(x_inv_norm_sq)

  # NOTE(review): `scale=sigma_tr` already multiplies the quantile by sigma_tr,
  # so the extra `* sigma_tr` looks like a double scaling.  It is kept as-is
  # because the environment and the initial dose use the same expression;
  # with sigma_tr = 1 it makes no difference.
  noise_margin = stats.norm.ppf(1-delta, loc=0, scale=sigma_tr)*sigma_tr

  denominator = worst_case_cost + noise_margin
  if denominator <= 0:
    # Even the worst plausible cost is non-positive: the full dose is allowed.
    return 1
  return min(1, tau/denominator)

#function to compute the theta optimistically
def compute_theta(b_t,V_t,hat_theta,sigma_tr,X_t):
  """Decide optimistically whether dosing is worthwhile for context ``X_t``.

  The optimistic reward is the maximum of ``X_t . theta`` over the confidence
  ellipsoid ``{theta : (theta - hat_theta)^T V_t (theta - hat_theta) <= b_t**2}``.
  That maximum has the closed form

      X_t . hat_theta + |b_t| * sqrt(X_t^T V_t^{-1} X_t)

  so it is computed directly rather than with a numerical solver.

  Args:
    b_t: radius of the confidence ellipsoid.
    V_t: (d, d) regularised Gram matrix defining the ellipsoid.
    hat_theta: centre of the ellipsoid, shape (d,) or (d, 1).
    sigma_tr: unused; kept so existing call sites keep working.
    X_t: current context, shape (d,).

  Returns:
    1 if the optimistic reward is non-negative, otherwise 0.

  Raises:
    ValueError: if ``V_t`` is singular, so the ellipsoid is not well defined.
  """
  x = np.asarray(X_t, dtype=float).reshape(-1)
  centre = np.asarray(hat_theta, dtype=float).reshape(-1)

  try:
    x_inv_norm_sq = float(x @ np.linalg.solve(V_t, x))
  except np.linalg.LinAlgError as err:
    raise ValueError("V_t must be positive definite.") from err

  # Clip at 0 so round-off cannot produce the square root of a negative number.
  optimistic_reward = float(x @ centre) + abs(b_t) * np.sqrt(max(x_inv_norm_sq, 0.0))
  return 1 if optimistic_reward >= 0 else 0

class highProbConstRegime:
    """Simulated environment with linear reward and cost signals.

    Holds a hidden reward vector ``theta`` and a hidden cost vector ``mu``
    (both random unit vectors), generates unit-norm contexts, and records
    every context and every optimal action it computes.
    """

    def __init__(self, d, delta, tau, sigma_tr):
        """Create the environment and draw its hidden parameters.

        d: number of dimensions
        delta: confidence parameter
        tau: toxicity threshold
        sigma_tr: standard deviation of the noise
        """
        self.d = d
        self.delta = delta
        self.tau = tau
        self.sigma_tr = sigma_tr

        # Hidden parameters; theta is drawn before mu.
        self.theta = self.get_theta()
        self.mu = self.get_mu()

        # Contexts are stored column-wise in a (d, n) array; optimal actions
        # in a flat array with one entry per call to compute_optimal_action.
        self.context = np.empty((self.d, 0))
        self.optimal_actions = np.empty((0,), dtype=float)

    def _sample_unit_vector(self):
        """Draw a standard normal vector of length d and scale it to norm 1."""
        draw = np.random.normal(0, 1, self.d)
        return draw / np.linalg.norm(draw)

    def compute_feasible_set(self):
        """Return the largest feasible dosage for the most recent context."""
        latest_context = self.context[:, -1]
        margin = stats.norm.ppf(1-self.delta, loc=0, scale=self.sigma_tr)*self.sigma_tr
        denominator = np.dot(latest_context, self.mu) + margin
        if not denominator > 0:
            return 1
        return min(1, self.tau/denominator)

    def compute_optimal_action(self):
        """Append the optimal action for the most recent context and return it.

        The optimal action is the largest feasible dosage when the expected
        reward of the context is strictly positive, and 0 otherwise.
        """
        max_dose = self.compute_feasible_set()
        expected_reward = np.dot(self.context[:, -1], self.theta)
        best = max_dose if expected_reward > 0 else 0
        self.optimal_actions = np.append(self.optimal_actions, best)
        return self.optimal_actions[-1]

    def get_theta(self):
        """Return a random reward vector with norm 1."""
        return self._sample_unit_vector()

    def get_mu(self):
        """Return a random cost vector with norm 1."""
        return self._sample_unit_vector()

    def get_context(self):
        """Draw a random unit-norm context, store it as a new column, return it."""
        unit_vec = self._sample_unit_vector()
        self.context = np.column_stack((self.context, unit_vec))
        return unit_vec

    def get_reward_and_cost(self, action):
        """Return noisy reward and cost for ``action`` on the most recent context.

        Both signals are linear in the action: ``action * <param, context>``
        plus independent Gaussian noise with standard deviation ``sigma_tr``.
        The reward noise is drawn before the cost noise.
        """
        latest_context = self.context[:, -1]
        reward = action * np.dot(self.theta, latest_context) + np.random.normal(0, self.sigma_tr)
        cost = action * np.dot(self.mu, latest_context) + np.random.normal(0, self.sigma_tr)
        return reward, cost

class Learner:
    """Dosage-allocation learner with a random and a least-squares (LSE) strategy.

    The learner keeps four histories that must stay aligned index by index:
    contexts (columns of ``context_history``), actions, rewards and costs.
    Contexts and feedback are reported by the caller through ``get_context``
    and ``get_reward_and_cost``; the action chosen by ``choose_action`` is
    remembered and logged together with the feedback that follows it.
    """

    def __init__(self, d, delta, tau, sigma_tr, strategy='random'):
        """Store the problem parameters and create empty histories.

        d: number of dimensions
        delta: confidence parameter
        tau: toxicity threshold
        sigma_tr: standard deviation of the noise
        strategy: 'random' or 'LSE'
        """
        self.d = d
        self.strategy = strategy
        self.delta = delta
        self.tau = tau
        self.sigma_tr = sigma_tr
        self.context_history = np.empty((self.d, 0))  # contexts stored column-wise, d x n
        self.action_history = np.empty((0,), dtype=float)
        self.reward_history = np.empty((0,), dtype=float)
        self.cost_history = np.empty((0,), dtype=float)
        self.ell = 0.1  # value of \lambda for the LSE estimator
        # Action returned by the latest choose_action call, not yet logged.
        self._pending_action = None

    def get_reward_and_cost(self, reward, cost):
        """Store one reward/cost pair and the action that produced it.

        The pending action from ``choose_action`` is appended only when the
        action history is shorter than the reward history, so an action that
        was already logged (e.g. by ``initial_dose``) is never logged twice.
        Feedback should only be reported for non-zero actions, because the
        estimators divide by the logged actions.
        """
        self.reward_history = np.append(self.reward_history, reward)
        self.cost_history = np.append(self.cost_history, cost)
        pending = getattr(self, "_pending_action", None)
        if pending is not None and len(self.action_history) < len(self.reward_history):
            self.action_history = np.append(self.action_history, pending)
        self._pending_action = None

    def get_context(self, context):
        """Append ``context`` as a new column of the context history."""
        self.context_history = np.hstack((self.context_history, context.reshape((self.d, 1))))

    def initial_dose(self, context):
        """Return the first-round dose and log it together with ``context``."""
        # +1 comes from the fact that in this implementation norm(X_t,mu,theta) = 1
        # NOTE(review): `scale=sigma_tr` already scales the quantile, so the
        # extra `* sigma_tr` looks like a double scaling; kept because the
        # same expression is used elsewhere in the file.
        margin = stats.norm.ppf(1-self.delta, loc=0, scale=self.sigma_tr)*self.sigma_tr
        dose = min(1, self.tau/(margin+1))
        self.action_history = np.append(self.action_history, dose)
        self.context_history = np.hstack((self.context_history, context.reshape((self.d, 1))))
        return dose

    def compute_beta_t(self, t):
        """Return the confidence radius after ``t`` rounds with feedback."""
        b_t = self.sigma_tr*np.sqrt(self.d*np.log((1+t/self.ell)/self.delta)) + np.sqrt(self.ell)
        return b_t

    def _ridge_estimate(self, targets):
        """Regress ``targets / action`` on the stored contexts with ridge ``ell``.

        Returns the (d, 1) estimate and the (d, d) regularised Gram matrix.
        """
        X = self.context_history.T  # n x d
        n, d = X.shape
        if len(targets) != n or len(self.action_history) != n:
            # Without this check a length-1 action history would silently
            # broadcast and every signal would be divided by the first action.
            raise ValueError(
                "context, action and feedback histories are misaligned: "
                f"{n} contexts, {len(self.action_history)} actions, {len(targets)} signals"
            )
        V_t = X.T @ X + self.ell * np.eye(d)
        Y = (targets / self.action_history).reshape(-1, 1)
        # Solve the normal equations instead of forming an explicit inverse.
        estimate = np.linalg.solve(V_t, X.T @ Y)
        return estimate, V_t

    def compute_hat_theta(self):
        """Return the ridge estimate of the reward vector and its Gram matrix."""
        return self._ridge_estimate(self.reward_history)

    def compute_hat_mu(self):
        """Return the ridge estimate of the cost vector and its Gram matrix."""
        return self._ridge_estimate(self.cost_history)

    def choose_action(self, context, strategy=None):
        """Return the dosage for ``context``.

        context: current context vector of length d.
        strategy: optional per-call override; defaults to ``self.strategy``.

        The chosen action is remembered and logged by the next
        ``get_reward_and_cost`` call.  Nothing is written to the histories
        here, so the caller's ``get_context`` call is the only place a context
        is stored and calling this method twice cannot misalign them.
        """
        active = self.strategy if strategy is None else strategy
        if active == 'random':
            action = np.random.rand()  # Random action between 0 and 1
        elif active == 'LSE':
            # 1st step: largest feasible dosage under the pessimistic cost estimate
            t = self.context_history.shape[-1]  # rounds where we received feedback
            b_t = self.compute_beta_t(t)
            hat_mu, V_t_mu = self.compute_hat_mu()
            max_dosage = compute_feasible_set_mu(b_t, V_t_mu, hat_mu, self.delta, self.sigma_tr, self.tau, context)
            # 2nd step: dose only if the optimistic reward is non-negative
            hat_theta, V_t_theta = self.compute_hat_theta()
            optimal_dosage = compute_theta(b_t, V_t_theta, hat_theta, self.sigma_tr, context)
            if max_dosage is None or optimal_dosage is None:
                # A helper reported failure by returning None: give no dose
                # rather than crash on `None * number`.
                action = 0
            else:
                # if optimal dosage = 1 we give the max dosage permitted, else no dosage
                action = max_dosage*optimal_dosage
        else:
            action = 0  # Default action
        self._pending_action = action
        return action

class BanditGame:
    """Runs the interaction between an environment and a learner and tracks regret."""

    def __init__(self, d, rounds, strategy='random', delta = 0.01, tau = 1, sigma_tr = 1):
        """Create the environment and the learner for one game.

        d: number of dimensions
        rounds: number of rounds played after the initial round
        strategy: strategy used to choose the action ('random' or 'LSE',
            the least-squares estimation strategy)
        delta: confidence parameter
        tau: toxicity threshold
        sigma_tr: standard deviation of the noise
        """
        self.d = d
        self.delta = delta
        self.tau = tau
        self.sigma_tr = sigma_tr
        self.rounds = rounds
        # regime: linear contextual bandit environment with a constrained action set
        # learner: the algorithm under test
        # cumulative_regret: running regret, one entry per round played
        self.regime = highProbConstRegime(d, delta, tau, sigma_tr)
        self.learner = Learner(d, delta, tau, sigma_tr, strategy)
        self.cumulative_regret = np.empty((0,), dtype=float)

    def plot_regret(self):
        """Plot the cumulative regret recorded so far."""
        plt.plot(self.cumulative_regret)
        plt.xlabel('Rounds')
        plt.ylabel('Cumulative Regret')
        plt.show()

    def _instant_regret(self, action, context):
        """Return the regret of ``action`` against the optimal action for ``context``."""
        optimal_action = self.regime.compute_optimal_action()
        return (optimal_action - action)*np.dot(self.regime.theta, context)

    def play(self):
        """Play the initial round plus ``self.rounds`` rounds.

        Returns ``self.cumulative_regret`` after appending ``rounds + 1`` new
        entries, the running sum of the per-round regrets of this call.
        """
        # Per-round regrets are collected in a list and accumulated once at the
        # end; appending to a NumPy array every round copies the whole array.
        regrets = []

        # first round
        context = self.regime.get_context()
        action = self.learner.initial_dose(context)
        reward, cost = self.regime.get_reward_and_cost(action)
        regrets.append(self._instant_regret(action, context))
        # initial_dose has already logged the context and the action, so only
        # the feedback is stored here
        self.learner.get_reward_and_cost(reward, cost)

        for _ in range(self.rounds):
            context = self.regime.get_context()
            # Ask the learner exactly once per round.  The strategy is passed
            # explicitly because choose_action declares it as a parameter.
            action = self.learner.choose_action(context, self.learner.strategy)
            reward, cost = self.regime.get_reward_and_cost(action)
            regrets.append(self._instant_regret(action, context))
            # store the context, the reward and the cost only when the action is non-zero
            if action != 0:
                self.learner.get_context(context)
                self.learner.get_reward_and_cost(reward, cost)

        self.cumulative_regret = np.append(self.cumulative_regret, np.cumsum(regrets))
        return self.cumulative_regret

# Experiment: for every (dimension, threshold) pair, play several independent
# LSE games and plot the mean cumulative regret with a +/- 1 standard
# deviation band.
number_of_rounds = 100000
num_runs = 10

for dimensions in [5, 10]:
  for taf in [0.2, 0.5, 0.6, 0.8]:
    # One row per run; each game returns number_of_rounds + 1 regret values
    # (the initial round plus number_of_rounds played rounds).
    results = np.zeros((num_runs, number_of_rounds+1))

    for i in range(num_runs):
      game = BanditGame(dimensions, rounds=number_of_rounds, strategy='LSE', delta = 0.01, tau = taf, sigma_tr = 1)
      results[i] = game.play()

    # Mean and spread across runs at every round
    mean_time_series = np.mean(results, axis=0)
    variance_time_series = np.var(results, axis=0)
    std_time_series = np.sqrt(variance_time_series)

    # Plot the mean time series
    plt.figure(figsize=(10, 6))
    plt.plot(mean_time_series, label='Mean Time Series', color='blue')

    # The shaded band is one standard deviation either side of the mean, so it
    # is labelled as such rather than as the variance.
    plt.fill_between(range(number_of_rounds+1),
                    mean_time_series - std_time_series,
                    mean_time_series + std_time_series,
                    color='blue', alpha=0.2, label='+/- 1 std')

    plt.xlabel('Time')
    plt.ylabel('Value')
    # Name the configuration so the figures can be told apart.
    plt.title(f'Mean Time Series +/- 1 std (d={dimensions}, tau={taf})')
    plt.legend()
    plt.show()