import joblib
from joblib import Parallel, delayed
import numpy as np
import time


# Bandit environments and simulator

class BerBandit(object):
  """Multi-armed bandit whose arms pay Bernoulli (0/1) rewards.

  Attributes:
    mu: array with the mean of each arm (a private copy of the argument).
    K: number of arms.
    best_arm: index of the arm with the largest mean.
    rt: rewards of all arms realized by the latest randomize() call.
  """

  def __init__(self, mu):
    # keep a private copy so later changes by the caller have no effect
    self.mu = np.array(mu)
    self.K = self.mu.size
    self.best_arm = np.argmax(self.mu)

    # realize the rewards of the first round
    self.randomize()

  def randomize(self):
    """Realize the rewards of all arms for a new round."""
    # A single uniform draw is compared against every arm mean, so the
    # realized rewards of all arms in a round share the same randomness.
    u = np.random.rand()
    self.rt = (u < self.mu).astype(float)

  def reward(self, arm):
    """Return the realized reward of the arm in the current round."""
    return self.rt[arm]

  def regret(self, arm):
    """Return the realized regret of the arm in the current round."""
    best, chosen = self.rt[self.best_arm], self.rt[arm]
    return best - chosen

  def pregret(self, arm):
    """Return the expected regret of the arm (gap between means)."""
    best, chosen = self.mu[self.best_arm], self.mu[arm]
    return best - chosen

  def print(self):
    """Return a one-line description listing the arm means."""
    arms = ", ".join(["%.3f" % mean for mean in self.mu])
    return "Bernoulli bandit with arms (%s)" % arms


class BetaBandit(object):
  """Multi-armed bandit whose arm rewards are Beta distributed.

  Attributes:
    alpha, beta: arrays with the Beta parameters of each arm (private
      copies of the arguments).
    mu: array with the mean of each arm, alpha / (alpha + beta).
    K: number of arms.
    best_arm: index of the arm with the largest mean.
    rt: rewards of all arms realized by the latest randomize() call.
  """

  def __init__(self, alpha, beta):
    # keep private copies so later changes by the caller have no effect
    self.alpha = np.array(alpha)
    self.beta = np.array(beta)

    # mean of a Beta(alpha, beta) random variable
    self.mu = self.alpha / (self.alpha + self.beta)
    self.K = self.mu.size
    self.best_arm = np.argmax(self.mu)

    # realize the rewards of the first round
    self.randomize()

  def randomize(self):
    """Realize the rewards of all arms for a new round."""
    draws = np.random.beta(self.alpha, self.beta)
    self.rt = draws

  def reward(self, arm):
    """Return the realized reward of the arm in the current round."""
    return self.rt[arm]

  def regret(self, arm):
    """Return the realized regret of the arm in the current round."""
    best, chosen = self.rt[self.best_arm], self.rt[arm]
    return best - chosen

  def pregret(self, arm):
    """Return the expected regret of the arm (gap between means)."""
    best, chosen = self.mu[self.best_arm], self.mu[arm]
    return best - chosen

  def print(self):
    """Return a one-line description listing the arm means."""
    arms = ", ".join(["%.3f" % mean for mean in self.mu])
    return "Beta bandit with arms (%s)" % arms


class GaussBandit(object):
  """Multi-armed bandit whose arm rewards are Gaussian.

  Attributes:
    mu: array with the mean of each arm (a private copy of the argument).
    sigma: scale that multiplies the standard normal reward noise.
    K: number of arms.
    best_arm: index of the arm with the largest mean.
    rt: rewards of all arms realized by the latest randomize() call.
  """

  def __init__(self, mu, sigma=0.5):
    # keep a private copy so later changes by the caller have no effect
    self.mu = np.array(mu)
    self.K = self.mu.size
    self.sigma = sigma
    self.best_arm = np.argmax(self.mu)

    # realize the rewards of the first round
    self.randomize()

  def randomize(self):
    """Realize the rewards of all arms for a new round."""
    # one standard normal draw per arm, scaled by sigma
    noise = np.random.randn(self.K)
    self.rt = self.mu + self.sigma * noise

  def reward(self, arm):
    """Return the realized reward of the arm in the current round."""
    return self.rt[arm]

  def regret(self, arm):
    """Return the realized regret of the arm in the current round."""
    best, chosen = self.rt[self.best_arm], self.rt[arm]
    return best - chosen

  def pregret(self, arm):
    """Return the expected regret of the arm (gap between means)."""
    best, chosen = self.mu[self.best_arm], self.mu[arm]
    return best - chosen

  def print(self):
    """Return a one-line description listing the arm means."""
    arms = ", ".join(["%.3f" % mean for mean in self.mu])
    return "Gaussian bandit with arms (%s)" % arms


def evaluate_one(Alg, params, env, n, period_size=1):
  """One run of a bandit algorithm.

  Args:
    Alg: algorithm class. It is constructed as Alg(env, n, params) and
      must provide get_arm(t) and update(t, arm, reward).
    params: algorithm parameters, passed through to the constructor.
    env: environment providing randomize(), reward(arm) and regret(arm).
    n: number of rounds to simulate.
    period_size: number of consecutive rounds whose regret is summed
      into one entry of the returned array.

  Returns:
    A pair (regret, alg). regret is an array with n // period_size
    entries, where entry i is the regret summed over period i. When n is
    not a multiple of period_size, the trailing rounds are added to the
    last entry, so regret.sum() is still the total regret of the run.
    When n < period_size, the array is empty. alg is the algorithm
    instance after the run.
  """
  alg = Alg(env, n, params)

  num_periods = n // period_size
  regret = np.zeros(num_periods)
  for t in range(n):
    # generate state
    env.randomize()

    # take action
    arm = alg.get_arm(t)

    # update model and regret
    alg.update(t, arm, env.reward(arm))
    regret_at_t = env.regret(arm)
    if num_periods:
      # Rounds past the last full period would index one past the end
      # of the array, so they are folded into the last full period.
      period = min(t // period_size, num_periods - 1)
      regret[period] += regret_at_t

  return regret, alg


def evaluate(Alg, params, env, n=1000, period_size=1, printout=True):
  """Multiple runs of a bandit algorithm, one per environment.

  Args:
    Alg: algorithm class; Alg.print() supplies the name that is reported.
    params: algorithm parameters, forwarded to evaluate_one.
    env: indexable collection of environments, one per run.
    n: number of rounds in each run.
    period_size: number of rounds aggregated into one regret entry.
    printout: when true, print the running time and a regret summary.

  Returns:
    A pair (regret, alg). regret is an array of shape
    (n // period_size, len(env)) whose column ex is the regret of run ex.
    alg is the list of algorithm instances returned by the runs.
  """
  if printout:
    print("Evaluating %s" % Alg.print(), end="")
  start = time.time()

  num_exps = len(env)
  regret = np.zeros((n // period_size, num_exps))

  # the runs are dispatched through joblib with n_jobs=-1
  runs = Parallel(n_jobs=-1)(
    delayed(evaluate_one)(Alg, params, env[ex], n, period_size)
    for ex in range(num_exps))

  # unpack the (regret, algorithm) pair of each run
  alg = []
  for ex in range(num_exps):
    run_regret, run_alg = runs[ex][0], runs[ex][1]
    regret[:, ex] = run_regret
    alg.append(run_alg)

  if printout:
    print(" %.1f seconds" % (time.time() - start))

    # summary statistics of the total regret of each run
    total_regret = regret.sum(axis=0)
    std_err = total_regret.std() / np.sqrt(num_exps)
    print("Regret: %.2f +/- %.2f (median: %.2f, max: %.2f, min: %.2f)" %
      (total_regret.mean(), std_err,
      np.median(total_regret), total_regret.max(), total_regret.min()))

  return regret, alg


# Bandit algorithms

class UCB1:
  """UCB1 index policy.

  After pulling each arm once, it pulls the arm that maximizes the
  empirical mean plus the confidence width sqrt(2 log(t) / pulls).

  Constructor arguments:
    env: environment; only env.K (the number of arms) is read.
    n: not used by this class.
    params: dict whose entries are set as attributes of the instance;
      numpy arrays are copied.
  """

  def __init__(self, env, n, params):
    self.K = env.K

    # expose all parameters as attributes, copying arrays
    for name, value in params.items():
      is_array = isinstance(value, np.ndarray)
      setattr(self, name, np.copy(value) if is_array else value)

    self.pulls = np.zeros(self.K)  # number of pulls
    self.reward = np.zeros(self.K)  # cumulative reward
    # small random offsets that break ties between equal indices
    self.tiebreak = 1e-6 * np.random.rand(self.K)

  def update(self, t, arm, r):
    """Record reward r observed after pulling the arm."""
    self.reward[arm] += r
    self.pulls[arm] += 1

  def get_arm(self, t):
    """Return the arm to pull in round t (rounds are counted from zero)."""
    if t < self.K:
      # initialization: round t pulls arm t
      index = np.zeros(self.K)
      index[t] = 1
      self.ucb = index
      return np.argmax(self.ucb)

    round_no = t + 1  # time starts at one
    mean = self.reward / self.pulls
    width = np.sqrt(2 * np.log(round_no) / self.pulls)
    self.ucb = mean + width + self.tiebreak
    return np.argmax(self.ucb)

  @staticmethod
  def print():
    """Return the name of the algorithm."""
    return "UCB1"


class UCB1Tuned:
  """UCB1-Tuned index policy.

  After pulling each arm once, it pulls the arm that maximizes the
  empirical mean plus sqrt(min(1/4, V) log(t) / pulls), where V is the
  empirical variance of the arm plus sqrt(2 log(t) / pulls).

  Constructor arguments:
    env: environment; only env.K (the number of arms) is read.
    n: not used by this class.
    params: dict whose entries are set as attributes of the instance;
      numpy arrays are copied.
  """

  def __init__(self, env, n, params):
    self.K = env.K

    # expose all parameters as attributes, copying arrays
    for name, value in params.items():
      is_array = isinstance(value, np.ndarray)
      setattr(self, name, np.copy(value) if is_array else value)

    self.pulls = np.zeros(self.K)  # number of pulls
    self.reward = np.zeros(self.K)  # cumulative reward
    self.reward2 = np.zeros(self.K)  # cumulative squared reward
    # small random offsets that break ties between equal indices
    self.tiebreak = 1e-6 * np.random.rand(self.K)

  def update(self, t, arm, r):
    """Record reward r observed after pulling the arm."""
    self.reward2[arm] += r * r
    self.reward[arm] += r
    self.pulls[arm] += 1

  def get_arm(self, t):
    """Return the arm to pull in round t (rounds are counted from zero)."""
    if t < self.K:
      # initialization: round t pulls arm t
      index = np.zeros(self.K)
      index[t] = 1
      self.ucb = index
      return np.argmax(self.ucb)

    round_no = t + 1  # time starts at one
    log_t = np.log(round_no)
    mean = self.reward / self.pulls
    second_moment = self.reward2 / self.pulls

    # upper bound on the variance of each arm
    V = second_moment - np.square(mean) + np.sqrt(2 * log_t / self.pulls)
    # the variance term is capped at 1/4
    width = np.sqrt(np.minimum(0.25, V) * log_t / self.pulls)
    self.ucb = mean + width + self.tiebreak
    return np.argmax(self.ucb)

  @staticmethod
  def print():
    """Return the name of the algorithm."""
    return "UCB1-Tuned"


class UCBV:
  """UCB-V index policy, which uses the empirical variance of each arm.

  After pulling each arm once, it pulls the arm that maximizes
    muhat + sqrt(2 varhat log(3 / delta) / pulls) + 3 log(3 / delta) / pulls
  where muhat and varhat are the empirical mean and variance of the arm.

  Constructor arguments:
    env: environment; only env.K (the number of arms) is read.
    n: horizon, stored as self.n.
    params: dict whose entries are set as attributes of the instance;
      numpy arrays are copied. A non-zero "fixed_delta" entry is used as
      delta in all rounds; otherwise delta = 1 / t^3 in round t.
  """

  def __init__(self, env, n, params):
    self.K = env.K
    self.n = n
    self.fixed_delta = 0.0  # zero means that delta depends on the round

    # expose all parameters as attributes, copying arrays
    for name, value in params.items():
      is_array = isinstance(value, np.ndarray)
      setattr(self, name, np.copy(value) if is_array else value)

    self.pulls = np.zeros(self.K)  # number of pulls
    self.reward = np.zeros(self.K)  # cumulative reward
    self.reward2 = np.zeros(self.K)  # cumulative squared reward
    # small random offsets that break ties between equal indices
    self.tiebreak = 1e-6 * np.random.rand(self.K)

  def update(self, t, arm, r):
    """Record reward r observed after pulling the arm."""
    self.reward2[arm] += r * r
    self.reward[arm] += r
    self.pulls[arm] += 1

  def get_arm(self, t):
    """Return the arm to pull in round t (rounds are counted from zero)."""
    if t < self.K:
      # initialization: round t pulls arm t
      index = np.zeros(self.K)
      index[t] = 1
      self.ucb = index
      return np.argmax(self.ucb)

    round_no = t + 1  # time starts at one

    # from \sum_{t = 1}^n \sum_{s = 1}^t (1.0 / t^3) <= \pi^2 / 6
    delta = self.fixed_delta if self.fixed_delta else 1.0 / np.power(round_no, 3)
    log_term = np.log(3 / delta)

    muhat = self.reward / self.pulls
    # empirical variance, clipped at zero to absorb rounding errors
    varhat = np.maximum(
      (self.reward2 - self.pulls * np.square(muhat)) / self.pulls, 0)

    variance_width = np.sqrt(2 * varhat * log_term / self.pulls)
    range_width = 3 * log_term / self.pulls
    self.ucb = muhat + variance_width + range_width + self.tiebreak
    return np.argmax(self.ucb)

  @staticmethod
  def print():
    """Return the name of the algorithm."""
    return "UCB-V"


class TS:
  """Thompson sampling with a Beta posterior for each arm.

  Constructor arguments:
    env: environment; only env.K (the number of arms) is read.
    n: not used by this class.
    params: dict whose entries are set as attributes of the instance;
      numpy arrays are copied. Entries may override the defaults below.

  Attributes:
    alpha: per-arm count of positive observations plus one.
    beta: per-arm count of negative observations plus one.
    epsilon: per-arm probability of using a posterior sample in a round;
      otherwise the posterior mean is used. The default of 1.0 always
      samples.
  """

  def __init__(self, env, n, params):
    self.K = env.K
    self.epsilon = 1.0

    self.alpha = np.ones(self.K)  # positive observations
    self.beta = np.ones(self.K)  # negative observations

    # parameters are applied last, so they can override the defaults
    for attr, val in params.items():
      if isinstance(val, np.ndarray):
        setattr(self, attr, np.copy(val))
      else:
        setattr(self, attr, val)

  def update(self, t, arm, r):
    """Record reward r observed after pulling the arm.

    The reward is clipped to [0, 1]. A reward strictly between 0 and 1
    is replaced by a Bernoulli draw with that mean, so that the
    posterior stays Beta.
    """
    r = min(max(r, 0), 1)
    if 0 < r < 1:
      # float() handles both the built-in bool produced when r is a
      # Python float and the numpy bool produced when r is a numpy scalar
      r = float(np.random.rand() < r)
    self.alpha[arm] += r
    self.beta[arm] += 1 - r

  def get_arm(self, t):
    """Return the arm to pull in round t."""
    # posterior sampling
    self.mu = np.random.beta(self.alpha, self.beta)

    # e-TS from "Thompson Sampling with Less Exploration is Fast and Optimal"
    use_mean = np.random.rand(self.K) > self.epsilon
    self.mu[use_mean] = self.alpha[use_mean] / (self.alpha[use_mean] + self.beta[use_mean])

    arm = np.argmax(self.mu)
    return arm

  @staticmethod
  def print():
    """Return the name of the algorithm."""
    return "TS"


class GaussTS:
  """Thompson sampling with a Gaussian posterior for each arm mean.

  Constructor arguments:
    env: environment; only env.K (the number of arms) is read.
    n: not used by this class.
    params: dict whose entries are set as attributes of the instance;
      numpy arrays are copied. Entries may override the defaults below.

  Attributes:
    mu0, sigma0: per-arm mean and standard deviation of the prior.
    sigma: per-arm standard deviation of the reward noise.
    epsilon: per-arm probability of using a posterior sample in a round;
      otherwise the posterior mean is used. The default of 1.0 always
      samples.
  """

  def __init__(self, env, n, params):
    self.K = env.K
    self.mu0 = np.zeros(self.K)
    self.sigma0 = np.ones(self.K)
    self.sigma = np.ones(self.K)
    self.epsilon = 1.0

    # parameters can override the defaults above, but not the statistics
    for name, value in params.items():
      is_array = isinstance(value, np.ndarray)
      setattr(self, name, np.copy(value) if is_array else value)

    self.pulls = np.zeros(self.K)  # number of pulls
    self.reward = np.zeros(self.K)  # cumulative reward

  def update(self, t, arm, r):
    """Record reward r observed after pulling the arm."""
    self.reward[arm] += r
    self.pulls[arm] += 1

  def get_arm(self, t):
    """Return the arm to pull in round t."""
    noise_var = np.square(self.sigma)
    prior_var = np.square(self.sigma0)

    # posterior of the arm means: precisions add up
    post_var = 1.0 / (1.0 / prior_var + self.pulls / noise_var)
    post_mean = post_var * (self.mu0 / prior_var + self.reward / noise_var)

    # one posterior sample per arm
    z = np.random.randn(self.K)
    self.mu = post_mean + np.sqrt(post_var) * z

    # e-TS from "Thompson Sampling with Less Exploration is Fast and Optimal"
    use_mean = np.random.rand(self.K) > self.epsilon
    self.mu[use_mean] = post_mean[use_mean]

    return np.argmax(self.mu)

  @staticmethod
  def print():
    """Return the name of the algorithm."""
    return "Gaussian TS"


class VarTS:
  """Thompson sampling for Gaussian rewards with unknown mean and variance.

  The mean and precision of each arm are sampled from a posterior with
  prior parameters mu0, kappa0, alpha0 and beta0: the precision is a
  Gamma sample and the mean is a Gaussian sample given that precision.

  Constructor arguments:
    env: environment; only env.K (the number of arms) is read.
    n: not used by this class.
    params: dict whose entries are set as attributes of the instance;
      numpy arrays are copied. Entries may override the prior parameters.
  """

  def __init__(self, env, n, params):
    self.K = env.K

    # default prior parameters of each arm
    self.mu0 = np.zeros(self.K)
    self.kappa0 = np.ones(self.K)
    self.alpha0 = np.ones(self.K)
    self.beta0 = np.ones(self.K)

    # parameters can override the prior above, but not the statistics
    for name, value in params.items():
      is_array = isinstance(value, np.ndarray)
      setattr(self, name, np.copy(value) if is_array else value)

    self.pulls = np.zeros(self.K)  # number of pulls
    self.reward = np.zeros(self.K)  # cumulative reward
    self.reward2 = np.zeros(self.K)  # cumulative squared reward

  def update(self, t, arm, r):
    """Record reward r observed after pulling the arm."""
    self.reward2[arm] += r * r
    self.reward[arm] += r
    self.pulls[arm] += 1

  def get_arm(self, t):
    """Return the arm to pull in round t."""
    count = self.pulls

    # posterior parameters
    post_mean = (self.kappa0 * self.mu0 + self.reward) / (self.kappa0 + count)
    kappa = np.maximum(self.kappa0 + count, 1e-3)
    alpha = self.alpha0 + count / 2
    # empirical mean; the denominator is at least one to avoid 0 / 0
    xbar = self.reward / np.maximum(count, 1)
    # sum of squared deviations from the empirical mean
    scatter = self.reward2 - 2 * self.reward * xbar + count * np.square(xbar)
    # penalty for the gap between the empirical and prior means
    prior_gap = self.kappa0 * count * np.square(xbar - self.mu0) / (2 * kappa)
    beta = self.beta0 + 0.5 * scatter + prior_gap

    # sample the precision first and then the mean given the precision
    Lambda = np.maximum(np.random.gamma(alpha, 1.0 / beta, size=self.K), 1e-3)
    z = np.random.randn(self.K)
    self.mu = post_mean + z / np.sqrt(kappa * Lambda)

    return np.argmax(self.mu)

  @staticmethod
  def print():
    """Return the name of the algorithm."""
    return "VarTS"


class HTTS:
  """Thompson sampling with Student-t posterior samples.

  Each arm is first pulled four times in round-robin order. After that,
  the mean of each arm is sampled as its empirical mean plus a scaled
  Student-t variable with pulls + 2 * alpha - 1 degrees of freedom.

  Constructor arguments:
    env: environment; only env.K (the number of arms) is read.
    n: not used by this class.
    params: dict whose entries are set as attributes of the instance;
      numpy arrays are copied. An "alpha" entry overrides the default
      per-arm prior parameter of -1.
  """

  def __init__(self, env, n, params):
    self.K = env.K
    self.alpha = - np.ones(self.K)

    # parameters can override alpha above, but not the statistics
    for name, value in params.items():
      is_array = isinstance(value, np.ndarray)
      setattr(self, name, np.copy(value) if is_array else value)

    self.pulls = np.zeros(self.K)  # number of pulls
    self.reward = np.zeros(self.K)  # cumulative reward
    self.reward2 = np.zeros(self.K)  # cumulative squared reward

  def update(self, t, arm, r):
    """Record reward r observed after pulling the arm."""
    self.reward2[arm] += r * r
    self.reward[arm] += r
    self.pulls[arm] += 1

  def get_arm(self, t):
    """Return the arm to pull in round t (rounds are counted from zero)."""
    if t < 4 * self.K:
      # initialization: cycle through the arms, four pulls each
      forced = np.zeros(self.K)
      forced[t % self.K] = 1
      self.mu = forced
      return np.argmax(self.mu)

    xbar = self.reward / self.pulls
    # sum of squared deviations, clipped at zero to absorb rounding errors
    S = np.maximum(self.reward2 - self.pulls * np.square(xbar), 0)
    df = self.pulls + 2 * self.alpha - 1

    # posterior sampling
    t_draw = np.random.standard_t(df)
    self.mu = xbar + np.sqrt(S / (self.pulls * df)) * t_draw
    return np.argmax(self.mu)

  @staticmethod
  def print():
    """Return the name of the algorithm."""
    return "Honda & Takemura TS"
