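# UCB vs. reproducible batched elimination on a Bernoulli bandit:
# compares average regret and how much the pull sequences differ between runs.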

from scipy.stats import bernoulli
import math
import random
from numpy.random import Generator, PCG64
import matplotlib.pyplot as plt
#from google.colab import files



# Seed for the Bernoulli reward draws. This numpy Generator is separate from
# Python's `random` module, which is seeded on its own elsewhere in the file.
seedBernoulli = 123456
bernoulli.random_state = numpy_randomGen = Generator(PCG64(seedBernoulli))

# Bandit instance: Bernoulli arms with the success probabilities below,
# played for T rounds, with the whole experiment repeated numExecutions times.
trueMean = [0.44, 0.47, 0.5, 0.53, 0.56, 0.59]
numArms = 6
T = 45000
numExecutions = 20
bestArmValue = max(trueMean)
# Confidence parameter used inside the UCB exploration bonus.
delta = 1.0 / (numArms * T) ** 2
print(trueMean)

# UCB bookkeeping shared across executions:
#   regretUCBExec  - one cumulative-regret curve per execution
#   diffCounterUCB - per pair of executions, number of rounds pulling different arms
#   oldSeqPullsUCB - pull sequence of the previous even-numbered execution
regretUCBExec, diffCounterUCB = [], []
oldSeqPullsUCB = None


# UCB: run numExecutions executions of T rounds each on the same instance.
#
# The numerator of the exploration bonus sqrt(2*log(1/delta)/n) is identical
# in every round, so it is computed once here.
ucbLogTerm = 2*math.log(1/delta)

# The loop variable is deliberately not called `exec`, which would shadow the
# builtin of the same name for the rest of the module.
for execIdx in range(numExecutions):
  # Per-arm running statistics for this execution.
  totalRewards = numArms*[0]
  numPulls = numArms*[0]
  ucbValue = numArms*[0]
  seqPulls = []        # arm pulled in each round, in order

  # Regret is measured against the true means (bestArmValue - trueMean[arm]),
  # not against the sampled rewards.
  regretUCB = []       # cumulative regret after each round
  totalRegretUCB = 0

  # Initialisation: pull every arm once so each index is well defined.
  for arm in range(numArms):
    totalRewards[arm] += bernoulli.rvs(trueMean[arm], size=1)[0]
    numPulls[arm] += 1
    totalRegretUCB += bestArmValue - trueMean[arm]
    regretUCB.append(totalRegretUCB)
    seqPulls.append(arm)

  for arm in range(numArms):
    ucbValue[arm] = float(totalRewards[arm])/numPulls[arm] + math.sqrt(ucbLogTerm/numPulls[arm])

  # Main phase: always pull the arm with the largest index (ties go to the
  # lowest arm number, as list.index returns the first match). Only the pulled
  # arm's index changes, so only that entry is refreshed.
  remRounds = T - numArms
  for _ in range(remRounds):
    currArm = ucbValue.index(max(ucbValue))
    currReward = bernoulli.rvs(trueMean[currArm], size=1)[0]
    totalRewards[currArm] += currReward
    numPulls[currArm] += 1
    ucbValue[currArm] = float(totalRewards[currArm])/numPulls[currArm] + math.sqrt(ucbLogTerm/numPulls[currArm])
    seqPulls.append(currArm)
    totalRegretUCB += bestArmValue - trueMean[currArm]
    regretUCB.append(totalRegretUCB)

  regretUCBExec.append(regretUCB)

  # Executions are compared in consecutive pairs (0 vs 1, 2 vs 3, ...): the
  # even one stores its pull sequence, the odd one counts the rounds in which
  # it pulled a different arm.
  if execIdx % 2 == 0:
    oldSeqPullsUCB = seqPulls.copy()
  else:
    numDifferent = sum(
      1 for currPull, prevPull in zip(seqPulls, oldSeqPullsUCB) if currPull != prevPull
    )
    diffCounterUCB.append(numDifferent)


# Round-by-round mean of the cumulative UCB regret over all executions.
# The running total is accumulated explicitly, in execution order.
avgRegretUCB = []
for roundIdx in range(T):
  regretSum = 0
  for runRegret in regretUCBExec:
    regretSum += runRegret[roundIdx]
  avgRegretUCB.append(float(regretSum)/numExecutions)


# Bookkeeping for the reproducible batched-elimination runs:
#   regretReprExec  - one cumulative-regret curve per execution
#   diffCounterRepr - per pair of executions, number of rounds pulling different arms
#   oldSeqPullsRepr - pull sequence of the previous even-numbered execution
regretReprExec, diffCounterRepr = [], []
oldSeqPullsRepr = None

# Rebuild the Bernoulli reward generator from the seed value 123456 so this
# algorithm starts from a freshly seeded reward stream.
seedBernoulli = 123456
bernoulli.random_state = numpy_randomGen = Generator(PCG64(seedBernoulli))


# Reproducible batched elimination: run numExecutions executions of T rounds.
#
# The parameters below do not change between executions, so they are set once.
seedRandom = 1234567   # seed for the algorithm's internal randomness (thresholds)
q = 2                  # batch sizes grow geometrically by this factor
rho = 0.3
# log2 is used rather than log(T, 2): it is usually more accurate, so the floor
# is not thrown off by rounding when T is close to a power of two.
batches = math.floor(math.log2(T))
beta = math.ceil(numArms**2/rho**2)
# Shared numerator of both confidence radii computed after each batch.
confLogTerm = 2*math.log(2*numArms*T*batches)

# The loop variable is deliberately not called `exec`, which would shadow the
# builtin of the same name for the rest of the module.
for execIdx in range(numExecutions):
  # Re-seeding Python's `random` at the start of every execution makes each
  # execution draw the same sequence of elimination thresholds; only the
  # Bernoulli rewards differ between executions.
  random.seed(seedRandom)
  totalRewards = numArms*[0]
  seqPulls = []        # arm pulled in each round, in order

  pulls = 0            # per-arm pulls so far, in units of beta
  activeArms = {arm for arm in range(numArms)}
  remRounds = T
  regretRepr = []      # cumulative regret after each round
  totalRegretRepr = 0

  for i in range(batches):
    batchPulls = beta*q**(i+1)
    # Stop batching once a full batch over all active arms no longer fits.
    if batchPulls*len(activeArms) > remRounds:
      break

    # Pull every active arm batchPulls times, one reward draw per round.
    for arm in activeArms:
      armGap = bestArmValue - trueMean[arm]
      for _ in range(batchPulls):
        totalRewards[arm] += bernoulli.rvs(trueMean[arm], size=1)[0]
        seqPulls.append(arm)
        totalRegretRepr += armGap
        regretRepr.append(totalRegretRepr)
    pulls += q**(i+1)
    pullsBeta = beta*pulls

    uBeta = math.sqrt(confLogTerm/pullsBeta)
    u = math.sqrt(confLogTerm/pulls)
    # Randomised elimination threshold, drawn uniformly from [u/2, u].
    threshold = random.uniform(u/2, u)

    # Mark arms whose empirical mean plus uBeta falls below the best
    # empirical mean minus the threshold.
    maxReward = max(totalRewards)
    deleteArms = [
      arm for arm in activeArms
      if float(totalRewards[arm])/pullsBeta + uBeta < float(maxReward)/pullsBeta - threshold
    ]

    # Charge this batch's rounds before shrinking the active set, since the
    # arms being removed were still pulled during the batch.
    remRounds -= len(activeArms)*batchPulls
    for arm in deleteArms:
      activeArms.remove(arm)

  # Commit phase: play the arm with the largest total reward for all
  # remaining rounds.
  bestEstimatedArm = totalRewards.index(max(totalRewards))
  seqPulls += [bestEstimatedArm]*remRounds
  commitGap = bestArmValue - trueMean[bestEstimatedArm]
  for _ in range(remRounds):
    totalRegretRepr += commitGap
    regretRepr.append(totalRegretRepr)

  regretReprExec.append(regretRepr)

  # Executions are compared in consecutive pairs (0 vs 1, 2 vs 3, ...): the
  # even one stores its pull sequence, the odd one counts the rounds in which
  # it pulled a different arm.
  if execIdx % 2 == 0:
    oldSeqPullsRepr = seqPulls.copy()
  else:
    numDifferent = sum(
      1 for currPull, prevPull in zip(seqPulls, oldSeqPullsRepr) if currPull != prevPull
    )
    diffCounterRepr.append(numDifferent)
  


# Round-by-round mean of the cumulative regret of the reproducible algorithm
# over all executions. The running total is accumulated explicitly, in
# execution order.
avgRegretRepr = []
for roundIdx in range(T):
  regretSum = 0
  for runRegret in regretReprExec:
    regretSum += runRegret[roundIdx]
  avgRegretRepr.append(float(regretSum)/numExecutions)

# Per pair of executions: number of rounds in which the two pull sequences
# differ, first for UCB and then for the reproducible algorithm.
for diffCounts in (diffCounterUCB, diffCounterRepr):
  print(diffCounts)

# Average cumulative regret of both algorithms over the T rounds.
rounds = range(T)
for regretCurve, curveLabel in (
  (avgRegretUCB, "Average Regret UCB"),
  (avgRegretRepr, "Average Regret Repr. Batched Elimin."),
):
  plt.plot(rounds, regretCurve, label=curveLabel)
plt.legend()
plt.show()
#plt.savefig('UCBvsRepr.png', dpi = 1000, bbox_inches='tight')
#files.download('UCBvsRepr.png')

# Final cumulative regret of the last execution of each algorithm.
print(totalRegretRepr)
print(totalRegretUCB)


