import numpy as np
import scipy.stats as st
import matplotlib.pylab as plt
# First, set up the problem: the noise level and the random reward function.
# Parameters
sigma = 0.1  # standard deviation of the Gaussian reward noise

# The random reward function f(x)
mode = np.random.uniform(0, 1, 2)  # mode[0]: location of the mode, mode[1]: its height


def f(x, peak=None):
    """Evaluate the piecewise-linear reward function at ``x``.

    The function connects the points (0, 0), (location, height) and (1, 0)
    with straight lines.

    Parameters
    ----------
    x : float or array-like
        Point(s) at which to evaluate the function.
    peak : sequence of two floats, optional
        ``(location, height)`` of the mode.  When omitted, the current
        module-level ``mode`` is used, so callers that rebind ``mode``
        before calling ``f(x)`` keep working unchanged.

    Returns
    -------
    float or numpy.ndarray
        Linear interpolation of the three anchor points at ``x``.
    """
    loc, height = mode if peak is None else peak
    xp = np.array([0, loc, 1])
    fp = np.array([0, height, 0])
    return np.interp(x, xp, fp)
# We implement all four algorithms in the same run.
# We start with our algorithm.
# Simulate "our algorithm": sample the grid arms from left to right in
# batches of m pulls, stop at the first arm whose upper confidence bound falls
# below the best lower confidence bound seen so far, and charge the rest of
# the horizon to that arm.
Tvec = np.arange(1000, 110000, 10000)  # horizons T to evaluate
n = 1000  # simulations (independent replications) per horizon
reg_mat = np.zeros((len(Tvec), n))  # reg_mat[i, j]: regret for horizon i, replication j
for indt, T in enumerate(Tvec):  # over horizons
    # np.int was removed in NumPy 1.24; the builtin int truncates identically.
    K = int(np.floor(T**0.25))  # number of grid intervals (K + 1 arms)
    m = int(np.floor(T**0.5))   # pulls per batch
    # linspace guarantees exactly K + 1 points, matching the reward columns.
    disc = np.linspace(0, 1, K + 1)
    radius = sigma * np.sqrt(2 * np.log(m) / m)  # confidence half-width of a batch mean
    for indn in range(n):
        mode = np.random.uniform(0, 1, 2)  # this is the mode; f() reads it
        mode[1] = 1 - (1 - mode[1]) / 2    # rescale the height into [0.5, 1]
        f_disc = f(disc)
        # generate K+1 batches of rewards (noise only; the mean f is added below)
        reward = np.random.normal(0, sigma, (m, K + 1))
        batch_mean = np.mean(reward, 0) + f_disc
        ub = batch_mean + radius
        lb = batch_mean - radius
        # s is the first arm whose UCB drops below the running maximum LCB,
        # or the last arm if that never happens.  s >= 1 whenever the loop
        # breaks, because ub[0] > lb[0].
        for s in range(len(ub)):
            if ub[s] < np.max(lb[:s + 1]):
                break
        # Regret against the mode height: arms 0..s-1 are charged m pulls each
        # and arm s the remaining T - s*m pulls, so the pull counts add up to T.
        # The original multiplied the *average* gap of the first s arms by m
        # only (dropping the factor s), which accounted for m + T - s*m pulls.
        explore_regret = m * np.sum(mode[1] - f_disc[:s])
        commit_regret = (T - s * m) * (mode[1] - f_disc[s])
        reg_mat[indt, indn] = explore_regret + commit_regret

mean_reg = np.mean(reg_mat, 1)
half_width = 1.96 * np.std(reg_mat, 1) / np.sqrt(n)  # 95% normal confidence band
plt.plot(Tvec, mean_reg)
plt.plot(Tvec, mean_reg + half_width)
plt.plot(Tvec, mean_reg - half_width)
plt.title('Our algorithm')
plt.show()

# Columns: horizon, mean regret, standard deviation of regret.
# NOTE(review): every section of this file writes to 'temp.csv', so each
# section overwrites the table saved by the previous one.
a = np.column_stack((Tvec, mean_reg, np.std(reg_mat, 1)))
np.savetxt('temp.csv', a, delimiter=',')

# Notebook inspection: regrets of replications 1..99 at the largest horizon.
reg_mat[10, 1:100]
array([ 8451.27122352, 6912.99825288, 34856.95386238, 9652.57584862, 25584.47473224, 10506.72539188, 9573.1155781 , 12834.35145582, 9975.01224538, 15758.86445143, 9479.64210747, 5758.59739529, 10264.07703521, 20775.11281158, 10937.18774297, 8518.78864622, 18009.79007795, 13100.04480474, 11257.4152817 , 9571.55097644, 8389.605859 , 11640.60775609, 24418.79462226, 6733.30809732, 7811.90715985, 9274.78454348, 7570.65795032, 7085.48035483, 9730.61597751, 8509.34329993, 51521.99904888, 8759.51754286, 7285.44717372, 15145.104213 , 7504.68007288, 9025.51011578, 6454.61482239, 10216.27072564, 10951.42882875, 21449.85600582, 7094.42825265, 11395.89493196, 9722.27540533, 9923.75293524, 13030.01085268, 8282.69672395, 8948.00039599, 12300.91691767, 11120.81756798, 13741.2456082 , 6068.75081809, 7552.4120525 , 6711.9258694 , 17200.22137422, 13332.12799846, 9952.50977431, 20493.87882586, 7189.70436095, 6389.14918036, 10213.56941473, 9110.38236202, 74791.50703823, 6325.5667405 , 6524.19055359, 7440.5331628 , 5723.55055107, 90403.91948114, 10263.75189017, 23954.55309231, 16830.07464039, 12956.80339232, 11486.70649228, 5841.53555932, 9360.76811776, 6004.14761574, 19928.10189096, 15031.14235287, 7668.45150041, 12053.15641986, 6529.79167755, 15178.03973511, 7608.68490885, 10704.61806231, 11687.96248388, 9501.08340969, 7910.65531955, 6981.87688235, 6082.4559523 , 14116.91325892, 4569.48804848, 14952.61894727, 10035.35953852, 11815.33906572, 7351.51891271, 7908.22824893, 71918.89800284, 5865.23258736, 10440.53297053, 17445.80148778])
# Plain UCB baseline. This is "cheating" because the sequence of arms it plays is not constrained to be increasing.
# Simulate plain UCB on a grid of K + 1 arms, with no ordering constraint on
# the arms played.  Relies on Tvec, sigma and f defined earlier in the file.
n = 100  # replications per horizon
reg_mat = np.zeros((len(Tvec), n))  # reg_mat[i, j]: regret for horizon i, replication j
for indt, T in enumerate(Tvec):
    # np.int was removed in NumPy 1.24; the builtin int truncates identically.
    K = int(T**(1/3))
    disc = np.linspace(0, 1, K + 1)  # UCB discretization
    for indn in range(n):
        mode = np.random.uniform(0, 1, 2)  # this is the mode; f() reads it
        mode[1] = 1 - (1 - mode[1]) / 2    # rescale the height into [0.5, 1]
        # The original redrew `mode` here, discarding the rescaling above and
        # making this baseline use a different reward distribution from the
        # other sections; the duplicate draw is removed.
        f_disc = f(disc)  # expected reward of every arm, evaluated once
        cum_reward = 0
        ub = 1000 * np.ones(len(disc))  # large initial bound: every arm is tried first
        avg_x = np.zeros(len(disc))     # running mean reward per arm
        n_disc = np.zeros(len(disc))    # pull count per arm
        for t in range(int(T)):
            opt_x = np.argmax(ub)
            # Draw a scalar: storing a length-1 array into avg_x[opt_x] is
            # deprecated since NumPy 1.25.
            sample = np.random.normal(f_disc[opt_x], sigma)
            avg_x[opt_x] = (avg_x[opt_x] * n_disc[opt_x] + sample) / (n_disc[opt_x] + 1)
            n_disc[opt_x] = n_disc[opt_x] + 1
            bonus = sigma * np.sqrt(2 * np.log(1 + (t + 1) * (np.log(t + 1))**2) / n_disc[opt_x])
            ub[opt_x] = avg_x[opt_x] + bonus
            cum_reward = cum_reward + f_disc[opt_x]  # expected (noise-free) reward
        reg_mat[indt, indn] = T * mode[1] - cum_reward

plt.plot(Tvec, np.mean(reg_mat, 1))
plt.title('UCB')
plt.show()

# Columns: horizon, mean regret, standard deviation of regret.
# NOTE(review): every section of this file writes to 'temp.csv', so each
# section overwrites the table saved by the previous one.
a = np.column_stack((Tvec, np.mean(reg_mat, 1), np.std(reg_mat, 1)))
np.savetxt('temp.csv', a, delimiter=',')
# UCB restricted to an increasing sequence of arms: it may never play an arm to the left of the one it played last.
# Simulate UCB restricted to a non-decreasing sequence of arms: at each step
# only the current arm and the arms to its right are admissible.
# Relies on Tvec, sigma and f defined earlier in the file.
n = 100  # replications per horizon
reg_mat = np.zeros((len(Tvec), n))  # reg_mat[i, j]: regret for horizon i, replication j
for indt, T in enumerate(Tvec):
    # np.int was removed in NumPy 1.24; the builtin int truncates identically.
    K = int(T**(1/3))
    disc = np.linspace(0, 1, K + 1)  # UCB discretization
    for indn in range(n):
        mode = np.random.uniform(0, 1, 2)  # this is the mode; f() reads it
        mode[1] = 1 - (1 - mode[1]) / 2    # rescale the height into [0.5, 1]
        f_disc = f(disc)  # expected reward of every arm, evaluated once
        cum_reward = 0
        current_idx = 0                # index of the arm played last
        ub = np.ones(len(disc))        # initial upper bound
        n_disc = np.zeros(len(disc))   # pull count per arm
        avg_x = np.zeros(len(disc))    # running mean reward per arm
        for t in range(int(T)):
            # Search only the admissible arms.  The original masked with
            # ub * (disc >= current_x), which gives inadmissible arms the
            # value 0; whenever every admissible bound was <= 0, argmax then
            # picked an arm to the left and broke the increasing constraint.
            opt_x = current_idx + np.argmax(ub[current_idx:])
            # Draw a scalar: storing a length-1 array into avg_x[opt_x] is
            # deprecated since NumPy 1.25.
            sample = np.random.normal(f_disc[opt_x], sigma)
            avg_x[opt_x] = (avg_x[opt_x] * n_disc[opt_x] + sample) / (n_disc[opt_x] + 1)
            n_disc[opt_x] = n_disc[opt_x] + 1
            current_idx = opt_x
            bonus = sigma * np.sqrt(2 * np.log(1 + (t + 1) * (np.log(t + 1))**2) / n_disc[opt_x])
            ub[opt_x] = avg_x[opt_x] + bonus
            cum_reward = cum_reward + f_disc[opt_x]  # expected (noise-free) reward
        reg_mat[indt, indn] = T * mode[1] - cum_reward

plt.plot(Tvec, np.mean(reg_mat, 1))
plt.title('UCB Increasing')
plt.show()

# Columns: horizon, mean regret, standard deviation of regret.
# NOTE(review): every section of this file writes to 'temp.csv', so each
# section overwrites the table saved by the previous one.
a = np.column_stack((Tvec, np.mean(reg_mat, 1), np.std(reg_mat, 1)))
np.savetxt('temp.csv', a, delimiter=',')
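# UCB with increasing sequence and a position-dependent initial upper bound: arm x starts at 1 - x rather than 1 (plotted below as 'UCB Deflated').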
# Simulate UCB restricted to a non-decreasing sequence of arms, where the
# initial upper bound of the arm at position x is 1 - x instead of 1.
# Relies on Tvec, sigma and f defined earlier in the file.
n = 100  # replications per horizon
reg_mat = np.zeros((len(Tvec), n))  # reg_mat[i, j]: regret for horizon i, replication j
for indt, T in enumerate(Tvec):
    # np.int was removed in NumPy 1.24; the builtin int truncates identically.
    K = int(T**(1/3))
    disc = np.linspace(0, 1, K + 1)  # UCB discretization
    for indn in range(n):
        mode = np.random.uniform(0, 1, 2)  # this is the mode; f() reads it
        mode[1] = 1 - (1 - mode[1]) / 2    # rescale the height into [0.5, 1]
        f_disc = f(disc)  # expected reward of every arm, evaluated once
        cum_reward = 0
        current_idx = 0                # index of the arm played last
        ub = 1 - disc                  # initial upper bound, decreasing in x
        n_disc = np.zeros(len(disc))   # pull count per arm
        avg_x = np.zeros(len(disc))    # running mean reward per arm
        for t in range(int(T)):
            # Search only the admissible arms.  The original masked with
            # ub * (disc >= current_x), which gives inadmissible arms the
            # value 0.  The last arm's initial bound is itself 0 here, so it
            # tied with the masked arms, and whenever every admissible bound
            # was <= 0 argmax picked an arm to the left and broke the
            # increasing constraint.
            opt_x = current_idx + np.argmax(ub[current_idx:])
            # Draw a scalar: storing a length-1 array into avg_x[opt_x] is
            # deprecated since NumPy 1.25.
            sample = np.random.normal(f_disc[opt_x], sigma)
            avg_x[opt_x] = (avg_x[opt_x] * n_disc[opt_x] + sample) / (n_disc[opt_x] + 1)
            n_disc[opt_x] = n_disc[opt_x] + 1
            current_idx = opt_x
            bonus = sigma * np.sqrt(2 * np.log(1 + (t + 1) * (np.log(t + 1))**2) / n_disc[opt_x])
            ub[opt_x] = avg_x[opt_x] + bonus
            cum_reward = cum_reward + f_disc[opt_x]  # expected (noise-free) reward
        reg_mat[indt, indn] = T * mode[1] - cum_reward

plt.plot(Tvec, np.mean(reg_mat, 1))
plt.title('UCB Deflated')
plt.show()

# Columns: horizon, mean regret, standard deviation of regret.
# NOTE(review): every section of this file writes to 'temp.csv', so each
# section overwrites the table saved by the previous one.
a = np.column_stack((Tvec, np.mean(reg_mat, 1), np.std(reg_mat, 1)))
np.savetxt('temp.csv', a, delimiter=',')