# this file implements the five cascading bandit algorithms discussed in the paper
import numpy as np
import time


# estimate pseudo-regret for cascade_ucb1
# inputs: n -- horizon, K -- number of recommended items, reward -- vector of attraction probabilities
# outputs: regret -- estimated pseudo-regret, runtime -- time spent computing ucbs
def cascade_ucb1(n, K, reward):
    """Simulate cascade_ucb1 for n rounds and return its cumulative pseudo-regret.

    Args:
        n: horizon (number of rounds).
        K: number of items recommended in each round.
        reward: attraction probabilities, one per item (any 1-d sequence accepted by np.asarray).

    Returns:
        Tuple (regret, runtime). regret is a length-n array with the cumulative pseudo-regret after each
        round, measured against the K items with the largest attraction probabilities. runtime is the
        total time, in seconds, spent computing ucbs.
    """
    # initialization
    reward = np.asarray(reward)  # lets callers pass a plain list or tuple
    no_attract = 1 - reward  # per-item probability of no click when examined
    L = len(reward)  # figure out number of items
    regret = np.zeros(n)  # instantaneous pseudo-regret (accumulated on return)
    w_hat = np.zeros(L)  # empirical means for the items
    samples = np.zeros(L)  # number of samples for the items
    runtime = 0  # will be incremented to estimate time to compute ucbs
    # benchmark against the K most attractive items wherever they sit in reward; the stable sort keeps
    #   items 0..K-1 as the benchmark whenever reward is already sorted in decreasing order
    best = np.argsort(no_attract, kind="stable")[:K]
    opt_no_click = np.prod(no_attract[best])  # loop-invariant, so computed once
    for t in range(n):
        # compute ucbs and choose action; perf_counter is a monotonic high-resolution interval timer
        ucb, start = np.zeros(L), time.perf_counter()
        for e in range(L):  # kept as a loop so that runtime stays comparable with kl-ucb
            if samples[e] > 0:
                ucb[e] = min(w_hat[e] + np.sqrt(1.5 * np.log(t) / samples[e]), 1)  # min with 1 as in appendix B
            else:
                ucb[e] = 1  # unsampled items get the largest possible ucb
        runtime = runtime + (time.perf_counter() - start)
        act = (-ucb).argsort()[:K]  # K items with highest ucbs
        # play action and update statistics (iterating act also copes with K > L)
        for item in act:
            click = (np.random.rand() <= reward[item])  # = 1 if click, = 0 otherwise
            w_hat[item] = (samples[item] * w_hat[item] + click) / (samples[item] + 1)  # update empirical mean
            samples[item] = samples[item] + 1  # update number of samples
            if click:
                break  # user stops examining items after first click
        regret[t] = np.prod(no_attract[act]) - opt_no_click  # instantaneous pseudo-regret
    return np.cumsum(regret), runtime


# estimate pseudo-regret for cascade_ucbV
# inputs: n -- horizon; K -- number of recommended items; reward -- vector of attraction probabilities
# outputs: regret -- estimated pseudo-regret; runtime -- time spent computing ucbs
def cascade_ucbV(n, K, reward):
    """Simulate cascade_ucbV for n rounds and return its cumulative pseudo-regret.

    Args:
        n: horizon (number of rounds).
        K: number of items recommended in each round.
        reward: attraction probabilities, one per item (any 1-d sequence accepted by np.asarray).

    Returns:
        Tuple (regret, runtime). regret is a length-n array with the cumulative pseudo-regret after each
        round, measured against the K items with the largest attraction probabilities. runtime is the
        total time, in seconds, spent computing ucbs.
    """
    # initialization
    reward = np.asarray(reward)  # lets callers pass a plain list or tuple
    no_attract = 1 - reward  # per-item probability of no click when examined
    L = len(reward)  # figure out number of items
    regret = np.zeros(n)  # instantaneous pseudo-regret (accumulated on return)
    w_hat = np.zeros(L)  # empirical means for the items
    samples = np.zeros(L)  # number of samples for the items
    runtime = 0  # will be incremented to estimate time to compute ucbs
    # benchmark against the K most attractive items wherever they sit in reward; the stable sort keeps
    #   items 0..K-1 as the benchmark whenever reward is already sorted in decreasing order
    best = np.argsort(no_attract, kind="stable")[:K]
    opt_no_click = np.prod(no_attract[best])  # loop-invariant, so computed once
    for t in range(n):
        # compute ucbs and choose action; perf_counter is a monotonic high-resolution interval timer
        ucb, start = np.zeros(L), time.perf_counter()
        for e in range(L):  # kept as a loop so that runtime stays comparable with kl-ucb
            if samples[e] > 0:
                # empirical mean + variance-dependent bonus + lower-order term
                ucb[e] = min(
                    w_hat[e] + np.sqrt(4 * w_hat[e] * (1 - w_hat[e]) * np.log(t) / samples[e]) + 6 * np.log(t) /
                    samples[e], 1)  # min with 1 as in appendix B
            else:
                ucb[e] = 1  # unsampled items get the largest possible ucb
        runtime = runtime + (time.perf_counter() - start)
        act = (-ucb).argsort()[:K]  # K items with highest ucbs
        # play action and update statistics (iterating act also copes with K > L)
        for item in act:
            click = (np.random.rand() <= reward[item])  # = 1 if click, = 0 otherwise
            w_hat[item] = (samples[item] * w_hat[item] + click) / (samples[item] + 1)  # update empirical mean
            samples[item] = samples[item] + 1  # update number of samples
            if click:
                break  # user stops examining items after first click
        regret[t] = np.prod(no_attract[act]) - opt_no_click  # instantaneous pseudo-regret
    return np.cumsum(regret), runtime


# compute bernoulli relative entropy, up to some tolerance to avoid issues with division by zero
# inputs: p, q -- probabilities; tol -- tolerance
# outputs: (approximate) relative entropy between p and q
def rel_entropy(p, q, tol):
    """Return the Bernoulli relative entropy between p and q after clipping both to [tol, 1 - tol]."""
    ceiling = 1 - tol
    # clip first so that neither the ratios nor the logarithms see an exact 0 or 1
    p_clip = min(max(p, tol), ceiling)
    q_clip = min(max(q, tol), ceiling)
    success_term = p_clip * np.log(p_clip / q_clip)
    failure_term = (1 - p_clip) * np.log((1 - p_clip) / (1 - q_clip))
    return success_term + failure_term


# estimate pseudo-regret for cascade_klucb
# inputs: n -- horizon; K -- number of recommended items; reward -- vector of attraction probabilities;
#    tol -- tolerance used to stop the binary search and passed on to rel_entropy
# outputs: regret -- estimated pseudo-regret; runtime -- time spent computing ucbs
def cascade_klucb(n, K, reward, tol):
    """Simulate cascade_klucb for n rounds and return its cumulative pseudo-regret.

    Args:
        n: horizon (number of rounds).
        K: number of items recommended in each round.
        reward: attraction probabilities, one per item (any 1-d sequence accepted by np.asarray).
        tol: tolerance; the binary search for each ucb stops once its bracket is no wider than tol, and the
            same value is passed to rel_entropy for clipping. Should be positive.

    Returns:
        Tuple (regret, runtime). regret is a length-n array with the cumulative pseudo-regret after each
        round, measured against the K items with the largest attraction probabilities. runtime is the
        total time, in seconds, spent computing ucbs.
    """
    # initialization
    reward = np.asarray(reward)  # lets callers pass a plain list or tuple
    no_attract = 1 - reward  # per-item probability of no click when examined
    L = len(reward)  # figure out number of items
    regret = np.zeros(n)  # instantaneous pseudo-regret (accumulated on return)
    w_hat = np.zeros(L)  # empirical means for the items
    samples = np.zeros(L)  # number of samples for the items
    runtime = 0  # will be incremented to estimate time to compute ucbs
    # benchmark against the K most attractive items wherever they sit in reward; the stable sort keeps
    #   items 0..K-1 as the benchmark whenever reward is already sorted in decreasing order
    best = np.argsort(no_attract, kind="stable")[:K]
    opt_no_click = np.prod(no_attract[best])  # loop-invariant, so computed once
    for t in range(n):
        # compute ucbs and choose action; perf_counter is a monotonic high-resolution interval timer
        ucb, start = np.zeros(L), time.perf_counter()
        for e in range(L):
            if samples[e] > 0:
                # estimate ucb via binary search as in appendix C
                # the threshold does not change during the search, so compute it once per item
                threshold = np.log(t * (np.log(t + 1) ** 3)) / samples[e]
                lower, upper = w_hat[e], 1
                while upper - lower > tol:
                    mid = (lower + upper) / 2
                    if rel_entropy(w_hat[e], mid, tol) > threshold:
                        upper = mid  # mid is too far from w_hat[e]; shrink from above
                    else:
                        lower = mid  # mid is still within the confidence region
                ucb[e] = lower
            else:
                ucb[e] = 1  # unsampled items get the largest possible ucb
        runtime = runtime + (time.perf_counter() - start)
        act = (-ucb).argsort()[:K]  # K items with highest ucbs
        # play action and update statistics (iterating act also copes with K > L)
        for item in act:
            click = (np.random.rand() <= reward[item])  # = 1 if click, = 0 otherwise
            w_hat[item] = (samples[item] * w_hat[item] + click) / (samples[item] + 1)  # update empirical mean
            samples[item] = samples[item] + 1  # update number of samples
            if click:
                break  # user stops examining items after first click
        regret[t] = np.prod(no_attract[act]) - opt_no_click  # instantaneous pseudo-regret
    return np.cumsum(regret), runtime


# estimate pseudo-regret for cascade_linucb
# inputs: n -- horizon; K -- number of recommended items; phi - features; reward -- vector of attraction probabilities
#    (for synthetic data) or list containing W_test and approximately optimal solution (for real data, see appendix C)
# outputs: regret -- estimated pseudo-regret
def cascade_linucb(n, K, phi, reward):
    """Simulate cascade_linucb for n rounds and return its cumulative regret.

    Args:
        n: horizon (number of rounds).
        K: number of items recommended in each round.
        phi: 2-d feature array with one row per item.
        reward: for synthetic data, the vector of attraction probabilities (anything that is not a list).
            For real data, a list [W_test, opt]: W_test is indexed as W_test[user, item] and opt indexes the
            approximately optimal items (see appendix C). A list is always treated as the real-data form.

    Returns:
        Length-n array of cumulative regret: pseudo-regret against the K most attractive items for
        synthetic data, the "real" regret of appendix C otherwise.
    """
    # initialization
    regret = np.zeros(n)  # regret for horizon n
    _, d = np.shape(phi)  # feature dimension (phi must be 2-d)
    alpha = np.sqrt(d * np.log(1 + n * K / d) + 2 * np.log(n * K)) + 1  # exploration parameter
    theta_hat = np.zeros(d)  # estimate of true theta
    lam = np.identity(d)  # covariance matrix
    lam_inv = np.identity(d)  # inverse of lam (will be iteratively updated)
    ones_d = np.ones(d)  # reused every round to sum the bonus terms
    # determine if rewards synthetic
    synthetic = not isinstance(reward, list)
    if synthetic:
        reward = np.asarray(reward)  # also accept e.g. a tuple of probabilities
        no_attract = 1 - reward  # per-item probability of no click when examined
        # benchmark against the K most attractive items wherever they sit in reward; the stable sort keeps
        #   items 0..K-1 as the benchmark whenever reward is already sorted in decreasing order
        best = np.argsort(no_attract, kind="stable")[:K]
        opt_no_click = np.prod(no_attract[best])  # loop-invariant, so computed once
    else:
        W_test, opt = reward  # unpack the reward list
        num_users = W_test.shape[0]  # figure out number of test users
    for t in range(n):
        # in the next line, ((phi @ lam_inv) * phi) @ ones_d is a vectorized version of computing
        #   the bonus terms phi[e, :] @ lam_inv @ phi[e, :] for all items e
        ucb = np.minimum(phi @ theta_hat + alpha * np.sqrt(((phi @ lam_inv) * phi) @ ones_d), 1)  # compute ucbs
        act = (-ucb).argsort()[:K]  # K items with highest ucbs
        # play action and update statistics
        theta_hat_unnorm = lam @ theta_hat  # save the currently unnormalized theta_hat for later use
        success = 0  # whether or not there has been a click
        if not synthetic:
            user = np.random.randint(num_users)  # uniformly random test user
        for item in act:
            # update lam and lam_inv for the current feature (using sherman-morrison for the latter)
            phi_k = phi[item, :]
            lam = lam + np.outer(phi_k, phi_k)
            lam_inv_phi_k = lam_inv @ phi_k
            lam_inv = lam_inv - np.outer(lam_inv_phi_k, lam_inv_phi_k) / (1 + np.inner(phi_k, lam_inv_phi_k))
            if synthetic:
                click = np.random.rand() <= reward[item]  # sample a click from the attraction probability
            else:
                click = W_test[user, item]  # look up the click for the sampled test user
            if click:
                theta_hat_unnorm = theta_hat_unnorm + phi_k  # update unnormalized theta_hat
                success = 1  # flag that there was a click
                break  # user stops examining after first click
        theta_hat = lam_inv @ theta_hat_unnorm  # update theta_hat by normalizing
        if synthetic:
            regret[t] = np.prod(no_attract[act]) - opt_no_click  # instantaneous pseudo-regret
        else:
            regret[t] = max(W_test[user, opt]) - success  # "real" regret discussed in appendix C
    return np.cumsum(regret)


# estimate pseudo-regret for cascade_woful
# inputs: n -- horizon; K -- number of recommended items; phi - features; reward -- vector of attraction probabilities
#    (for synthetic data) or list containing W_test and approximately optimal solution (for real data, see appendix C)
# outputs: regret -- estimated pseudo-regret
def cascade_woful(n, K, phi, reward):
    """Simulate cascade_woful for n rounds and return its cumulative regret.

    Two estimates of theta are maintained, a hoeffding one and a bernstein one whose features are rescaled
    by the clipped hoeffding ucb; the ucb used to pick items is the minimum of both ucbs and 1.

    Args:
        n: horizon (number of rounds).
        K: number of items recommended in each round.
        phi: 2-d feature array with one row per item.
        reward: for synthetic data, the vector of attraction probabilities (anything that is not a list).
            For real data, a list [W_test, opt]: W_test is indexed as W_test[user, item] and opt indexes the
            approximately optimal items (see appendix C). A list is always treated as the real-data form.

    Returns:
        Length-n array of cumulative regret: pseudo-regret against the K most attractive items for
        synthetic data, the "real" regret of appendix C otherwise.
    """
    # initialization
    regret = np.zeros(n)  # regret for horizon n
    _, d = np.shape(phi)  # feature dimension (phi must be 2-d)
    ones_d = np.ones(d)  # reused every round to sum the bonus terms
    theta_hat_H = np.zeros(d)  # hoeffding estimate of true theta
    lam_H = np.identity(d)  # hoeffding covariance matrix
    lam_H_inv = np.identity(d)  # inverse of lam_H (will be iteratively updated)
    theta_hat_B = np.zeros(d)  # bernstein estimate of true theta
    lam_B = np.identity(d) * K  # bernstein covariance matrix
    lam_B_inv = np.identity(d) / K  # inverse of lam_B (will be iteratively updated)
    # determine if rewards synthetic
    synthetic = not isinstance(reward, list)
    if synthetic:
        reward = np.asarray(reward)  # also accept e.g. a tuple of probabilities
        no_attract = 1 - reward  # per-item probability of no click when examined
        # benchmark against the K most attractive items wherever they sit in reward; the stable sort keeps
        #   items 0..K-1 as the benchmark whenever reward is already sorted in decreasing order
        best = np.argsort(no_attract, kind="stable")[:K]
        opt_no_click = np.prod(no_attract[best])  # loop-invariant, so computed once
    else:
        W_test, opt = reward  # unpack the reward list
        num_users = W_test.shape[0]  # figure out number of test users
    for t in range(n):
        # compute ucbs and choose action
        explore = np.sqrt(d * np.log(1 + t * K / d) + 2 * np.log(n))  # term shared by both parameters
        alpha_t_H = explore + 1  # hoeffding exploration parameter
        alpha_t_B = explore + np.sqrt(K)  # bernstein exploration parameter
        ucb_H = phi @ theta_hat_H + alpha_t_H * np.sqrt(((phi @ lam_H_inv) * phi) @ ones_d)  # hoeffding ucbs
        ucb_B = phi @ theta_hat_B + alpha_t_B * np.sqrt(((phi @ lam_B_inv) * phi) @ ones_d)  # bernstein ucbs
        ucb = np.minimum(np.minimum(ucb_H, ucb_B), 1)  # minimum of three ucbs as in appendix B
        act = (-ucb).argsort()[:K]  # K items with highest ucbs
        # play action and update statistics
        theta_hat_H_unnorm = lam_H @ theta_hat_H  # save the currently unnormalized theta_hat_H for later use
        theta_hat_B_unnorm = lam_B @ theta_hat_B  # save the currently unnormalized theta_hat_B for later use
        success = 0  # whether or not there has been a click
        if not synthetic:
            user = np.random.randint(num_users)  # uniformly random test user
        for item in act:
            # update lam_* and lam_*_inv for the current feature (using sherman-morrison for the latter)
            phi_k_H = phi[item, :]
            weight = min(max(ucb_H[item], 1 / K), 1)  # hoeffding ucb clipped to [1 / K, 1]
            phi_k_B = phi_k_H / np.sqrt(weight)  # bernstein feature: rescaled by the clipped ucb
            lam_H = lam_H + np.outer(phi_k_H, phi_k_H)
            lam_H_inv_phi = lam_H_inv @ phi_k_H
            lam_H_inv = lam_H_inv - np.outer(lam_H_inv_phi, lam_H_inv_phi) / (1 + np.inner(phi_k_H, lam_H_inv_phi))
            lam_B = lam_B + np.outer(phi_k_B, phi_k_B)
            lam_B_inv_phi = lam_B_inv @ phi_k_B
            lam_B_inv = lam_B_inv - np.outer(lam_B_inv_phi, lam_B_inv_phi) / (1 + np.inner(phi_k_B, lam_B_inv_phi))
            if synthetic:
                click = np.random.rand() <= reward[item]  # sample a click from the attraction probability
            else:
                click = W_test[user, item]  # look up the click for the sampled test user
            if click:
                # update unnormalized theta_hat_*
                theta_hat_H_unnorm = theta_hat_H_unnorm + phi_k_H
                theta_hat_B_unnorm = theta_hat_B_unnorm + phi_k_H / weight
                success = 1  # flag that there was a click
                break  # user stops examining after first click
        # update theta_hat_* by normalizing
        theta_hat_H = lam_H_inv @ theta_hat_H_unnorm
        theta_hat_B = lam_B_inv @ theta_hat_B_unnorm
        # compute instantaneous regret
        if synthetic:
            regret[t] = np.prod(no_attract[act]) - opt_no_click  # instantaneous pseudo-regret
        else:
            regret[t] = max(W_test[user, opt]) - success  # "real" regret discussed in appendix C
    return np.cumsum(regret)
