import numpy as np

from utils import get_random_actions, get_optimal_actions, sigmoid, get_starting_data, run_experiment, sigmoid


class Thompson:
    """Thompson-sampling action selector with one Beta posterior per action.

    The model keeps the full observed history (action IDs and rewards) and
    rebuilds the Beta(a, b) parameters from scratch on every call to
    ``select_action``. Each observed reward is turned into a Bernoulli
    success/failure by a random draw, so the a/b counts themselves are
    stochastic and differ between calls.
    """

    def __init__(
        self,
        actions,  # list of all action IDs; should be a list of integers from 0 to n-1
        max_reward,  # used to compute the alpha/beta params
    ):
        self.actions = actions
        self.max_reward = max_reward
        # X_train is a 1D sequence of action IDs for the observed data so far;
        # y_train holds the reward observed for each of those actions.
        self.X_train = []
        self.y_train = []

    def fit(self, X, y):
        """Replace the stored history with ``X`` and ``y``.

        No parameters are computed here; the Beta parameters are derived
        from the stored history inside ``select_action``.

        Args:
            X: the action IDs (NOT features!) of the data.
            y: the true observed rewards for the taken actions, aligned
                index-by-index with ``X``.
        """
        self.X_train = X
        self.y_train = y

    def select_action(self, allowed_actions):
        """Sample every action's posterior and return the best allowed action.

        Args:
            allowed_actions: iterable of action IDs (integer indices into
                ``self.actions``) that may be chosen.

        Returns:
            The allowed action ID with the highest sampled value, as a NumPy
            integer. On ties the first one in ``allowed_actions`` wins.

        Raises:
            ValueError: if ``allowed_actions`` is empty.
        """
        allowed = np.asarray(list(allowed_actions), dtype=np.intp)
        if allowed.size == 0:
            raise ValueError("allowed_actions must contain at least one action")

        # Start from Beta(1, 1): one pseudo-success and one pseudo-failure
        # per action.
        n_actions = len(self.actions)
        a = np.ones((n_actions,))
        b = np.ones((n_actions,))

        for i, observed_action in enumerate(self.X_train):
            action_id = int(observed_action)

            # Cap the reward at max_reward, scale by max_reward, and pass it
            # through sigmoid (imported from utils) to obtain a success
            # probability.
            # NOTE(review): int() truncates fractional rewards before the
            # division -- confirm this is intended.
            capped_reward = int(min(self.max_reward, self.y_train[i]))
            p_success = sigmoid(capped_reward / self.max_reward)

            # Convert the reward into a Bernoulli outcome.
            reward = np.random.choice(np.arange(0, 2), p=[1 - p_success, p_success])
            if reward == 1:
                a[action_id] += 1
            else:
                b[action_id] += 1

        samples = np.random.beta(a, b)

        # Take the argmax over the allowed subset only, so an action outside
        # allowed_actions can never be returned, even if its sample happens
        # to equal the allowed maximum. argmax returns the first maximum, so
        # ties still yield a single action.
        return allowed[np.argmax(samples[allowed])]




# Thompson sampling baseline
def thompson_sample(model, actions, action_data, score_data, allowed_actions):
    """Pick an action by Thompson sampling over per-action Beta posteriors.

    Args:
        model: unused; a dummy input to match the format of SVM or model
            evaluation.
        actions: sequence of all actions; only its length is used, to size
            the parameter arrays.
        action_data: action IDs taken so far (integer indices into the
            parameter arrays).
        score_data: outcome for each entry of ``action_data``; a value equal
            to 1 counts as a success, anything else as a failure.
        allowed_actions: iterable of action IDs that may be chosen.

    Returns:
        A tuple ``(best_action, sampled_value)``: the allowed action with the
        highest sampled value and that sampled value.

    Raises:
        ValueError: if ``allowed_actions`` is empty.
    """
    allowed = np.asarray(list(allowed_actions), dtype=np.intp)
    if allowed.size == 0:
        raise ValueError("allowed_actions must contain at least one action")

    # 1. Sample expected rewards from all predicted reward distributions.
    # a -> count of successes per action, b -> count of failures per action.
    # Both start at one (one success, one failure per action), i.e. a 50/50
    # prior that sharpens as more samples are collected.
    n_actions = len(actions)
    a = np.ones((n_actions,))
    b = np.ones((n_actions,))
    for i, taken_action in enumerate(action_data):
        action_index = int(taken_action)
        if score_data[i] == 1:
            a[action_index] += 1
        else:
            b[action_index] += 1

    samples = np.random.beta(a, b)

    # 2. Choose the action with the highest sampled value, looking only at
    # the allowed actions so that a disallowed action can never be returned,
    # even if its sample equals the allowed maximum. argmax returns the
    # first maximum, so ties still yield a single action.
    best_action = allowed[np.argmax(samples[allowed])]

    # Return the action and sampled value.
    return best_action, samples[best_action]

