import numpy as np
from utils import sigmoid
from sklearn.kernel_ridge import KernelRidge

class KR:
    """Thompson-sampling action selector driven by kernel regression.

    The expected reward of every action is estimated as a
    similarity-weighted average of the rewards observed so far. That
    estimate is converted into pseudo-counts of successes and failures,
    which parameterise one Beta distribution per action. One value is
    sampled per action and the allowed action with the largest sample wins.

    Action IDs are used directly as indices into the Beta parameter arrays
    and into ``similarity_matrix``, so they should be the integers
    ``0 .. n-1``.
    """

    def __init__(
            self,
            actions,  # list of all action IDs; should be a list of integers from 0 to n-1
            similarity_matrix,
            max_reward=100,
    ):
        """Store the action set and the similarity lookup.

        Args:
            actions: All action IDs (integers ``0 .. n-1``).
            similarity_matrix: Object indexable as
                ``similarity_matrix[action_a, action_b]`` that yields the
                similarity between two actions.
            max_reward: Largest possible reward; used to scale a predicted
                reward into a success fraction. Must be positive.

        Raises:
            ValueError: If ``max_reward`` is not positive.
        """
        if max_reward <= 0:
            raise ValueError("max_reward must be positive")
        self.actions = actions
        self.similarity_matrix = similarity_matrix
        self.max_reward = max_reward
        # No observations yet; select_action falls back to a uniform random pick.
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Save the observed data used by later calls to ``select_action``.

        Args:
            X: The action IDs (NOT features!) of the data, one per observation.
            y: The true observed rewards for the taken actions, aligned with ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError("X and y must have the same length")
        self.X_train = X
        self.y_train = y

    def _predict_reward(self, action):
        """Return the similarity-weighted mean observed reward for ``action``."""
        weighted_rewards = 0
        total_similarity = 0
        for observed_action, reward in zip(self.X_train, self.y_train):
            similarity = self.similarity_matrix[action, int(observed_action)]
            weighted_rewards += similarity * reward
            total_similarity += similarity

        # Nothing similar has been observed: the prediction defaults to 0.
        if total_similarity == 0:
            return 0
        return weighted_rewards / total_similarity

    def _posterior_params(self):
        """Return the Beta parameters ``(a, b)`` for every action."""
        n_data_points = len(self.X_train)
        # Start from one pseudo-success and one pseudo-failure per action,
        # i.e. a 50% chance of success held with low confidence.
        a = np.ones((len(self.actions),))
        b = np.ones((len(self.actions),))

        for action in self.actions:
            predicted_reward = self._predict_reward(action)
            successes = max(int(predicted_reward / self.max_reward * n_data_points), 0)
            # Failures are derived from the success count alone, so the
            # prior pseudo-failure in ``b`` is kept rather than cancelled.
            failures = max(n_data_points - successes, 0)
            a[action] += successes
            b[action] += failures

        return a, b

    def select_action(self, allowed_actions):
        """Sample one reward per action and return the best allowed action.

        Args:
            allowed_actions: Iterable of action IDs that may be returned.

        Returns:
            One element of ``allowed_actions``. Before any data has been
            fitted this is a uniformly random choice.

        Raises:
            ValueError: If ``allowed_actions`` is empty.
        """
        allowed = list(allowed_actions)
        if not allowed:
            raise ValueError("allowed_actions must contain at least one action")

        if getattr(self, "X_train", None) is None:
            # If there is no data so far, return a random allowed action.
            return np.random.choice(allowed)

        # 1. Sample expected rewards from all predicted reward distributions.
        a, b = self._posterior_params()
        samples = np.random.beta(a, b)

        # 2. Choose the action with the highest sampled value, looking only
        #    at the allowed actions so a disallowed action can never win.
        #    argmax returns the first maximum, so ties resolve deterministically.
        best_position = int(np.argmax(samples[allowed]))
        return allowed[best_position]
