import numpy as np
from utils import sigmoid
from sklearn.svm import SVR


class SVM:
    """Action selector that uses an SVR reward model as a prior for Beta sampling.

    The model is trained on action IDs (not feature vectors): each training
    example is a single integer action ID, and the target is the reward that
    was observed for that action.

    Attributes:
        actions: all action IDs; expected to be the integers 0 .. n-1, because
            the IDs are used directly as indices into per-action arrays.
        svr_kernel: kernel name forwarded to ``SVR(kernel=...)``.
        X_train: action IDs observed so far (stored as passed by the caller).
        y_train: rewards observed for ``X_train``.
        model: the fitted ``SVR`` instance.
    """

    def __init__(
            self,
            actions,  # list of all action IDs; should be a list of integers from 0 to n-1
            svr_kernel,
            starting_action_data,  # Pass in some initial data
            starting_score_data,
    ):
        """Store the configuration and fit the model on the initial data.

        Args:
            actions: all action IDs (integers 0 .. n-1).
            svr_kernel: kernel passed to ``SVR``.
            starting_action_data: initial observed action IDs.
            starting_score_data: rewards matching ``starting_action_data``.
        """
        self.actions = actions
        self.svr_kernel = svr_kernel
        self.X_train = starting_action_data
        self.y_train = starting_score_data

        # ----- INITIALIZE MODEL -----
        self.model = SVR(kernel=svr_kernel)
        # SVR needs a 2D (n_samples, n_features) matrix, so the action IDs are
        # turned into a single-feature column exactly as `fit` does.
        self.model.fit(self._as_column(self.X_train), self.y_train)

    @staticmethod
    def _as_column(values):
        """Return `values` as a float array of shape (len(values), 1)."""
        column = np.asarray(values, dtype=float)
        return column.reshape(len(column), 1)

    def fit(self, X, y):
        """Replace the stored training data and re-fit the model.

        Args:
            X: the action IDs (NOT features!) of the data.
            y: the true observed rewards for the taken actions.
        """
        # Save the training data
        self.X_train = X
        self.y_train = y
        # Re-fit the model on a single-feature column of action IDs.
        self.model.fit(self._as_column(self.X_train), self.y_train)

    def predict_all_actions(self, data_shape, actions):
        """Predict the reward of every action ID in `actions`.

        Args:
            data_shape: unused; kept so existing callers keep working. Earlier
                versions padded the query up to this shape, but prediction is
                done row by row, so the query can simply contain the actions.
            actions: action IDs to score (list or array).

        Returns:
            1D array with one predicted reward per entry of `actions`, in the
            same order.
        """
        query = self._as_column(actions)
        if query.shape[0] == 0:
            # Nothing to score; avoid handing an empty matrix to the model.
            return np.empty(0)
        return self.model.predict(query)

    def select_action(self, allowed_actions):
        """Pick an action by sampling from per-action Beta distributions.

        The predicted reward of each action is passed through `sigmoid`
        (imported from utils; presumably maps to a probability in [0, 1]) and
        scaled by the number of observed data points to obtain pseudo-counts
        of successes and failures. One value is sampled per action and the
        allowed action with the highest sample is returned.

        Args:
            allowed_actions: iterable of action IDs that may be returned.

        Returns:
            The allowed action ID with the highest sampled value.

        Raises:
            ValueError: if `allowed_actions` is empty.
        """
        n_data_points = len(self.X_train)
        action_ids = np.asarray(self.actions, dtype=int).ravel()
        n_actions = len(action_ids)

        # Beta parameters, one slot per action (NOT per data point):
        # a -> count of successes, b -> count of failures.
        # Start from 1 success and 1 failure: 50% chance with low confidence.
        a = np.ones((n_actions,))
        b = np.ones((n_actions,))

        # Use the model's predictions as the prior for every action.
        predictions = self.predict_all_actions((n_data_points,), action_ids)
        for action, prediction in zip(action_ids, predictions):
            predicted_reward = sigmoid(prediction)
            # Clamp so that successes + failures always equals n_data_points.
            successes = min(max(int(predicted_reward * n_data_points), 0), n_data_points)
            a[action] += successes
            b[action] += n_data_points - successes

        samples = np.random.beta(a, b)

        # Restrict the search to the allowed actions so that a disallowed
        # action can never be returned, even when sampled values tie.
        allowed = np.array(list(allowed_actions), dtype=int)
        if allowed.size == 0:
            raise ValueError("allowed_actions must contain at least one action")
        # argmax returns the first maximum, so ties resolve deterministically.
        best_action = allowed[np.argmax(samples[allowed])]

        # Return the action.
        return best_action

