# Defining helper functions that make the multi-arm bandit script more readable and modular.
import random
import numpy as np
import math
import os
import json
import pandas as pd
from scipy.stats import spearmanr

# Define constants
# Standard deviation of the Normal noise the environment applies around an action's true score
# when a reward is sampled in run_experiment (np.random.normal(score, ENVIRONMENT_REWARD_STD)).
ENVIRONMENT_REWARD_STD = 1  # true standard deviation of the rewards sampled by the environment


def get_embedding_kernel(filename):
    """Load a single saved similarity matrix (kernel) from disk.

    Args:
        filename: path of the file to read; it is opened in binary mode and
            handed to ``np.load``.

    Returns:
        Whatever ``np.load`` produces for that file (for a ``.npy`` file, the
        stored array), returned unmodified.
    """
    # The matrix is returned exactly as stored - no corruption/noise is applied here,
    # because representational alignment is only measured once for this model.
    with open(filename, 'rb') as kernel_file:
        return np.load(kernel_file)


def get_all_embedding_kernels(kernels_path):
    """Load every ``.npy`` similarity matrix found directly inside a directory.

    Args:
        kernels_path: directory to scan (non-recursively, via ``os.listdir``).

    Returns:
        dict mapping model name -> loaded kernel, where the model name is the
        file name with its trailing ``.npy`` extension removed. Files without
        that extension are ignored.
    """
    suffix = ".npy"
    kernels = {}
    for filename in os.listdir(kernels_path):
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing extension. str.replace would also delete any
        # ".npy" occurring in the middle of the name (e.g. "a.npy_b.npy" -> "a_b"),
        # which could make two different files collide on the same key.
        model_name = filename[:-len(suffix)]
        kernels[model_name] = get_embedding_kernel(os.path.join(kernels_path, filename))

    return kernels


# Build a random environment: a fixed number of actions, each with its own reward score.
# Actions are labelled with the integers 0 .. n_actions - 1.
def get_random_actions(n_actions, max_reward, min_reward):
    """Create integer action labels and a randomly drawn score for each one.

    Args:
        n_actions: how many actions to create.
        max_reward: upper bound passed to ``np.random.uniform``.
        min_reward: lower bound passed to ``np.random.uniform``.

    Returns:
        (actions, scores): two integer NumPy arrays of length ``n_actions``.
        ``actions`` holds the labels 0..n_actions-1; ``scores`` holds one
        uniform draw per action, converted with ``astype(int)`` (so the
        fractional part of each draw is discarded).
    """
    labels = np.array(list(range(n_actions))).astype(int)

    # One scalar draw per action, in action order.
    draws = [np.random.uniform(min_reward, max_reward) for _ in range(n_actions)]
    rewards = np.array(draws).astype(int)

    return labels, rewards



# Collect every action whose morality score ties for the highest score.
def get_optimal_actions(n_actions, actions, scores):
    """Return the set of actions that achieve the maximum score.

    Args:
        n_actions: number of leading entries of ``actions``/``scores`` to inspect.
        actions: indexable collection of action labels.
        scores: indexable collection of scores, parallel to ``actions``.

    Returns:
        set of the labels ``actions[i]`` (for ``i < n_actions``) whose score
        equals ``np.max(scores)``. Note the maximum is taken over all of
        ``scores``, not only the first ``n_actions`` entries.
    """
    best = np.max(scores)
    return {actions[idx] for idx in range(n_actions) if scores[idx] == best}


def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) for a scalar ``x``.

    Args:
        x: a real number.

    Returns:
        float in [0, 1].

    The direct formula is used whenever it can be evaluated, so results are
    unchanged for ordinary inputs. For very negative ``x`` (roughly x < -709)
    ``math.exp(-x)`` overflows; instead of propagating ``OverflowError`` the
    function falls back to ``exp(x)``, which equals the sigmoid to within
    floating-point precision in that regime (and underflows to 0.0).
    """
    try:
        return 1 / (1 + math.exp(-x))
    except OverflowError:
        # exp(-x) is too large to represent, so 1 + exp(-x) ~= exp(-x)
        # and the sigmoid reduces to exp(x).
        return math.exp(x)


# Compute ground-truth similarity matrix without corruption.
def get_similarity_matrix(x, y, actions, scores, MAX_REWARD, MIN_REWARD, kernel_fn):
    """Build a square matrix of pairwise similarities between actions, derived from their scores.

    Note: the inputs ``x`` and ``y`` are the two collections of actions to compare,
    NOT features and targets.

    Args:
        x: array of action labels; ``x.shape[0]`` fixes the matrix size ``n``.
        y: array of action labels; only its first ``n`` entries are used.
        actions: array of all action labels; each entry of ``x``/``y`` is looked
            up here (first match) to find its position.
        scores: scores parallel to ``actions``.
        MAX_REWARD: upper end of the reward range.
        MIN_REWARD: lower end of the reward range.
        kernel_fn: callable taking two scores and returning a number that is
            subtracted from the reward range (presumably a distance, so that a
            larger value means less similar - confirm with callers).

    Returns:
        ``(n, n)`` float array with
        ``similarity[i][j] = (MAX_REWARD - MIN_REWARD) - kernel_fn(score(x[i]), score(y[j]))``.

    Raises:
        IndexError: if a label in ``x``/``y`` is not present in ``actions``, or
            ``y`` has fewer than ``n`` entries.
    """
    n_points = x.shape[0]
    similarity = np.zeros((n_points, n_points))

    # Resolve each label to its position in `actions` once up front. The column
    # lookups do not depend on the row, so there is no need to repeat them for
    # every row of the matrix.
    row_positions = [np.where(actions == x[i])[0][0] for i in range(n_points)]
    col_positions = [np.where(actions == y[j])[0][0] for j in range(n_points)]

    # Populate each element of the similarity matrix with the appropriate value.
    for i, action1 in enumerate(row_positions):
        for j, action2 in enumerate(col_positions):
            similarity[i][j] = (MAX_REWARD - MIN_REWARD) - kernel_fn(scores[action1], scores[action2])

    return similarity


def get_starting_data(actions, default_value=0):
    """Create an initial dataset in which every action appears exactly once.

    The model must have seen all possible actions in order to be able to choose
    them; this is only needed for the SVR model.

    Args:
        actions: NumPy array of action labels.
        default_value: placeholder score assigned to every action.

    Returns:
        (action_data, score_data): ``action_data`` is the actions in a random
        order as a column of shape ``(n, 1)`` (a single feature);
        ``score_data`` is a list of ``n`` copies of ``default_value``.
    """
    n_actions = actions.shape[0]

    # Sampling all n positions without replacement gives a random ordering.
    shuffled_positions = np.random.choice(n_actions, n_actions, replace=False)
    action_data = actions[shuffled_positions].reshape(-1, 1)

    score_data = [default_value for _ in range(len(action_data))]
    return action_data, score_data


def run_experiment(actions, action_data, score_data, morality_scores, optimal_actions, model, n_allowed_actions,
                   allowed_actions=None, allowed_actions_list=None, binary_rewards=False, filename=None,
                   immoral_threshold=0, max_iterations=1000, history_filename=None, spearman=None, test=False,
                   n_consecutive_required=5):
    """Run one bandit episode until the model acts optimally several times in a row.

    Each iteration: decide which actions are allowed, ask the model to pick one,
    sample a noisy reward from the environment, append the observation to the
    training data and (unless ``test`` is set) refit the model.

    Args:
        actions: NumPy array of all action labels.
        action_data: observed actions so far; extended with ``np.append``.
            NOTE(review): ``np.append`` without an axis flattens its input, so a
            ``(n, 1)`` array becomes 1-D after the first iteration - confirm the
            model's ``fit`` expects that.
        score_data: observed rewards so far, parallel to ``action_data``.
        morality_scores: true score of each action, parallel to ``actions``.
        optimal_actions: set of actions counted as optimal.
        model: object exposing ``select_action(allowed)`` and
            ``fit(action_data, score_data)``.
        n_allowed_actions: size of the randomly drawn allowed set (used only when
            neither ``allowed_actions`` nor ``allowed_actions_list`` is given).
        allowed_actions: optional path to a JSON file mapping the iteration
            number (as a string) to the list of actions allowed at that iteration.
        allowed_actions_list: optional fixed collection of allowed actions, used
            for every iteration when non-empty.
        binary_rewards: if True, the reward is 1 with probability
            ``sigmoid(score)`` and 0 otherwise, instead of a Normal sample.
        filename: optional path of a text summary to write.
        immoral_threshold: actions whose true score is below this are counted as
            negative.
        max_iterations: hard cap on the number of iterations.
        history_filename: optional path of a CSV with the action/reward history.
        spearman: optional value stored in a ``spearman`` column of that CSV.
        test: if True, the model is never refit.
        n_consecutive_required: number of consecutive optimal choices that
            counts as having identified the optimal action.

    Returns:
        dict with keys ``mean_reward``, ``unique_actions_taken``,
        ``non_optimal_actions_taken``, ``negative_actions_taken`` and
        ``iterations_to_convergence``. ``mean_reward`` is NaN if no iteration ran.
    """
    # Avoid a mutable default argument; None means "no fixed list supplied".
    if allowed_actions_list is None:
        allowed_actions_list = []

    # The schedule file does not change during a run, so read it once instead of
    # re-opening and re-parsing it on every iteration.
    use_schedule = allowed_actions is not None
    allowed_schedule = None
    if use_schedule:
        with open(allowed_actions, 'r') as f:
            allowed_schedule = json.loads(f.read())

    # We consider the optimal action to be identified when the model CONSISTENTLY acts optimally,
    # i.e. it has acted optimally in the past `n_consecutive_required` consecutive actions.
    n_consecutive_optimal, iterations, n_non_optimal, n_negative, total_reward = 0, 0, 0, 0, 0
    taken_actions = set()

    # Store action history for this run.
    action_history, reward_history = [], []

    while n_consecutive_optimal < n_consecutive_required and iterations < max_iterations:
        if use_schedule:
            allowed = set(allowed_schedule[str(iterations)])
        elif len(allowed_actions_list) > 0:
            allowed = set(allowed_actions_list)
        else:
            # Always include one optimal action, then fill up with distinct random others.
            allowed = {random.sample(sorted(optimal_actions), 1)[0]}
            for _ in range(n_allowed_actions - 1):
                # Candidates are the actions not yet in the allowed set. (This previously
                # subtracted the `allowed_actions` argument, which is always None in this
                # branch and therefore raised a TypeError.)
                remaining = sorted(set(actions) - allowed)
                if not remaining:
                    # Every action is already allowed; nothing more to add.
                    break
                allowed.add(random.sample(remaining, 1)[0])

        # Choose action the model predicts to have best reward.
        action = model.select_action(allowed)
        taken_actions.add(action)

        if action not in optimal_actions:
            n_non_optimal += 1

        # Environment: Get true reward for action chosen and append to data observations.
        action_index = np.where(actions == action)[0][0]
        morality = morality_scores[action_index]
        if morality < immoral_threshold:
            n_negative += 1

        # Compute actual reward by sampling from a Normal distribution centered at the morality score.
        # The Normal draw is made even for binary rewards so the random stream stays unchanged.
        actual_reward = np.random.normal(morality, ENVIRONMENT_REWARD_STD)
        if binary_rewards:
            # Compute actual reward by sampling from a sigmoid.
            p_success = sigmoid(morality)
            actual_reward = int(np.random.rand() <= p_success)  # returns 1 if expression evaluates to true

        action_data = np.append(action_data, [action])
        score_data = np.append(score_data, [actual_reward])
        # Update the rewards in the loop
        total_reward += actual_reward

        # Update number of consecutive optimal actions.
        if action in optimal_actions:
            n_consecutive_optimal += 1
        else:
            n_consecutive_optimal = 0

        iterations += 1

        if not test:
            # Update model
            model.fit(action_data, score_data)

        # Add to history
        action_history.append(action)
        reward_history.append(actual_reward)

    # Write out experiment overview data to txt file
    if filename is not None:
        with open(filename, 'w') as f:
            for i in range(len(actions)):
                f.write('Action: ' + str(i) + ' Morality Score: ' + str(morality_scores[i]) + '\n')

            f.write('\nOptimal actions: ' + str(optimal_actions) + '\n')
            f.write('Optimal expected reward: ' + str(sigmoid(np.max(morality_scores))) + '\n')
            f.write('Number of trials to learn environment: ' + str(iterations) + '\n')

    if history_filename is not None:
        # Store a DataFrame of action history.
        results = pd.DataFrame()
        results['action'] = action_history
        results['reward'] = reward_history
        if spearman is not None:
            results['spearman'] = spearman
        results.to_csv(history_filename)

    # Return relevant metrics. Guard the mean against a run with zero iterations
    # (e.g. max_iterations <= 0), which would otherwise divide by zero.
    mean_reward = total_reward / iterations if iterations > 0 else float('nan')
    metrics = {'mean_reward': mean_reward,
               'unique_actions_taken': len(taken_actions),
               'non_optimal_actions_taken': n_non_optimal,
               'negative_actions_taken': n_negative,
               'iterations_to_convergence': iterations}

    return metrics


