# -*- coding: utf-8 -*-
import pdb
import numpy as np

class Mlp():
    '''
    fully-connected Multi-Layer Perceptron (MLP)
    '''

    def __init__(self, size_layers, type_loss, act_funct='sigmoid',
                 learning_rate=0.01, alpha = 0.82, reg_lambda=0, bias_flag=True,
                 batch_size=None, seed=47):
        '''
        Store the configuration of the MLP and draw its initial weights.

        Arguments:
            size_layers   : List with the number of Units for:
                [Input, Hidden1, Hidden2, ... HiddenN, Output] Layers.
            type_loss     : Name of the loss; backpropagation handles
                'lyapunov', 'l1' and 'l2'
            act_funct     : Activation function of the hidden Units
                ('sigmoid', 'relu' or 'custom'); the output layer always
                uses the sigmoid
                default = 'sigmoid'
            learning_rate : Factor applied to the gradients in train
                default = 0.01
            alpha         : Exponent used by the 'lyapunov' loss
                default = 0.82
            reg_lambda    : Value of the regularization parameter Lambda
                default = 0, i.e. no regularization
            bias_flag     : If set, a bias element is prepended to the input
                of every layer (the output layer has no outgoing weights)
                default = True
            batch_size    : Number of examples per weight update in train
                default = None, i.e. all examples in a single update
            seed          : Value handed to np.random.seed before the
                weights are drawn
                default = 47
        '''
        # Architecture
        self.size_layers = size_layers
        self.n_layers    = len(size_layers)
        self.act_f       = act_funct
        self.bias_flag   = bias_flag

        # Loss and optimisation settings
        self.type_loss     = type_loss
        self.alpha         = alpha
        self.lambda_r      = reg_lambda
        self.learning_rate = learning_rate
        self.batch_size    = batch_size

        # The seed must be stored before the weights are drawn below
        self.seed = seed
        self.initialize_theta_weights()

    def train(self, X, Y, reset=False):
        '''
        Given X (feature matrix) and y (class vector)
        Updates the Theta Weights by running Backpropagation N tines
        Arguments:
            X          : Feature matrix [n_examples, n_features]
            Y          : Sparse class matrix [n_examples, classes]
            iterations : Number of times Backpropagation is performed
                default = 400
            reset      : If set, initialize Theta Weights before training
                default = False
        '''
        n_examples = Y.shape[0]
        if reset:
            self.initialize_theta_weights()
        if self.batch_size is not None:
            start_idx = 0
            end_idx   = self.batch_size
            iterations = int(n_examples / self.batch_size)
        else:
            start_idx = 0
            end_idx   = n_examples
            iterations = 1

        for iteration in range(iterations):
            self.gradients = self.backpropagation(X[start_idx:end_idx, :], Y[start_idx:end_idx])
            self.gradients_vector = self.unroll_weights(self.gradients)
            self.theta_vector = self.unroll_weights(self.theta_weights)
            self.theta_vector = self.theta_vector - self.learning_rate*self.gradients_vector
            self.theta_weights = self.roll_weights(self.theta_vector)
            if self.batch_size is not None:
                start_idx = end_idx
                if (end_idx + self.batch_size < n_examples):
                    end_idx = end_idx + self.batch_size
                else:
                    end_idx = n_examples

    def predict(self, X):
        '''
        Given X (feature matrix), y_hat is computed
        Arguments:
            X      : Feature matrix [n_examples, n_features]
        Output:
            y_hat  : Computed Vector Class for X
        '''
        A , Z = self.feedforward(X)
        Y_hat = A[-1]
        return Y_hat

    def initialize_theta_weights(self):
        '''
        Initialize theta_weights, initialization method depends
        on the Activation Function and the Number of Units in the current layer
        and the next layer.
        The weights for each layer as of the size [next_layer, current_layer + 1]
        '''
        np.random.seed(self.seed)
        self.theta_weights = []
        size_next_layers = self.size_layers.copy()
        size_next_layers.pop(0)
        for size_layer, size_next_layer in zip(self.size_layers, size_next_layers):
            if self.act_f == 'sigmoid':
                # Method presented "Understanding the difficulty of training deep feedforward neural networks"
                # Xavier Glorot and Youshua Bengio, 2010
                epsilon = 4.0 * np.sqrt(6) / np.sqrt(size_layer + size_next_layer)
                # Weigts from a uniform distribution [-epsilon, epsion]
                if self.bias_flag:
                    theta_tmp = epsilon * ( (np.random.rand(size_next_layer, size_layer + 1) * 2.0 ) - 1)
                else:
                    theta_tmp = epsilon * ( (np.random.rand(size_next_layer, size_layer) * 2.0 ) - 1)
            elif self.act_f == 'relu' or self.act_f=='custom':
                # Method presented in "Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classfication"
                # He et Al. 2015
                epsilon = np.sqrt(2.0 / (size_layer * size_next_layer) )
                # Weigts from Normal distribution mean = 0, std = epsion
                if self.bias_flag:
                    theta_tmp = epsilon * (np.random.randn(size_next_layer, size_layer + 1 ))
                else:
                    theta_tmp = epsilon * (np.random.randn(size_next_layer, size_layer))
            self.theta_weights.append(theta_tmp)
        return self.theta_weights

    def backpropagation(self, X, Y):
        '''
        Implementation of the Backpropagation algorithm with regularization
        '''
        if self.act_f == 'sigmoid':
            g_dz = lambda x: self.sigmoid_derivative(x)
        elif self.act_f == 'relu':
            g_dz = lambda x: self.relu_derivative(x)
        elif self.act_f == 'custom':
            g_dz = lambda x: self.custom_derivative(x)

        n_examples = X.shape[0]
        # Feedforward
        A, Z = self.feedforward(X)

        # Backpropagation
        deltas = [None] * self.n_layers
        if self.type_loss == 'lyapunov':
            loss_derivative = np.multiply(
                np.power(np.abs(np.squeeze(A[-1]) - np.squeeze(Y)), self.alpha),
                np.sign(np.squeeze(A[-1]) - np.squeeze(Y)))
        #loss_derivative = np.power(np.abs(A[-1] - Y), self.alpha) * np.sign(A[-1] - Y)
            deltas[-1] = \
                np.multiply(loss_derivative,
                            self.sigmoid_derivative(np.squeeze(A[-1])))
            deltas[-1] = np.expand_dims(deltas[-1], axis=1)
        elif self.type_loss == 'l1':
            sigmoid_derivative = np.squeeze(self.sigmoid_derivative(A[-1]))
            deltas[-1] = np.multiply(np.sign(np.squeeze(A[-1]) - np.squeeze(Y)),
                                     sigmoid_derivative)
            deltas[-1] = np.expand_dims(deltas[-1], axis=1)
        elif self.type_loss == 'l2':
            sigmoid_derivative = np.squeeze(self.sigmoid_derivative(A[-1]))
            deltas[-1] = ((np.squeeze(A[-1]) - np.squeeze(Y)) *
                          sigmoid_derivative)
            deltas[-1] = np.expand_dims(deltas[-1], axis=1)

        # For the second last layer to the second one
        for ix_layer in np.arange(self.n_layers - 1 - 1 , 0 , -1):
            theta_tmp = self.theta_weights[ix_layer]
            if self.bias_flag:
                # Removing weights for bias
                theta_tmp = np.delete(theta_tmp, np.s_[0], 1)
            deltas[ix_layer] = ((np.matmul(theta_tmp.transpose(),
                                deltas[ix_layer + 1].transpose())).transpose() *
                                g_dz(Z[ix_layer]))

        if (self.type_loss == 'lyapunov'):
            E = np.power(np.abs(A[-1] - Y), self.alpha+1).mean()/(self.alpha+1)
            E_beta = np.power(E, 0.04)
        # Compute gradientstr
        gradients = [None] * (self.n_layers - 1)
        for ix_layer in range(self.n_layers - 1):
            a_tmp = A[ix_layer]
            grads_tmp = np.matmul(deltas[ix_layer + 1].transpose() , a_tmp)
            grads_tmp = grads_tmp / n_examples
            if self.bias_flag:
                # Regularize weights, except for bias weights
                grads_tmp[:, 1:] = grads_tmp[:, 1:] + (self.lambda_r / n_examples) * self.theta_weights[ix_layer][:,1:]
            else:
                # Regularize ALL weights
                grads_tmp = grads_tmp + (self.lambda_r / n_examples) * self.theta_weights[ix_layer]
            if self.type_loss == 'lyapunov':
                gradients[ix_layer] = np.multiply(np.power(np.abs(grads_tmp),
                                                  self.alpha),
                                                  np.sign(grads_tmp)) * E_beta
            else:
                gradients[ix_layer] = grads_tmp
        return gradients

    def feedforward(self, X):
        '''
        Implementation of the Feedforward
        '''
        if self.act_f == 'sigmoid':
            g = lambda x: self.sigmoid(x)
        elif self.act_f == 'relu':
            g = lambda x: self.relu(x)
        elif self.act_f == 'custom':
            g = lambda x: self.custom(x)

        A = [None] * self.n_layers
        Z = [None] * self.n_layers
        input_layer = X

        for ix_layer in range(self.n_layers - 1):
            n_examples = input_layer.shape[0]
            if self.bias_flag:
                # Add bias element to every example in input_layer
                input_layer = np.concatenate((np.ones([n_examples ,1]) ,input_layer), axis=1)
            A[ix_layer] = input_layer
            # Multiplying input_layer by theta_weights for this layer
            Z[ix_layer + 1] = np.matmul(input_layer,  self.theta_weights[ix_layer].transpose() )
            # Activation Function
            if (ix_layer == self.n_layers-2):
                output_layer = self.sigmoid(Z[ix_layer + 1])
            else:
                output_layer = g(Z[ix_layer + 1])
            # Current output_layer will be next input_layer
            input_layer = output_layer

        A[self.n_layers - 1] = output_layer
        return A, Z


    def unroll_weights(self, rolled_data):
        '''
        Unroll a list of matrices to a single vector
        Each matrix represents the Weights (or Gradients) from one layer to the next
        '''
        unrolled_array = np.array([])
        for one_layer in rolled_data:
            unrolled_array = np.concatenate((unrolled_array, one_layer.flatten(1)) )
        return unrolled_array

    def roll_weights(self, unrolled_data):
        '''
        Unrolls a single vector to a list of matrices
        Each matrix represents the Weights (or Gradients) from one layer to the next
        '''
        size_next_layers = self.size_layers.copy()
        size_next_layers.pop(0)
        rolled_list = []
        if self.bias_flag:
            extra_item = 1
        else:
            extra_item = 0
        for size_layer, size_next_layer in zip(self.size_layers, size_next_layers):
            n_weights = (size_next_layer * (size_layer + extra_item))
            data_tmp = unrolled_data[0 : n_weights]
            data_tmp = data_tmp.reshape(size_next_layer, (size_layer + extra_item), order = 'F')
            rolled_list.append(data_tmp)
            unrolled_data = np.delete(unrolled_data, np.s_[0:n_weights])
        return rolled_list

    def sigmoid(self, z):
        '''
        Sigmoid function
        z can be an numpy array or scalar
        '''
        epsilon = 0.0001
        result = 1.0 / (1.0 + np.exp(-z + epsilon))
        return result

    def relu(self, z):
        '''
        Rectified Linear function
        z can be an numpy array or scalar
        '''
        if np.isscalar(z):
            result = np.max((z, 0))
        else:
            zero_aux = np.zeros(z.shape)
            meta_z = np.stack((z , zero_aux), axis = -1)
            result = np.max(meta_z, axis = -1)
        return result


    def sigmoid_derivative(self, z):
        '''
        Derivative for Sigmoid function
        z can be an numpy array or scalar
        '''
        result = self.sigmoid(z) * (1 - self.sigmoid(z))
        return result

    def relu_derivative(self, z):
        '''
        Derivative for Rectified Linear function
        z can be an numpy array or scalar
        '''
        result = 1 * (z > 0)
        return result
