import pandas as pd
import copy
import numpy as np
import math
import os
import time
import tqdm
import sys
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

os.environ["OMP_NUM_THREADS"] = "1"  # OpenMP
os.environ["MKL_NUM_THREADS"] = "1"  # Intel Math Kernel Library
os.environ["NUMEXPR_NUM_THREADS"] = "1"  # NumExpr
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # OpenBLAS
import random
import ast
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


# Device selection: set USE_GPU to True to train on the single GPU at
# `device_idx`; otherwise every CUDA device is hidden from TensorFlow.
USE_GPU = False
if USE_GPU:
    device_idx = 0
    gpus = tf.config.list_physical_devices('GPU')
    if device_idx >= len(gpus):
        raise RuntimeError(
            f'USE_GPU is set but GPU index {device_idx} is not available '
            f'({len(gpus)} GPU(s) detected)')
    gpu_device = gpus[device_idx]
    # set_visible_devices() returns None, so there is no config object to hand
    # to a v1 Session; restricting visibility and enabling memory growth is all
    # that is required.
    tf.config.experimental.set_visible_devices(gpu_device, 'GPU')
    tf.config.experimental.set_memory_growth(gpu_device, True)
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


def repeat_Samples(samples_to_be_repeated, X_train, y_train):
    """Pad a training set by appending randomly re-drawn samples.

    ``samples_to_be_repeated`` row indices are drawn with ``np.random.choice``
    from ``range(X_train.shape[0])``; the same indices are used for inputs and
    labels so every appended row keeps its own label.

    Returns:
        The enlarged ``(X_train, y_train)`` pair; the original rows come first.
    """
    n_available = X_train.shape[0]
    picked = np.random.choice(n_available, samples_to_be_repeated)

    extra_inputs = X_train[picked]
    extra_labels = y_train[picked]

    padded_inputs = np.concatenate((X_train, extra_inputs), axis=0)
    padded_labels = np.concatenate((y_train, extra_labels), axis=0)
    return padded_inputs, padded_labels


def data_preparation(tasks_list):
    """Load the train/test splits of every task and equalise training-set sizes.

    For each task id the files ``{task}_X_train.npy``, ``{task}_y_train.npy``,
    ``{task}_X_test.npy`` and ``{task}_y_test.npy`` are read from the
    module-level ``DataPath`` directory.

    When more than one task is requested, every training set shorter than the
    target size is padded with randomly repeated samples (see
    ``repeat_Samples``). The target size is the largest training-set length,
    bumped by one if that length is even, so in that case even the largest
    task receives one repeated sample.
    NOTE(review): the reason for forcing an odd size is not evident from this
    file - confirm before changing it.

    Args:
        tasks_list: iterable of task identifiers used in the file names.

    Returns:
        dict mapping ``School_{task}_X_train`` / ``_y_train`` / ``_X_test`` /
        ``_y_test`` to the corresponding arrays.
    """
    balance_sizes = len(tasks_list) > 1

    # Read every training matrix exactly once; it is needed both for the size
    # scan below and for the returned dictionary.
    X_train_by_task = {
        task_id: np.load(f'{DataPath}/{task_id}_X_train.npy') for task_id in tasks_list
    }

    if balance_sizes:
        max_size = max(X_train.shape[0] for X_train in X_train_by_task.values())
        if max_size % 2 == 0:
            max_size += 1

        print(f'max size = {max_size}')

    data_param_dict_for_specific_task = {}
    for task_id in tasks_list:
        X_train = X_train_by_task[task_id]
        y_train = np.load(f'{DataPath}/{task_id}_y_train.npy')
        X_test = np.load(f'{DataPath}/{task_id}_X_test.npy')
        y_test = np.load(f'{DataPath}/{task_id}_y_test.npy')

        if balance_sizes:
            samples_to_be_repeated = max_size - len(X_train)
            if samples_to_be_repeated > 0:
                X_train, y_train = repeat_Samples(samples_to_be_repeated, X_train, y_train)

        data_param_dict_for_specific_task[f'School_{task_id}_X_train'] = X_train
        data_param_dict_for_specific_task[f'School_{task_id}_y_train'] = y_train
        data_param_dict_for_specific_task[f'School_{task_id}_X_test'] = X_test
        data_param_dict_for_specific_task[f'School_{task_id}_y_test'] = y_test

    return data_param_dict_for_specific_task


def decay_lr(step, optimizer):
    """Halve ``optimizer.lr`` whenever ``step + 1`` is a multiple of 75.

    On every other step the optimizer is left untouched. Nothing is returned.
    """
    halving_period = 75
    if (step + 1) % halving_period != 0:
        return
    optimizer.lr = optimizer.lr / 2.



def final_model(shared_hyperparameters, tasks_list, data_param_dict_for_specific_task, val=False):
    """Train a shared-encoder multi-task regressor and score it on the test sets.

    One ``SharedEncoder`` (a stack of ReLU ``Dense`` layers) feeds one
    ``TaskDecoder`` (a single tanh unit) per task. All tasks are trained
    jointly with SGD + momentum on the sum of the per-task mean squared
    errors. Mini-batches are cut with the same index range for every task,
    the range being driven by the length of the first task's training set.

    Early stopping (patience 20) monitors the summed validation loss when
    ``val`` is true, otherwise the training loss of the last mini-batch of the
    epoch. The weights of the best epoch are restored before the final
    evaluation. From epoch 101 onwards ``decay_lr`` is invoked once per epoch.

    When the task group covers all of ``TASKS``, the trained weights are saved
    under ``{datasetName}_model_weights/run_{run}`` and, for every mini-batch
    and task, the gradient of that task's loss with respect to the shared
    encoder is randomly projected to 200 dimensions and stored under
    ``{datasetName}_gradients_run_{run}``.

    Besides its arguments the function reads the module-level names
    ``num_epochs``, ``batch_size``, ``TASKS``, ``datasetName`` and ``run``.

    Args:
        shared_hyperparameters: dict with ``shared_FF_Layers`` (number of
            shared layers), ``shared_FF_Neurons`` (units per shared layer) and
            ``learning_rate``.
        tasks_list: task identifiers to train together.
        data_param_dict_for_specific_task: dict holding, per task, the keys
            ``School_{task}_X_train``, ``_y_train``, ``_X_test``, ``_y_test``.
        val: if true, 10% of each training set is held out (fixed seed 42)
            and used as the early-stopping criterion.

    Returns:
        ``(test_loss, indiv_losses, time_taken)``: the summed test loss, the
        list of per-task test losses (in ``tasks_list`` order) and the wall
        clock seconds spent in the training loop.

    Raises:
        ValueError: if ``tasks_list`` is empty, the first task has no training
            samples, or a flattened gradient does not match the projection
            matrix.
    """
    if not tasks_list:
        raise ValueError('tasks_list must contain at least one task')

    max_patience = 20

    train_data, train_label = [], []
    test_data, test_label = [], []
    val_data, val_label = [], []

    for sch_id in tasks_list:
        X_train = data_param_dict_for_specific_task[f'School_{sch_id}_X_train']
        y_train = data_param_dict_for_specific_task[f'School_{sch_id}_y_train']

        if val:
            # Hold out a small validation split from the training data.
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.1, random_state=42)
            val_data.append(X_val)
            val_label.append(y_val)

        train_data.append(X_train)
        train_label.append(y_train)
        test_data.append(data_param_dict_for_specific_task[f'School_{sch_id}_X_test'])
        test_label.append(data_param_dict_for_specific_task[f'School_{sch_id}_y_test'])

    train_size = len(train_data[0])
    if train_size == 0:
        raise ValueError('the first task has no training samples')

    # NOTE: class and attribute names are part of the saved checkpoint layout;
    # keep them stable so previously saved weights remain loadable.
    class SharedEncoder(tf.keras.Model):
        def __init__(self):
            super(SharedEncoder, self).__init__()
            self.shared_layers = []
            for h in range(shared_hyperparameters['shared_FF_Layers']):
                self.shared_layers.append(Dense(shared_hyperparameters['shared_FF_Neurons'][h], activation='relu'))

        def call(self, inputs):
            x = inputs
            for layer in self.shared_layers:
                x = layer(x)
            return x

    class TaskDecoder(tf.keras.Model):
        def __init__(self):
            super(TaskDecoder, self).__init__()
            self.output_layer = Dense(1, activation='tanh')

        def call(self, shared_representation):
            return self.output_layer(shared_representation)

    shared_encoder = SharedEncoder()
    task_decoders = {sch_id: TaskDecoder() for sch_id in tasks_list}

    init_lr = shared_hyperparameters['learning_rate']
    optimizer = tf.keras.optimizers.SGD(init_lr, momentum=0.9, nesterov=False)
    loss_fn = tf.keras.losses.MeanSquaredError()

    def forward(x_batches, training):
        """Return the per-task predictions for one list of per-task inputs."""
        shared_representations = [shared_encoder(input_data, training=training) for input_data in x_batches]
        return [task_decoders[sch_id](shared_rep, training=training)
                for sch_id, shared_rep in zip(tasks_list, shared_representations)]

    @tf.function
    def train_step(x_batch_train, y_batch_train):
        with tf.GradientTape() as tape:
            predictions = forward(x_batch_train, training=True)
            losses = [loss_fn(y_true, y_pred) for y_true, y_pred in zip(y_batch_train, predictions)]
            tot_loss = tf.reduce_sum(losses)

        # Collected after the forward pass so the layers are already built.
        variables = list(shared_encoder.trainable_variables)
        for decoder in task_decoders.values():
            variables.extend(decoder.trainable_variables)

        gradients = tape.gradient(tot_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        return tot_loss

    @tf.function
    def test_step(x_batch_test, y_batch_test):
        predictions = forward(x_batch_test, training=False)
        eval_losses = [loss_fn(y_true, y_pred) for y_true, y_pred in zip(y_batch_test, predictions)]
        eval_loss = tf.reduce_sum(eval_losses)
        return eval_loss, eval_losses, predictions

    def snapshot_weights():
        """Copy the current encoder/decoder weights into detached tensors."""
        shared = [tf.identity(weight) for weight in shared_encoder.trainable_weights]
        decoders = {sch_id: [tf.identity(weight) for weight in decoder.trainable_weights]
                    for sch_id, decoder in task_decoders.items()}
        return shared, decoders

    def save_gradients(project_matrix, gradients_dir):
        """Store, per task, the projected shared-encoder gradient of every batch."""
        task_gradients = {task: [] for task in tasks_list}

        for batch_idx in range(0, train_size, batch_size):
            x_batch_train = [data[batch_idx:batch_idx + batch_size] for data in train_data]
            y_batch_train = [label[batch_idx:batch_idx + batch_size] for label in train_label]

            with tf.GradientTape(persistent=True) as tape:
                predictions = forward(x_batch_train, training=False)
                losses = [loss_fn(y_true, y_pred) for y_true, y_pred in zip(y_batch_train, predictions)]

            # Gradients are taken outside the tape context so the gradient
            # computation itself is not recorded by the persistent tape.
            for task, task_loss in zip(tasks_list, losses):
                grads = tape.gradient(task_loss, shared_encoder.trainable_weights)
                # Flatten and concatenate the per-variable gradients.
                flat_gradient = tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0).numpy()

                if flat_gradient.size != project_matrix.shape[0]:
                    raise ValueError(
                        f"Gradient size {flat_gradient.size} does not match expected size {project_matrix.shape[0]}")

                projected = (flat_gradient.reshape(1, -1) @ project_matrix).flatten()
                task_gradients[task].append(projected)

            del tape  # release the persistent tape's resources for this batch

        for task_name, gradients in task_gradients.items():
            np.save(f"{gradients_dir}/{task_name}_train_gradients.npy", gradients)

    patience = max_patience
    min_loss_to_consider = math.inf
    best_shared_weights = None
    best_decoder_weights = None
    epoch = -1  # keeps the final report well-defined when num_epochs == 0

    timeStart = time.time()
    print(f'total batches {train_size / batch_size}')
    for epoch in range(num_epochs):
        if epoch > 100:
            decay_lr(epoch, optimizer)

        for batch_idx in range(0, train_size, batch_size):
            x_batch_train = [data[batch_idx:batch_idx + batch_size] for data in train_data]
            y_batch_train = [label[batch_idx:batch_idx + batch_size] for label in train_label]
            train_loss = train_step(x_batch_train, y_batch_train)

        if val:
            val_loss, _, _ = test_step(val_data, val_label)
            loss_to_consider = val_loss
        else:
            loss_to_consider = train_loss
        if epoch % 20 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, loss = {train_loss.numpy()}, Patience = {patience}')

        current_loss = loss_to_consider.numpy()
        if current_loss < min_loss_to_consider:
            min_loss_to_consider = current_loss
            patience = max_patience
            # Snapshot only on improvement; the variables hold exactly the
            # state that produced `current_loss`.
            best_shared_weights, best_decoder_weights = snapshot_weights()
        else:
            patience -= 1
            if patience == 0:
                print(f'Stopping Training at Epoch {epoch + 1}')
                break

    time_taken = time.time() - timeStart

    if best_shared_weights is None:
        # Happens when the monitored loss was never finite/improving (e.g. NaN)
        # or when no epoch was run: there is nothing to restore.
        print('WARNING: no epoch improved the monitored loss; keeping the current weights')
    else:
        for best_weight, curr_weight in zip(best_shared_weights, shared_encoder.trainable_weights):
            curr_weight.assign(best_weight)

        for sch_id, decoder_specific_weights in best_decoder_weights.items():
            for best_weight, curr_weight in zip(decoder_specific_weights, task_decoders[sch_id].trainable_weights):
                curr_weight.assign(best_weight)

    print(f'stopping training at epoch {epoch + 1}')

    if len(tasks_list) == len(TASKS):
        # Persist the best weights of the all-task model.
        model_dir = f'{datasetName}_model_weights/run_{run}'
        gradients_dir = f'{datasetName}_gradients_run_{run}'
        os.makedirs(model_dir, exist_ok=True)
        os.makedirs(gradients_dir, exist_ok=True)

        shared_encoder.save_weights(f'{model_dir}/shared_encoder')
        for molecule, decoder in task_decoders.items():
            decoder.save_weights(f'{model_dir}/decoder_{molecule}')

        grad_params = list(shared_encoder.trainable_weights)
        print(f'len(grad_params): {len(grad_params)}', end=' ')
        gradient_dim = sum(param.numpy().size for param in grad_params)
        print("Gradient Dim: {}".format(gradient_dim), end=' ')

        # Random +/-1 projection scaled by 1/sqrt(project_dim).
        project_dim = 200
        project_matrix = (2 * np.random.randint(2, size=(gradient_dim, project_dim)) - 1).astype(float)
        project_matrix *= 1 / np.sqrt(project_dim)
        print("Project Dim: {}".format(project_dim))

        save_gradients(project_matrix, gradients_dir)

    test_loss, indiv_losses, _ = test_step(test_data, test_label)
    indiv_losses = [each_loss.numpy() for each_loss in indiv_losses]

    print(f'test_loss = {test_loss}')

    return test_loss.numpy(), indiv_losses, time_taken


if __name__ == "__main__":
    # NOTE: data_preparation() and final_model() read several of the names
    # assigned below (DataPath, TASKS, datasetName, num_epochs, batch_size,
    # run) as module globals, so they must stay at module level.
    datasetName = 'School'

    Method_name = 'SimpleMTL'
    group_len = 'ALL'  # 1 for STL, 2 for Pairs

    ResultPath = '../RESULTS/GROUPS_MTL/'
    # Create the output directory up front so a finished training run cannot
    # fail at the very end while writing its CSV.
    os.makedirs(ResultPath, exist_ok=True)

    DataPath = f'../Dataset/{datasetName.upper()}/'
    SchoolData = pd.read_csv(f'{DataPath}Task_Information_School.csv', low_memory=False)
    # From here on DataPath points at the per-task .npy splits.
    DataPath = f'../Dataset/{datasetName.upper()}/Task_Splits/'
    TASKS = [str(task) for task in SchoolData['Task_Name']]

    # Currently unused in this file; kept as run configuration.
    num_folds = 10

    Arch_Name = 'Arch_1'
    if Arch_Name == 'Arch_1':
        num_epochs = 1000
        batch_size = 64
        MAX_PATIENCE = 20  # not read in this file; final_model uses its own patience of 20
        initial_shared_architecture = {'shared_FF_Layers': 3,
                                       'shared_FF_Neurons': [20, 10, 32],
                                       'learning_rate': 0.005}
    else:
        raise ValueError(f'Unknown architecture: {Arch_Name!r}')

    if group_len == 'ALL':
        # A single group containing every task.
        TASK_Group = [tuple(TASKS)]
        name_suffix = 'ALL'
    else:
        raise ValueError(f'Unsupported group_len: {group_len!r}')

    RUNS = [1, 2, 3, 4, 5, 6]
    for run in RUNS:
        # The run number doubles as the seed for every random generator in use.
        seed_value = run
        tf.random.set_seed(seed_value)
        np.random.seed(seed_value)
        random.seed(seed_value)

        Task_group = []
        Total_Loss = []
        Individual_Task_Score = []
        Time_taken_for_training = []
        # Never populated; kept only so the progress line below prints the same
        # four counters as before.
        Individual_Error_Rate = []

        print(f'Total Groups : {len(TASK_Group)}')
        for count, task_group in enumerate(TASK_Group):
            print(f'Initial Training for {datasetName}-partition {count}, {task_group}')

            data_param_dict_for_specific_task = data_preparation(task_group)
            tot_loss, indi_scores, Total_time = final_model(
                initial_shared_architecture, task_group, data_param_dict_for_specific_task, val=True)

            print(f'tot_loss = {tot_loss}')
            task_scores = dict(zip(task_group, indi_scores))

            Task_group.append(task_group)
            Total_Loss.append(tot_loss)
            Individual_Task_Score.append(task_scores)
            Time_taken_for_training.append(Total_time / 60)  # seconds -> minutes

            print(len(Total_Loss), len(Task_group), len(Individual_Task_Score), len(Individual_Error_Rate))

            # Rewritten after every group so partial results survive a crash.
            temp_res = pd.DataFrame({'Total_Loss': Total_Loss,
                                     'Task_group': Task_group,
                                     'Individual_Task_Score': Individual_Task_Score,
                                     'Time_taken_for_training': Time_taken_for_training,
                                     })
            temp_res.to_csv(f'{ResultPath}/{datasetName}_{name_suffix}_run_{run}_SGD_Arch_{Arch_Name}_BASELINE_GRADTAE.csv',index=False)



