import pandas as pd
import copy
import numpy as np
import math
import os
import time

# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
#
os.environ["OMP_NUM_THREADS"] = "1"  # OpenMP
os.environ["MKL_NUM_THREADS"] = "1"  # Intel Math Kernel Library
os.environ["NUMEXPR_NUM_THREADS"] = "1"  # NumExpr
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # OpenBLAS
import random
import ast
from sklearn.metrics import average_precision_score
import itertools
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# print(f'version = {tf.__version__}')

# Device selection for TensorFlow: flip USE_GPU to train on a single GPU.
USE_GPU = False
if USE_GPU:
    device_idx = 0
    gpus = tf.config.list_physical_devices('GPU')
    if device_idx >= len(gpus):
        # Fail with an explicit message instead of a bare IndexError on gpus[device_idx].
        raise RuntimeError(
            f'USE_GPU is set but GPU index {device_idx} is not available ({len(gpus)} GPU(s) detected)')
    gpu_device = gpus[device_idx]
    # set_visible_devices() returns nothing, so its result must not be used as a session config.
    tf.config.experimental.set_visible_devices(gpu_device, 'GPU')
    tf.config.experimental.set_memory_growth(gpu_device, True)
    tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session())
else:
    # Intended to hide every CUDA device so that TensorFlow runs on the CPU.
    # NOTE(review): this is set after `import tensorflow`; presumably it still takes effect
    # because devices are initialised lazily -- confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


def repeat_Samples(samples_to_be_repeated, X_train, y_train):
    """Pad a training set with class-balanced, randomly repeated samples.

    Rows whose label is ``< 1`` form class 0 and rows whose label is ``>= 1`` form class 1.
    ``samples_to_be_repeated // 2`` row indices are drawn with replacement from each class
    (class 0 first, then class 1, using the global NumPy random state) and the selected rows
    are appended to the end of the inputs.  When ``samples_to_be_repeated`` is odd, the
    leftover sample is not added.

    Args:
        samples_to_be_repeated: Requested number of extra rows.
        X_train: Feature array, indexed by row along axis 0.
        y_train: Label array aligned with ``X_train`` along axis 0.

    Returns:
        ``(X_train, y_train)`` as new arrays: the original rows followed by the repeated
        class-0 rows and then the repeated class-1 rows.

    Note:
        ``np.random.choice`` is expected to raise ``ValueError`` when a class has no rows
        while a positive number of draws is requested.
    """
    draws_per_class = samples_to_be_repeated // 2

    # Row indices of each class.
    negative_rows = np.nonzero(y_train < 1)[0]
    positive_rows = np.nonzero(y_train >= 1)[0]

    # The draw order (class 0, then class 1) is kept so the random stream is consumed
    # in a fixed, reproducible order.
    extra_rows = np.concatenate([
        np.random.choice(negative_rows, draws_per_class, replace=True),
        np.random.choice(positive_rows, draws_per_class, replace=True),
    ])

    padded_X = np.concatenate((X_train, X_train[extra_rows]), axis=0)
    padded_y = np.concatenate((y_train, y_train[extra_rows]), axis=0)
    return padded_X, padded_y


def data_preparation(tasks_list):
    """Load the train/test splits of every task and pad the smaller training sets.

    For each ``task_id`` the files ``{task_id}_X_train.npy``, ``{task_id}_y_train.npy``,
    ``{task_id}_X_test.npy`` and ``{task_id}_y_test.npy`` are read from
    ``../Dataset/<DATASETNAME>/Task_Splits``, where the dataset name comes from the
    module-level global ``datasetName``.

    When more than one task is given, the target size is the largest training-set row count,
    bumped by one if it is even.  Every task with fewer rows is padded through
    ``repeat_Samples``, which adds an even number of rows, so padded sets may end up one row
    short of the target.

    Args:
        tasks_list: Sized sequence of task identifiers used in the file names.

    Returns:
        dict mapping ``Molecule_{task_id}_X_train`` / ``_y_train`` / ``_X_test`` / ``_y_test``
        to the corresponding arrays; label arrays are reshaped to ``(-1, 1)``.
    """
    DataPath = f'../Dataset/{datasetName.upper()}/Task_Splits'

    # Read every training matrix exactly once.  The same arrays provide the row counts
    # and are reused below, instead of being loaded from disk a second time.
    train_features = [np.load(f'{DataPath}/{task_id}_X_train.npy') for task_id in tasks_list]

    multi_task = len(tasks_list) > 1
    if multi_task:
        max_size = max(features.shape[0] for features in train_features)
        if max_size % 2 == 0:
            max_size += 1

        print(f'max size = {max_size}')

    data_param_dict_for_specific_task = {}
    for task_id, X_train in zip(tasks_list, train_features):
        y_train = np.load(f'{DataPath}/{task_id}_y_train.npy')
        X_test = np.load(f'{DataPath}/{task_id}_X_test.npy')
        y_test = np.load(f'{DataPath}/{task_id}_y_test.npy')

        if multi_task:
            samples_to_be_repeated = max_size - len(X_train)
            if samples_to_be_repeated > 0:
                X_train, y_train = repeat_Samples(samples_to_be_repeated, X_train, y_train)

        # Labels are stored as column vectors.
        y_train = y_train.reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        data_param_dict_for_specific_task[f'Molecule_{task_id}_X_train'] = X_train
        data_param_dict_for_specific_task[f'Molecule_{task_id}_y_train'] = y_train
        data_param_dict_for_specific_task[f'Molecule_{task_id}_X_test'] = X_test
        data_param_dict_for_specific_task[f'Molecule_{task_id}_y_test'] = y_test

    return data_param_dict_for_specific_task


def decay_lr(step, optimizer):
    """Halve ``optimizer.lr`` on every 75th step.

    The learning rate is changed only when ``step + 1`` is a multiple of 75; on any other
    step the optimizer is left untouched.

    Args:
        step: Zero-based step (epoch) counter.
        optimizer: Object exposing a readable and writable ``lr`` attribute.

    Returns:
        None.  ``optimizer`` is modified in place.
    """
    on_decay_boundary = (step + 1) % 75 == 0
    if not on_decay_boundary:
        return

    halved_rate = optimizer.lr / 2.
    optimizer.lr = halved_rate


def final_model(shared_hyperparameters, molecule_list, data_param_dict_for_specific_task, val=False):
    """Train a shared-encoder / per-task-decoder binary classifier and score it on the test splits.

    A stack of ReLU ``Dense`` layers (the shared encoder) feeds one sigmoid ``Dense(1)`` head
    per task.  All tasks are trained jointly with SGD (momentum 0.9) on the sum of the per-task
    binary cross-entropy losses.  Training stops early after 50 consecutive epochs without an
    improvement of the monitored loss, and the weights of the best epoch are restored.

    When ``molecule_list`` has as many entries as the module-level ``TASKS``, the restored
    weights are saved under ``{datasetName}_model_weights/run_{run}`` and randomly projected
    per-task gradients of the shared encoder are saved under ``{datasetName}_gradients_run_{run}``.

    Besides its arguments, the function reads the module-level names ``batch_size``,
    ``num_epochs``, ``TASKS``, ``datasetName`` and ``run``.

    Args:
        shared_hyperparameters: dict with ``'shared_FF_Layers'`` (number of shared layers),
            ``'shared_FF_Neurons'`` (units per shared layer) and ``'learning_rate'``.
        molecule_list: Task identifiers; one decoder is created per task.
        data_param_dict_for_specific_task: dict holding ``Molecule_{task_id}_X_train``,
            ``_y_train``, ``_X_test`` and ``_y_test`` for every task.
        val: If true, early stopping monitors the summed loss on a 10% validation split;
            otherwise it monitors the training loss of the last mini-batch of the epoch.

    Returns:
        Tuple ``(test_loss, indiv_losses, error_rate, time_taken)``: the summed test loss,
        the list of per-task test losses (in ``molecule_list`` order), the fraction of test
        predictions that disagree with the labels at a 0.75 threshold, and the training
        wall-clock time in seconds.

    Raises:
        ValueError: If the flattened encoder gradient does not match the projection matrix.
    """
    train_data = []
    train_label = []
    val_data = []
    val_label = []
    test_data = []
    test_label = []

    for task_id in molecule_list:
        X_train_full = data_param_dict_for_specific_task[f'Molecule_{task_id}_X_train']
        y_train_full = data_param_dict_for_specific_task[f'Molecule_{task_id}_y_train']

        # Hold out 10% of every task's training data (stratified by label, fixed seed).
        # The split is made even when `val` is False, so the training set is always 90%.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_full, y_train_full, test_size=0.1, random_state=42, stratify=y_train_full
        )

        train_data.append(X_train)
        train_label.append(y_train)
        val_data.append(X_val)
        val_label.append(y_val)

        test_data.append(data_param_dict_for_specific_task[f'Molecule_{task_id}_X_test'])
        test_label.append(data_param_dict_for_specific_task[f'Molecule_{task_id}_y_test'])

    class SharedEncoder(tf.keras.Model):
        """Feed-forward ReLU stack shared by every task."""

        def __init__(self):
            super(SharedEncoder, self).__init__()
            self.shared_layers = []
            for h in range(shared_hyperparameters['shared_FF_Layers']):
                self.shared_layers.append(Dense(shared_hyperparameters['shared_FF_Neurons'][h], activation='relu'))

        def call(self, inputs):
            x = inputs
            for layer in self.shared_layers:
                x = layer(x)
            return x

    class TaskDecoder(tf.keras.Model):
        """Single sigmoid unit mapping the shared representation to a task probability."""

        def __init__(self):
            super(TaskDecoder, self).__init__()
            self.output_layer = Dense(1, activation='sigmoid')

        def call(self, shared_representation):
            return self.output_layer(shared_representation)

    # Create instances of the Shared Encoder and Task Decoders
    shared_encoder = SharedEncoder()
    task_decoders = {molecule: TaskDecoder() for molecule in molecule_list}

    init_lr = shared_hyperparameters['learning_rate']
    optimizer = tf.keras.optimizers.SGD(init_lr, momentum=0.9, nesterov=False)
    loss_fn = tf.keras.losses.BinaryCrossentropy()

    def all_trainable_variables():
        """Return the encoder variables followed by every decoder's variables."""
        variables = list(shared_encoder.trainable_variables)
        for decoder in task_decoders.values():
            variables.extend(decoder.trainable_variables)
        return variables

    def snapshot_weights():
        """Return host-side copies of the current encoder and decoder weights."""
        shared = [weight.numpy() for weight in shared_encoder.trainable_weights]
        decoders = {molecule: [weight.numpy() for weight in decoder.trainable_weights]
                    for molecule, decoder in task_decoders.items()}
        return shared, decoders

    def train_step(x_batch_train, y_batch_train):
        """Apply one SGD update on the summed task losses and return that summed loss."""
        with tf.GradientTape() as tape:
            shared_representations = [shared_encoder(input_data, training=True) for input_data in x_batch_train]
            predictions = [task_decoders[molecule](shared_rep, training=True) for molecule, shared_rep in
                           zip(molecule_list, shared_representations)]
            losses = [loss_fn(y_true, y_pred) for y_true, y_pred in zip(y_batch_train, predictions)]
            tot_loss = tf.reduce_sum(losses)

        # Collected after the forward pass: the layers create their variables on first call.
        variables = all_trainable_variables()
        gradients = tape.gradient(tot_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        return tot_loss

    def test_step(x_batch_test, y_batch_test):
        """Return the summed loss, the per-task losses and the per-task predictions."""
        shared_representations = [shared_encoder(input_data, training=False) for input_data in x_batch_test]
        predictions = [task_decoders[molecule](shared_rep, training=False) for molecule, shared_rep in
                       zip(molecule_list, shared_representations)]
        eval_losses = [loss_fn(y_true, y_pred) for y_true, y_pred in zip(y_batch_test, predictions)]
        eval_loss = tf.reduce_sum(eval_losses)
        return eval_loss, eval_losses, predictions

    def save_gradients(project_matrix, gradients_dir):
        """Save, per task, the projected encoder gradient of every training mini-batch.

        For each mini-batch and each task, the gradient of that task's loss with respect to
        the shared encoder weights is flattened, multiplied by ``project_matrix`` and
        collected; the per-task collections are written to
        ``{gradients_dir}/{task}_train_gradients.npy``.
        """
        task_gradients = {task: [] for task in molecule_list}

        for batch_idx in range(0, len(train_data[0]), batch_size):
            x_batch_train = [data[batch_idx:batch_idx + batch_size] for data in train_data]
            y_batch_train = [label[batch_idx:batch_idx + batch_size] for label in train_label]

            # Persistent: the same tape is queried once per task below.
            with tf.GradientTape(persistent=True) as tape:
                shared_representations = [shared_encoder(input_data, training=True) for input_data in x_batch_train]
                predictions = [task_decoders[molecule](shared_rep, training=True) for molecule, shared_rep in
                               zip(molecule_list, shared_representations)]
                losses = [loss_fn(y_true, y_pred) for y_true, y_pred in zip(y_batch_train, predictions)]
                losses_dict = {task: loss for task, loss in zip(molecule_list, losses)}

            # Gradients are requested after the context has closed, so the gradient
            # computation itself is not recorded on the persistent tape.
            for task in molecule_list:
                task_grads = tape.gradient(losses_dict[task], shared_encoder.trainable_weights)

                # Flatten and concatenate the per-variable gradients into one vector.
                flat_gradient = tf.concat([tf.reshape(g, [-1]) for g in task_grads], axis=0).numpy()

                if flat_gradient.size != project_matrix.shape[0]:
                    raise ValueError(
                        f"Gradient size {flat_gradient.size} does not match expected size {project_matrix.shape[0]}")

                task_gradients[task].append((flat_gradient.reshape(1, -1) @ project_matrix).flatten())

            del tape  # Release this batch's persistent tape before the next one is created.

        for task_name, gradients in task_gradients.items():
            np.save(f"{gradients_dir}/{task_name}_train_gradients.npy", gradients)

    Patience = 50
    min_loss_to_consider = math.inf
    best_shared_weights = None
    best_decoder_weights = None

    TRAIN_SIZE = len(train_data[0])
    print(f'TRAIN_SIZE = {TRAIN_SIZE}')

    timeStart = time.time()
    for epoch in range(num_epochs):
        if epoch > 100:
            decay_lr(epoch, optimizer)

        # Every task is sliced with the same offsets, which are derived from the first
        # task's training-set length.
        for batch_idx in range(0, TRAIN_SIZE, batch_size):
            x_batch_train = [data[batch_idx:batch_idx + batch_size] for data in train_data]
            y_batch_train = [label[batch_idx:batch_idx + batch_size] for label in train_label]

            # After the loop, train_loss holds the loss of the epoch's last mini-batch only.
            train_loss = train_step(x_batch_train, y_batch_train)

        if val:
            val_loss, _, _ = test_step(val_data, val_label)
            loss_to_consider = val_loss
        else:
            loss_to_consider = train_loss

        if epoch % 20 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, loss = {train_loss.numpy()}, Patience = {Patience}')

        current_loss = loss_to_consider.numpy()
        if current_loss < min_loss_to_consider:
            min_loss_to_consider = current_loss
            Patience = 50
            # Snapshot only on improvement.  The weights at this point are exactly the ones
            # left by the epoch's last mini-batch, so no per-batch copy is needed.
            best_shared_weights, best_decoder_weights = snapshot_weights()
        else:
            Patience -= 1
            if Patience == 0:
                print(f'Stopping Training at Epoch {epoch + 1}')
                break

    time_taken = time.time() - timeStart

    # Restore the weights of the best epoch.
    if best_shared_weights is None:
        # No epoch ever improved on the initial (infinite) loss, e.g. the loss was NaN
        # throughout or no epoch ran; there is nothing to restore.
        print('No epoch improved the monitored loss; keeping the current weights')
    else:
        for best_weight, curr_weight in zip(best_shared_weights, shared_encoder.trainable_weights):
            curr_weight.assign(best_weight)

        for molecule, decoder_specific_weights in best_decoder_weights.items():
            for best_weight, curr_weight in zip(decoder_specific_weights, task_decoders[molecule].trainable_weights):
                curr_weight.assign(best_weight)

    # Only the model trained on all tasks is persisted and has its gradients exported.
    if len(molecule_list) == len(TASKS):
        model_base_dir = f'{datasetName}_model_weights'
        gradients_dir = f'{datasetName}_gradients_run_{run}'
        model_dir = f'{model_base_dir}/run_{run}'
        # makedirs also creates model_base_dir as the parent of model_dir.
        os.makedirs(gradients_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)

        shared_encoder.save_weights(f'{model_dir}/shared_encoder')
        for molecule, decoder in task_decoders.items():
            decoder.save_weights(f'{model_dir}/decoder_{molecule}')

        grad_params = list(shared_encoder.trainable_weights)

        print(f'len(grad_params): {len(grad_params)}', end=' ')
        gradient_dim = 0
        for param in grad_params:
            gradient_dim += param.numpy().size
        print("Gradient Dim: {}".format(gradient_dim), end=' ')

        # Random +/-1 projection (scaled by 1/sqrt(project_dim)) that maps the flattened
        # encoder gradient down to project_dim values.
        project_dim = 200
        project_matrix = (2 * np.random.randint(2, size=(gradient_dim, project_dim)) - 1).astype(float)
        project_matrix *= 1 / np.sqrt(project_dim)
        print("Project Dim: {}".format(project_dim))

        save_gradients(project_matrix, gradients_dir)

    test_loss, indiv_losses, y_pred = test_step(test_data, test_label)
    indiv_losses = [each_loss.numpy() for each_loss in indiv_losses]
    # Pool the predictions and labels of all tasks into one array each.
    y_pred = np.concatenate([pred.numpy() for pred in y_pred], axis=0)
    y_test = np.concatenate(list(test_label), axis=0)

    # A sample is predicted positive only when its probability reaches 0.75.
    predicted_val = (y_pred >= 0.75).astype(int)
    error_rate = np.mean(predicted_val != y_test)
    ap = average_precision_score(y_test, y_pred)

    print(f'test_loss = {test_loss}\terrorRate = {error_rate}\tap = {ap}')

    return test_loss.numpy(), indiv_losses, error_rate, time_taken


if __name__ == "__main__":
    # Experiment driver: for each of 10 seeded runs, train one model on every task group
    # and write the accumulated results of that run to a CSV file.
    datasetName = 'Chemical'
    DataPath = f'../Dataset/{datasetName.upper()}/'
    import sys

    Method_name = 'SimpleMTL'
    group_len = 'ALL'#1 for STL, 2 for Pairs,

    ResultPath = '../RESULTS/GROUPS_MTL/'
    # DataFrame.to_csv does not create missing directories; create the output directory
    # up front so that a finished training run cannot fail at the final write.
    os.makedirs(ResultPath, exist_ok=True)

    ChemicalData = pd.read_csv(f'{DataPath}Task_Information_Chemical.csv', low_memory=False)
    ChemicalData = ChemicalData.sort_values(by=['Dataset_Size'], ascending=False)
    print(ChemicalData[['Molecule', 'Dataset_Size']])
    TASKS = list(ChemicalData['Molecule'])
    print(f'TASKS = {TASKS}')
    TASKS = [str(task) for task in TASKS]

    # Not read anywhere in the visible part of this script.
    task_len = {}
    variance_dict = {}
    std_dev_dict = {}
    dist_dict = {}
    Single_res_dict = {}
    STL_error = {}
    STL_AP = {}

    num_folds = 10

    # Architecture and training settings; num_epochs and batch_size are read as globals
    # by final_model.
    Arch_Name = 'Arch_1'
    if Arch_Name == 'Arch_1':
        initial_shared_architecture = {'shared_FF_Layers': 2, 'shared_FF_Neurons': [32, 16],
                                       'learning_rate': 0.001}
        num_epochs = 1000
        batch_size = 264
    else:
        raise ValueError(f'Unsupported Arch_Name: {Arch_Name!r}')

    # Only the single group containing every task is implemented here.
    if group_len == 'ALL':
        TASK_Group = [tuple(TASKS)]
        name_suffix = 'ALL'
    else:
        raise ValueError(f'Unsupported group_len: {group_len!r}')

    RUNS = list(range(1, 11))
    for run in RUNS:
        # The run index doubles as the seed for TensorFlow, NumPy and `random`.
        seed_value = run
        tf.random.set_seed(seed_value)
        np.random.seed(seed_value)
        random.seed(seed_value)

        # Per-run accumulators; one entry is appended per task group.
        Task_group = []
        Total_Loss = []
        Individual_Group_Score = []
        Individual_Error_Rate = []
        Individual_AP = []
        Number_of_Groups = []
        Individual_Task_Score = []
        Time_taken_for_training = []
        Prev_Groups = {}
        for count, task_group in enumerate(TASK_Group):
            print(f'Initial Training for {datasetName}-partition {count}, {task_group}')

            random_seed = seed_value  # 2024

            args_tasks = []
            group_score = {}
            group_avg_err = {}
            group_avg_AP = {}
            tmp_task_score = []

            data_param_dict_for_specific_task = data_preparation(task_group)
            all_scores = final_model(initial_shared_architecture, task_group, data_param_dict_for_specific_task,
                                     val=True)
            print(all_scores)
            # all_scores = (total test loss, per-task test losses, error rate, training seconds)
            tot_loss, indi_scores = all_scores[0], all_scores[1]
            avg_error = all_scores[2]
            Total_time = all_scores[-1]
            task_scores = {}
            for idx, task in enumerate(task_group):
                task_scores[f'molecule_{task}'] = indi_scores[idx]

            print(f'total_time = {Total_time}')
            print(f'avg time in minutes = {np.mean(Total_time) / 60}')

            print(f'tot_loss = {tot_loss}')
            Task_group.append(task_group)
            Total_Loss.append(tot_loss)
            Individual_Error_Rate.append(avg_error)
            Individual_Task_Score.append(copy.deepcopy(task_scores))
            Time_taken_for_training.append(np.mean(Total_time) / 60)

            print(len(Total_Loss), len(Task_group), len(Individual_Task_Score), len(Individual_Error_Rate))

            # Rewritten after every group so that partial results survive an interruption.
            temp_res = pd.DataFrame({'Total_Loss': Total_Loss,
                                     'Task_group': Task_group,
                                     'Individual_Task_Score': Individual_Task_Score,
                                     'Individual_Error_Rate': Individual_Error_Rate,
                                     'Time_taken_for_training': Time_taken_for_training,
                                     })
            temp_res.to_csv(f'{ResultPath}/{datasetName}_{name_suffix}_run_{run}_SGD_Arch_{Arch_Name}_BASELINE_GRADTAE.csv',index=False)