import os
import sys
import platform

# The data files are addressed with relative 'data/...' paths, so on Linux the
# working directory is switched to the folder holding this file before loading.
current_system = platform.system()
if current_system == "Linux":
    current_directory = os.path.split(os.path.realpath(__file__))[0]
    os.chdir(current_directory)
    print("current_directory:", current_directory)

from torch.utils.data import Dataset, DataLoader, TensorDataset, SubsetRandomSampler
from sklearn import preprocessing
import os
import datetime
import numpy as np
import pandas as pd
import torch
import bisect
from sklearn.cluster import KMeans
import sklearn


# RUL dataset built from the FD001-FD004 train/test/RUL text files (not ECG data)
class Dataset_RUL(Dataset):
    def __init__(self, data, seq_len, batch_size=128, operating_condition=0, renormalize=True):
        """Store the configuration for one FD subset.

        Args:
            data: Subset code, one of '001', '002', '003', '004'.
            seq_len: Window length used when slicing sequences.
            batch_size: Batch size of the training DataLoader.
            operating_condition: KMeans cluster label to keep when clustering
                is enabled.
            renormalize: Whether to re-fit the min-max scaling after the
                operating-condition selection.

        No file is read here; the paths are only recorded.
        """
        assert data in ['001', '002', '003', '004']

        subset = f'FD{data}'
        file_paths = [
            f'data/train_{subset}.txt',
            f'data/test_{subset}.txt',
            f'data/RUL_{subset}.txt',
        ]
        self.data_dict = {subset: {'root_path': file_paths}}
        self.path = self.data_dict[subset]['root_path']

        self.seq_len = seq_len
        self.batch_size = batch_size
        self.operating_condition = operating_condition
        self.renormalize = renormalize
        # Operating-condition clustering is only switched on for these two subsets.
        self.kmeans = data in ('002', '004')

    def gen_sequence(self, id_df, seq_length, seq_cols):
        data_array = id_df[seq_cols].values
        num_elements = data_array.shape[0]
        if seq_length == num_elements:
            yield data_array
        elif seq_length < num_elements:
            for start, stop in zip(range(0, num_elements - seq_length), range(seq_length, num_elements)):
                yield data_array[start:stop, :]
        else:
            pass

    def gen_labels(self, id_df, seq_length, label):
        data_array = id_df[label].values
        num_elements = data_array.shape[0]
        if num_elements == seq_length:
            return data_array[-1:, :]
        else:
            return data_array[seq_length:num_elements, :]

    def load_data(self):
        """Read the train/test/RUL files and return labelled, scaled frames.

        Steps:
            1. Read the three space-separated files listed in ``self.path``.
            2. Label every training row with ``RUL = max cycle of its unit -
               cycle`` and every test row with ``RUL = (last test cycle +
               value from the RUL file) - cycle``; both are clipped at 125.
            3. Min-max scale every column except ``id``, ``cycle`` and ``RUL``
               to (0, 1); the scaler is fitted on the training frame only.
            4. If ``self.kmeans`` is set, cluster the three setting columns of
               the training frame into 6 groups, keep only the rows of
               ``self.operating_condition`` for each unit and fill the removed
               rows by forward/backward fill. If ``self.renormalize`` is set,
               the min-max scaling is fitted again on the result.

        Returns:
            tuple: ``(train_df, test_df)`` as pandas DataFrames.
        """
        column_title = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 's4', 's5', 's6', 's7',
                        's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']

        # The two columns at positions 26 and 27 are dropped before naming the rest
        # (presumably empty columns caused by trailing separators - confirm against the files).
        train_df = pd.read_csv(self.path[0], sep=" ", header=None)
        train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True)
        train_df.columns = column_title
        train_df = train_df.sort_values(['id', 'cycle'])

        test_df = pd.read_csv(self.path[1], sep=" ", header=None)
        test_df.drop(test_df.columns[[26, 27]], axis=1, inplace=True)
        test_df.columns = column_title

        truth_df = pd.read_csv(self.path[2], sep=" ", header=None)
        truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True)

        # Training data labelling and normalisation
        rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
        rul.columns = ['id', 'max']
        train_df = train_df.merge(rul, on=['id'], how='left')
        train_df['RUL'] = train_df['max'] - train_df['cycle']
        train_df.drop('max', axis=1, inplace=True)

        cols_normalize = train_df.columns.difference(['id', 'cycle', 'RUL'])
        min_max_scaler = preprocessing.MinMaxScaler((0, 1))
        train_df = self._rescale_columns(train_df, min_max_scaler, cols_normalize, fit=True)
        train_df['RUL'] = train_df['RUL'].clip(upper=125)

        # Test data normalisation (scaler fitted on train) and labelling
        test_df = self._rescale_columns(test_df, min_max_scaler, cols_normalize, fit=False)
        test_df = test_df.reset_index(drop=True)
        rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()
        rul.columns = ['id', 'max']
        truth_df.columns = ['more']
        # Row i of the RUL file is attributed to unit id i + 1.
        truth_df['id'] = truth_df.index + 1
        truth_df['max'] = rul['max'] + truth_df['more']
        truth_df.drop('more', axis=1, inplace=True)
        test_df = test_df.merge(truth_df, on=['id'], how='left')
        test_df['RUL'] = test_df['max'] - test_df['cycle']
        test_df.drop('max', axis=1, inplace=True)
        test_df['RUL'] = test_df['RUL'].clip(upper=125)

        if self.kmeans:
            estimator = KMeans(
                n_clusters=6,
                init='k-means++',
                n_init=10,
                max_iter=300,
                tol=1e-4,
                random_state=0,
                algorithm='lloyd')
            # The clustering is learned from the training settings only.
            estimator.fit(train_df[["setting1", "setting2", "setting3"]].values)

            train_df = self._select_operating_condition(train_df, estimator)
            test_df = self._select_operating_condition(test_df, estimator)

            if self.renormalize:
                train_df = train_df.reset_index(drop=True)
                test_df = test_df.reset_index(drop=True)

                cols_normalize = test_df.columns.difference(['id', 'cycle', 'RUL', 'operating_condition'])
                min_max_scaler = preprocessing.MinMaxScaler((0, 1))
                train_df = self._rescale_columns(train_df, min_max_scaler, cols_normalize, fit=True)
                test_df = self._rescale_columns(test_df, min_max_scaler, cols_normalize, fit=False)

        return train_df, test_df

    @staticmethod
    def _rescale_columns(df, scaler, cols, fit):
        """Return a copy of ``df`` whose ``cols`` are scaled, column order kept.

        With ``fit=True`` the scaler is fitted on ``df`` first; otherwise the
        already fitted scaler is applied as is.
        """
        scaled = scaler.fit_transform(df[cols]) if fit else scaler.transform(df[cols])
        norm_df = pd.DataFrame(scaled, columns=cols, index=df.index)
        joined = df[df.columns.difference(cols)].join(norm_df)
        return joined.reindex(columns=df.columns)

    def _select_operating_condition(self, df, estimator):
        """Keep, per unit, only the rows of ``self.operating_condition``.

        Rows belonging to other clusters are replaced by the nearest kept row
        (forward fill, then backward fill for the leading gap), so every unit
        keeps its original number of rows. Each unit's rows are re-indexed from
        0, and the per-unit frames are concatenated in order of first
        appearance of the unit id.

        NOTE: a unit with no row in the selected cluster has nothing to fill
        from and comes out as all-NaN rows (including its ``id``).
        """
        setting_cols = ["setting1", "setting2", "setting3"]
        per_unit = []
        for unit_id in df['id'].unique():
            unit_df = df[df['id'] == unit_id].reset_index(drop=True)
            unit_df["operating_condition"] = estimator.predict(unit_df[setting_cols].values)
            selected = unit_df[unit_df['operating_condition'] == self.operating_condition].copy()

            # Equal-value filling. ``ffill``/``bfill`` replace the deprecated
            # ``fillna(method=...)`` spelling and behave the same.
            selected = selected.reindex(range(len(unit_df))).ffill().bfill()
            per_unit.append(selected)

        if not per_unit:
            return pd.DataFrame()
        # One concat over the collected frames instead of growing a frame in the loop.
        return pd.concat(per_unit)

    def check_length(self):
        train_df, test_df = self.load_data()
        length_of_test_df = []
        for id in test_df['id'].unique():
            length_of_test = len(test_df[test_df['id'] == id])
            length_of_test_df.append(length_of_test)
        id_of_test_df = list(range(1, len(test_df['id'].unique()) + 1))
        length_of_test_df_sorted, id_of_test_df_sorted = zip(*sorted(zip(length_of_test_df, id_of_test_df)))

        length_of_train_df = []
        for id in train_df['id'].unique():
            length_of_train = len(train_df[train_df['id'] == id])
            length_of_train_df.append(length_of_train)
        id_of_train_df = list(range(1, len(train_df['id'].unique()) + 1))
        length_of_train_df_sorted, id_of_train_df_sorted = zip(*sorted(zip(length_of_train_df, id_of_train_df)))

        return id_of_test_df_sorted, id_of_train_df_sorted

    def get_dataloader(self):
        train_df, test_df = self.load_data()
        id_of_test_df_sorted, _ = self.check_length()

        sequence_cols = ['s2', 's3', 's4', 's7', 's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 's20', 's21']

        seq_gen = (list(self.gen_sequence(train_df[train_df['id'] == id], self.seq_len, sequence_cols))
                   for id in train_df['id'].unique())
        training_sample = np.concatenate([sublist for sublist in list(seq_gen) if sublist]).astype(np.float32)


        label_gen = [self.gen_labels(train_df[train_df['id'] == id], self.seq_len, ['RUL'])
                     for id in train_df['id'].unique()]
        training_label = np.concatenate(label_gen).astype(np.float32)


        sorted = True

        if sorted:
            # sort by length of test data
            test_sample = [test_df[test_df['id'] == id][sequence_cols].values[-self.seq_len:] for id in
                           id_of_test_df_sorted if len(test_df[test_df['id'] == id]) > self.seq_len]
            test_sample = np.array(test_sample).astype(np.float32)
            test_label = [test_df[test_df['id'] == id]['RUL'].values[-1] for id in id_of_test_df_sorted if
                          len(test_df[test_df['id'] == id]) > self.seq_len]
            test_label = np.array(test_label).astype(np.float32)
            test_id = torch.tensor(
                [id for id in id_of_test_df_sorted if len(test_df[test_df['id'] == id]) > self.seq_len])
        else:
            # do not sort
            test_sample = [test_df[test_df['id'] == id][sequence_cols].values[-self.seq_len:] for id in
                           test_df['id'].unique() if len(test_df[test_df['id'] == id]) > self.seq_len]
            test_sample = np.array(test_sample).astype(np.float32)
            test_label = [test_df[test_df['id'] == id]['RUL'].values[-1] for id in test_df['id'].unique() if
                          len(test_df[test_df['id'] == id]) > self.seq_len]
            test_label = np.array(test_label).astype(np.float32)
            test_id = torch.tensor(
                [id for id in test_df['id'].unique() if len(test_df[test_df['id'] == id]) > self.seq_len])

        DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # cuda or cpu

        train_x_tensor = torch.from_numpy(training_sample).contiguous().type(torch.FloatTensor).to(DEVICE)
        train_target_tensor = torch.from_numpy(training_label).type(torch.FloatTensor).unsqueeze(2).to(DEVICE)

        print("training samples: ", train_x_tensor.size(), train_target_tensor.size())

        test_x_tensor = torch.from_numpy(test_sample).contiguous().type(torch.FloatTensor).to(DEVICE)
        test_target_tensor = torch.from_numpy(test_label).type(torch.FloatTensor).unsqueeze(1).unsqueeze(2).to(DEVICE)
        print("test samples: ", test_x_tensor.size(), test_target_tensor.size())

        train_dataset_new = TensorDataset(train_x_tensor, train_target_tensor)

        train_dataloader = DataLoader(train_dataset_new, batch_size=self.batch_size, shuffle=True, drop_last=False) # 为测试取消shuffle

        test_dataset_new = TensorDataset(test_x_tensor, test_target_tensor)
        test_dataloader = DataLoader(test_dataset_new, batch_size=300, shuffle=False, drop_last=False)

        return train_dataloader, test_dataloader, test_id

    # Example: the default four time windows give [(0, 4), (4, 7), (7, 9), (9, 10)]
    def get_group_index(self, time_windows = [30, 60, 90, 120]):
        """Return consecutive ``(start, end)`` index pairs of shrinking width.

        For ``k`` time windows the pairs have widths k, k-1, ..., 1 and are
        laid end to end starting at 0. Only the number of windows is used,
        not their values.
        """
        group_indices = []
        lower = 0
        for width in range(len(time_windows), 0, -1):
            upper = lower + width
            group_indices.append((lower, upper))
            lower = upper
        return group_indices

    # [0, 12, 26, 37, 63, 83, 100]:  to analyze the performance of subsets
    def get_subset_unit_index(self, time_windows=[30, 60, 90, 120]):
        train_df, test_df = self.load_data()
        sequence_cols = ['s2', 's3', 's4', 's7', 's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 's20', 's21']
        length_of_test_df = []
        for id in test_df['id'].unique():
            length_of_test = len(test_df[test_df['id'] == id])
            length_of_test_df.append(length_of_test)
        id_of_test_df = list(range(1, len(test_df['id'].unique()) + 1))

        length_of_test_df_sorted, id_of_test_df_sorted = zip(*sorted(zip(length_of_test_df, id_of_test_df)))

        split_indices = [bisect.bisect_left(length_of_test_df_sorted, value) for value in time_windows]
        split_indices = split_indices + [len(test_df['id'].unique())]
        return split_indices

    def get_subtest(self, time_windows=[30, 60, 90, 120]):
        train_df, test_df = self.load_data()
        sequence_cols = ['s2', 's3', 's4', 's7', 's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 's20', 's21']
        length_of_test_df = []
        for id in test_df['id'].unique():
            length_of_test = len(test_df[test_df['id'] == id])
            length_of_test_df.append(length_of_test)
        id_of_test_df = list(range(1, len(test_df['id'].unique()) + 1))

        length_of_test_df_sorted, id_of_test_df_sorted = zip(*sorted(zip(length_of_test_df, id_of_test_df)))

        split_indices = [bisect.bisect_left(length_of_test_df_sorted, value) for value in time_windows]
        split_indices = split_indices + [len(test_df['id'].unique())]
        test_dataloader_list = []
        subset_index = 0
        for time_window in time_windows:
            for start_index, end_index in zip(split_indices[subset_index:-1], split_indices[1+subset_index:]):
                sub_test_id = id_of_test_df_sorted[start_index:end_index]
                test_sample = [test_df[test_df['id'] == id][sequence_cols].values[-time_window:] for id in sub_test_id]
                test_sample = np.array(test_sample).astype(np.float32)
                test_label = [test_df[test_df['id'] == id]['RUL'].values[-1] for id in sub_test_id]
                test_label = np.array(test_label).astype(np.float32)

                DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # cuda or cpu
                test_x_tensor = torch.from_numpy(test_sample).contiguous().type(torch.FloatTensor).to(DEVICE)
                test_target_tensor = torch.from_numpy(test_label).type(torch.FloatTensor).unsqueeze(1).unsqueeze(2).to(DEVICE)
                test_dataset_new = TensorDataset(test_x_tensor, test_target_tensor)
                test_dataloader = DataLoader(test_dataset_new, batch_size=300, shuffle=False, drop_last=False)
                test_dataloader_list.append(test_dataloader)
            subset_index += 1
        return test_dataloader_list

