import os
import sys

import pandas as pd

sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, 'src')))
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from data.preprocessing import save_statistics
from experiment_utils import make_experiment
from utils.metadata import DATA_DIRECTORY
from utils.utils import Bunch


if __name__ == '__main__':
    # Everything below runs only when this file is executed as a script.
    # make_experiment() is a project helper (experiment_utils); the object it returns is used
    # below through its `config` and `automain` decorators.
    experiment = make_experiment()


    # Default configuration for the conversion run.
    # NOTE(review): this looks like a Sacred-style config scope, in which every local that is still
    # bound when the function ends becomes a config entry -- confirm against make_experiment().
    @experiment.config
    def config():
        # Directory holding the raw SWaT Excel files; it is reused below as the output directory.
        data_dir = os.path.join(DATA_DIRECTORY, 'SWaT', 'SWaT_A1A2_Dec_2015', 'Physical')
        params = dict(
            # Excel files to convert, processed in this order.
            data_files=[
                os.path.join(data_dir, 'SWaT_Dataset_Attack_v0.xlsx'),
                os.path.join(data_dir, 'SWaT_Dataset_Normal_v1.xlsx'),
            ],
            out_dir=data_dir,  # where the converted csv and the *_stats.npz files are written
            chunksize=None,  # forwarded unchanged to DataFrame.to_csv
            gzip=False,  # when true, '.gz' is appended to the output file name
        )
        # Remove the helper local so that `params` is the only name left in this scope
        # (presumably to keep `data_dir` out of the resulting configuration -- TODO confirm).
        del data_dir


    @experiment.automain
    def main(params, _run):
        """
        Convert the Excel files of the original SWaT dataset
        (https://itrust.sutd.edu.sg/itrust-labs_datasets/dataset_info/) to csv using pandas and openpyxl.

        For every path in ``params['data_files']`` the sheet is read (skipping its first row), column names are
        stripped of surrounding whitespace, the 'Timestamp' column is parsed to datetimes and the misspelled
        'A ttack' label is replaced by 'Attack'. Statistics over all columns except the first and the last are
        stored through ``save_statistics`` as ``<name>_stats.npz``, and the table itself is written to
        ``params['out_dir']`` as ``<name>.csv`` (``<name>.csv.gz`` when ``params['gzip']`` is set).

        ``params['chunksize']`` is passed on to ``DataFrame.to_csv``; it can be tuned to prevent OutOfMemory errors
        on low-RAM machines. ``_run`` is accepted but not used in the body.
        """
        cfg = Bunch(params)

        for source_path in cfg.data_files:
            print(f'Reading file "{source_path}"!')
            frame = pd.read_excel(source_path, skiprows=1)
            # Column headers may carry stray whitespace; normalise them before any lookup by name
            frame = frame.rename(columns=lambda name: name.strip())
            frame['Timestamp'] = pd.to_datetime(frame['Timestamp'])
            # Fix labeling bug
            mislabeled = frame['Normal/Attack'] == 'A ttack'
            frame.loc[mislabeled, 'Normal/Attack'] = 'Attack'

            # Output names are derived from the input file name without its extension
            stem = os.path.splitext(os.path.basename(source_path))[0]
            csv_name = stem + ('.csv.gz' if cfg.gzip else '.csv')
            os.makedirs(cfg.out_dir, exist_ok=True)

            # Save dataset statistics (first and last column are left out)
            stats_path = os.path.join(cfg.out_dir, f'{stem}_stats.npz')
            feature_columns = frame.columns[1:-1]
            save_statistics(frame[feature_columns], stats_path)

            # Save csv file; never write over the file that was just read
            target_path = os.path.join(cfg.out_dir, csv_name)
            assert target_path != source_path
            print(f'Saving converted file in "{target_path}"!')
            frame.to_csv(target_path, index=False, chunksize=cfg.chunksize)
