import os
import sys

import pyreadr

sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, 'src')))
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
from data.preprocessing import save_statistics
from experiment_utils import make_experiment
from utils.metadata import DATA_DIRECTORY
from utils.utils import Bunch


# Script entry point: the experiment object and the functions registered on it
# below are only created when this file is executed directly, not on import.
if __name__ == '__main__':
    # make_experiment() is a project helper; presumably it returns a Sacred
    # Experiment, given the .config/.automain decorators used on it below
    # -- TODO confirm against experiment_utils.
    experiment = make_experiment()


    @experiment.config
    def config():
        """Default settings for the RData-to-csv conversion."""
        params = {
            # R file that is read and converted.
            'data_file': os.path.join(DATA_DIRECTORY, 'TEP_harvard', 'TEP_Faulty_Testing.RData'),
            # Directory receiving the csv file and the statistics file.
            'out_dir': os.path.join(DATA_DIRECTORY, 'TEP_harvard'),
            # Forwarded to DataFrame.to_csv as the number of rows written at a time.
            'chunksize': 100000,
            # When true, '.gz' is appended to the output file name.
            'gzip': True,
        }


    @experiment.automain
    def main(params, _run):
        """
        Converts a File from the original TEP harvard dataset (https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/6C3JR1)
        From the R format to csv using pyreadr and pandas. Chunk size can be tuned to prevent OutOfMemory errors on
        low-RAM machines.

        The first object stored in the R file is converted. Before the csv is written, every column from the
        fourth one onwards is handed to ``save_statistics`` together with the path ``<input name>_stats.npz``
        inside the output directory.

        :param params: mapping with the keys ``data_file`` (R file to read), ``out_dir`` (output directory,
            created if missing), ``chunksize`` (rows written at a time by ``DataFrame.to_csv``) and ``gzip``
            (append ``.gz`` to the output file name).
        :param _run: not used in this function; presumably the run object injected by the experiment
            framework -- TODO confirm.
        :raises ValueError: if the output path resolves to the input file, or if the input file contains
            no objects.
        """
        params = Bunch(params)

        # Derive the output path first so that a bad configuration is rejected
        # before anything is read or written.
        base_name = os.path.splitext(os.path.basename(params.data_file))[0]
        extension = '.csv.gz' if params.gzip else '.csv'
        out_file = os.path.join(params.out_dir, base_name + extension)

        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, and comparing resolved paths also catches differently
        # spelled references to the same file.
        if os.path.realpath(out_file) == os.path.realpath(params.data_file):
            raise ValueError(f'Output file "{out_file}" would overwrite the input file!')

        # read_r returns a mapping from R object name to DataFrame.
        frames = pyreadr.read_r(params.data_file)
        if not frames:
            raise ValueError(f'No objects found in "{params.data_file}"!')
        data = next(iter(frames.values()))

        os.makedirs(params.out_dir, exist_ok=True)

        # Save dataset statistics (the first three columns are skipped;
        # presumably they hold identifiers rather than measurements -- TODO confirm).
        stats_file = os.path.join(params.out_dir, f'{base_name}_stats.npz')
        save_statistics(data[data.columns[3:]], stats_file)

        print(f'Saving converted file in "{out_file}"!')
        # With the default compression='infer', pandas gzips the output when
        # the file name ends in '.gz'.
        data.to_csv(out_file, index=False, chunksize=params.chunksize)
