import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from pathlib import Path
import zipfile
import numpy as np
import sys

def print_X_y(X, y, output_txt_file):
    """Write a feature matrix and its labels to a text file as one table.

    The labels are appended to ``X`` as a final column and the combined
    array is written using NumPy's ``str()`` formatting, so the output
    follows whatever ``np.set_printoptions`` settings are currently active.

    Args:
        X: 2-D NumPy array of features, one row per sample.
        y: 1-D NumPy array holding one label per row of ``X``.
        output_txt_file: Path of the text file to create or overwrite.
    """
    # y[:, None] turns the label vector into a column so it can be joined
    # to X along axis 1.
    combined = np.append(X, y[:, None], axis=1)
    # The context manager closes the file even if the write fails.
    with open(output_txt_file, "w") as out:
        out.write(str(combined))

def dats_to_torch(f, output_file, scaler=None):
    """Convert one ``.dat`` file into a standardized tensor file and a text dump.

    The input is read as header-less CSV after skipping the leading lines
    that start with ``"@"``. The last column is treated as the class label;
    textual labels ``negative`` / ``positive`` (surrounding whitespace
    ignored) are encoded as 0 / 1. All other columns are standardized with
    a ``StandardScaler``.

    Two files are written: ``<output_file>.pt`` containing the list
    ``[X_tensor, y_tensor]`` saved with ``torch.save``, and
    ``<output_file>.txt`` containing a printable table of the same data.

    Args:
        f: Path of the ``.dat`` file to read.
        output_file: Output path without extension.
        scaler: An already fitted scaler to apply. When ``None`` a new
            ``StandardScaler`` is fitted on this file's features.

    Returns:
        The scaler that was used (the newly fitted one, or ``scaler``).

    Raises:
        ValueError: If the class column is non-numeric and contains a label
            other than ``negative`` / ``positive``.
    """
    output_pt_file = Path(f"{output_file}.pt")
    output_txt_file = Path(f"{output_file}.txt")

    # Count the leading "@" header lines without loading the whole file,
    # and make sure the handle is closed afterwards.
    skip = 0
    with open(f, "r") as handle:
        for line in handle:
            if not line.startswith("@"):
                break
            skip += 1

    df = pd.read_csv(f, header=None, skiprows=skip)
    df = df.rename({df.shape[1] - 1: "class"}, axis=1)

    labels = df["class"]
    if not pd.api.types.is_numeric_dtype(labels):
        # Encode explicitly instead of relying on DataFrame.replace to
        # downcast the column: strip whitespace so both "negative" and
        # " negative" are recognised, then force an integer dtype that
        # torch.from_numpy accepts.
        labels = labels.str.strip().map({"negative": 0, "positive": 1})
        if labels.isna().any():
            raise ValueError(
                f"Unexpected class label in {f}; expected 'negative' or 'positive'"
            )
        labels = labels.astype(np.int64)
    y = labels.values

    features = df.drop(columns="class")
    if scaler is None:
        scaler = StandardScaler(with_std=True)
        X = scaler.fit_transform(features)
    else:
        X = scaler.transform(features)

    print_X_y(X, y, output_txt_file)
    torch.save([torch.from_numpy(X), torch.from_numpy(y)], output_pt_file)
    return scaler

def main():
    """Preprocess every dataset found under ``Keel/raw``.

    Paths are relative to the current working directory. For each dataset
    directory ``Keel/raw/<name>`` the function:

    1. Extracts ``<name>-5-fold.zip`` into ``<name>-5-fold`` unless that
       directory already exists.
    2. Creates ``Keel/preprocessed/<name>``.
    3. Converts the train/test ``.dat`` files of folds 1..5 with
       ``dats_to_torch``, fitting the scaler on each fold's training file
       and reusing it for that fold's test file. Fold 1 is additionally
       written under the unsuffixed names ``<name>_train`` / ``<name>_test``.

    Entries of ``Keel/raw`` that are not directories are ignored.

    Raises:
        FileNotFoundError: If ``Keel/raw`` does not exist.
    """
    raw_root = Path("Keel/raw")
    if not raw_root.is_dir():
        raise FileNotFoundError("Downloaded Keel datasets should be saved under datasets/Keel/raw/<dataset_name>")
    # Print full arrays (no "..." truncation) in the text dumps.
    np.set_printoptions(precision=3, threshold=sys.maxsize)
    for dataset_top in sorted(raw_root.iterdir()):
        # Stray files such as .DS_Store are not datasets.
        if not dataset_top.is_dir():
            continue
        dataset_name = dataset_top.name
        # Unzip Source
        unzipped_5_fold_dir = dataset_top / f"{dataset_name}-5-fold"
        if not unzipped_5_fold_dir.is_dir():
            # Open the archive before anything is created on disk, so a
            # missing or unreadable zip does not leave behind an empty
            # directory that would suppress extraction on the next run.
            with zipfile.ZipFile(dataset_top / f"{dataset_name}-5-fold.zip", "r") as zip_ref:
                zip_ref.extractall(unzipped_5_fold_dir)
        # Make dirs
        Path(f"Keel/preprocessed/{dataset_name}").mkdir(parents=True, exist_ok=True)
        # Translate
        output_file_base = f'Keel/preprocessed/{dataset_name}/{dataset_name}_'
        for i in range(1, 6):
            train_f = unzipped_5_fold_dir / f"{dataset_name}-5-{i}tra.dat"
            test_f = unzipped_5_fold_dir / f"{dataset_name}-5-{i}tst.dat"
            if i == 1:
                # Fold 1 is also exported under the unsuffixed names.
                scaler = dats_to_torch(train_f, output_file_base + 'train')
                dats_to_torch(test_f, output_file_base + 'test', scaler)
            scaler = dats_to_torch(train_f, output_file_base + f'train_{i}')
            dats_to_torch(test_f, output_file_base + f'test_{i}', scaler)


# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
