import os
import pandas
import json
from sklearn.model_selection import train_test_split
from shutil import copy
import numpy

import itertools
import multiprocessing
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import hdbscan
from sklearn.cluster import AgglomerativeClustering

from ProtLig_GPCRclassA.base_cross_validation import BaseCrossValidation, BaseCVPreProcess, BaseCVSplit

class InadequateTestSetSizeError(Exception):
    pass


class Random(BaseCVSplit):
    """
    For convenience when createing test set only.
    """
    def __init__(self, data_dir, seed = None, split_kwargs = {}):
        """
        Parameters:
        -----------
        data_dir : str
            directory containing full preprocessed data.

        data_path : str
            path to raw data.

        split_kwargs : dict:
            kwargs are passed to func_split_data.
        """
        super(Random, self).__init__(data_dir = data_dir, seed = seed, split_kwargs = split_kwargs)

        self.func_split_data_name = 'random'

    def load_data(self):
        """
        function to read preprocessed full data into a DataFrame.
        """
        full_data = pandas.read_csv(os.path.join(self.data_dir, 'full_data.csv'), sep = self.sep, index_col = None, header = 0)
        # Copy-paste data:
        # copy(src = os.path.join(self.data_dir, 'full_data.csv'), dst = os.path.join(self.working_dir, 'full_data.csv'))
        # copy(src = os.path.join(self.data_dir, 'mols.csv'), dst = os.path.join(self.working_dir, 'mols.csv'))
        # copy(src = os.path.join(self.data_dir, 'seqs.csv'), dst = os.path.join(self.working_dir, 'seqs.csv'))
        # copy(src = os.path.join(self.data_dir, 'CV_data_hparams.json'), dst = os.path.join(self.working_dir, 'CV_data_hparams.json'))

        return full_data
