from typing import Union
import pandas as pd
from sklearn.model_selection import train_test_split
from ucimlrepo import fetch_ucirepo
from sklearn.feature_extraction import DictVectorizer

class DataLoader:
    """Load a tabular dataset and split it into stratified train/test sets.

    Both entry points (``load`` for CSV files, ``load_uci`` for datasets
    fetched through ``ucimlrepo``) delegate to ``_split_data``, which performs
    the stratified split and converts the target labels to booleans.
    """

    def __init__(self):
        pass

    def load(self,
             file_path: str,
             target_column: str,
             positive_class_value: Union[int, str] = 'Yes',
             as_records: bool = True,
             test_size: float = 0.2,
             random_state: int = 42,
             num_records: Union[int, None] = None) -> tuple:
        """
        Loads a CSV dataset, shuffles it, and performs a stratified split into
        training and testing sets. The target labels are converted to booleans.

        Args:
            file_path (str): The path to the CSV file.
            target_column (str): The name of the target variable column.
            positive_class_value (Union[int, str]): The label value that maps to
                True; every other label maps to False.
            as_records (bool): If True, the feature sets are returned as lists
                of dictionaries (one per row); if False, as pandas DataFrames.
            test_size (float): The proportion of the dataset to include in the test split.
            random_state (int): The seed used by the random number generator for reproducibility.
            num_records (int, optional): If given, only the first ``num_records``
                rows of the file are used; the truncation happens before the
                shuffle/split. None (the default) uses every row.

        Returns:
            tuple: A tuple containing (X_train, X_test, y_train, y_test), where
                   y_train and y_test are lists of booleans.
                   Returns (None, None, None, None) if the file is not found.
        """
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            print(f"Error: The file at '{file_path}' was not found.")
            return None, None, None, None

        # Resolve None to "all rows" so the slices below are always integer slices.
        num_records = num_records if num_records is not None else len(df)

        # Separate the features (X) from the target variable (y).
        X = df.drop(columns=[target_column])[:num_records]
        y = df[target_column][:num_records]
        return self._split_data(X, y, target_column, test_size, random_state,
                                positive_class_value, as_records, num_records)

    def load_uci(self,
                 uci_id,
                 target_column: str,
                 positive_class_value: Union[int, str] = 'Yes',
                 as_records: bool = True,
                 test_size: float = 0.2,
                 random_state: int = 42,
                 num_records: Union[int, None] = None) -> tuple:
        """
        Fetches a dataset through ``fetch_ucirepo`` and performs the same
        stratified split and boolean label conversion as ``load``.

        Args:
            uci_id: The dataset id passed to ``fetch_ucirepo(id=...)``.
            target_column (str): The name of the target column inside the
                dataset's targets table.
            positive_class_value (Union[int, str]): The label value that maps to
                True; every other label maps to False.
            as_records (bool): If True, the feature sets are returned as lists
                of dictionaries (one per row); if False, as pandas DataFrames.
            test_size (float): The proportion of the dataset to include in the test split.
            random_state (int): The seed used by the random number generator for reproducibility.
            num_records (int, optional): Upper bound applied to each returned
                split. NOTE: unlike ``load``, the data is not truncated before
                splitting here; the cap is applied to the train and test
                results separately, after the split.

        Returns:
            tuple: A tuple containing (X_train, X_test, y_train, y_test).
        """
        dataset = fetch_ucirepo(id=uci_id)

        # Features and targets as provided by the fetched dataset object.
        X = dataset.data.features
        y = dataset.data.targets
        return self._split_data(X, y, target_column, test_size, random_state,
                                positive_class_value, as_records, num_records)

    def _split_data(self, X, y, target_column, test_size, random_state, positive_class_value, as_records, num_records):
        """
        Performs a shuffled, stratified train/test split and post-processes it.

        Args:
            X: Feature table (pandas DataFrame).
            y: Targets, either a pandas Series (as passed by ``load``) or a
                DataFrame containing ``target_column`` (as passed by ``load_uci``).
            target_column (str): Column to read labels from when ``y`` is a DataFrame.
            test_size (float): Proportion of rows placed in the test split.
            random_state (int): Seed forwarded to ``train_test_split``.
            positive_class_value: Label value that maps to True.
            as_records (bool): If True, convert feature tables to lists of dicts.
            num_records: Upper bound applied to each returned sequence (None = no cap).

        Returns:
            tuple: (x_train, x_test, y_train, y_test) with boolean label lists.
        """
        # stratify=y keeps the class proportions equal in both splits;
        # shuffle=True (the default) is spelled out because stratification needs it.
        x_train, x_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
            shuffle=True,
            stratify=y,
        )

        # Convert the labels to booleans. The helper accepts both a Series and
        # a DataFrame; indexing a Series with the column name would be a row
        # label lookup and raise KeyError.
        y_train = self._labels_to_bool(y_train, target_column, positive_class_value)
        y_test = self._labels_to_bool(y_test, target_column, positive_class_value)

        if as_records:
            x_train = x_train.to_dict(orient='records')
            x_test = x_test.to_dict(orient='records')

        return x_train[:num_records], x_test[:num_records], y_train[:num_records], y_test[:num_records]

    @staticmethod
    def _labels_to_bool(labels, target_column, positive_class_value) -> list:
        """Return ``[label == positive_class_value, ...]`` for the given targets.

        ``labels`` may be a pandas DataFrame (the ``target_column`` column is
        used), a pandas Series (used directly), or any other iterable of labels.
        """
        if isinstance(labels, pd.DataFrame):
            labels = labels[target_column]
        values = labels.to_list() if isinstance(labels, pd.Series) else list(labels)
        return [value == positive_class_value for value in values]

