{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30699,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader\nimport torch.nn.functional as F\nfrom torchvision import datasets, transforms\nimport numpy as np\ndevice = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\nprint(device)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class TorchDataset(torch.utils.data.Dataset):\n\n        def __init__(self, *data, **options):\n            \n            n_data = len(data)\n            if n_data == 0:\n                raise ValueError(\"At least one set required as input\")\n\n            self.data = data\n            means = options.pop('means', None)\n            stds = options.pop('stds', None)\n            self.transform = options.pop('transform', None)\n            self.test = options.pop('test', False)\n            \n            if options:\n                raise TypeError(\"Invalid parameters passed: %s\" % str(options))\n            \n            if means is not None:\n                assert stds is not None, \"must specify both <means> and <stds>\"\n\n                self.normalize = lambda data: [(d - m) / s for d, m, s in zip(data, means, stds)]\n\n            else:\n                self.normalize = lambda data: data\n\n        def __len__(self):\n            return len(self.data[0])\n\n        def __getitem__(self, idx):\n            data = self.normalize([s[idx] for s in self.data])\n            if self.transform:\n\n                if self.test:\n                    data = sum([[self.transform.test_transform(d)] * 2 for d in data], [])\n                else:\n                    data = sum([self.transform(d) for d in data], [])\n                \n            return data","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#@title Synthetic data\ndef set_npseed(seed):\n    np.random.seed(seed)\n\n\ndef set_torchseed(seed):\n    torch.manual_seed(seed)\n    torch.cuda.manual_seed(seed)\n    torch.cuda.manual_seed_all(seed)\n    torch.backends.cudnn.deterministic = True\n    torch.backends.cudnn.benchmark = False\n\n\n#classification data\n\ndef data_gen_decision_tree(num_data=1000, dim=2, seed=0, w_list=None, b_list=None,vals=None, num_levels=2):        \n    set_npseed(seed=seed)\n\n    # Construct a complete decision tree with 2**num_levels-1 internal nodes,\n    # e.g. 
num_levels=2 means there are 3 internal nodes.\n    # w_list and b_list are lists of size num_internal_nodes\n    # vals is a list of size num_leaf_nodes, with values +1 or 0\n    num_internal_nodes = 2**num_levels - 1\n    num_leaf_nodes = 2**num_levels\n    stats = np.zeros(num_internal_nodes+num_leaf_nodes) # number of datapoints reaching each node; node 0 (the root) sees all of them\n\n    if vals is None: # when leaf labels are not provided, generate them dynamically\n        vals = np.arange(0,num_internal_nodes+num_leaf_nodes,1,dtype=np.int32)%2 # assign label 0 or 1 to each node based on whether its index is even or odd\n        vals[:num_internal_nodes] = -99 # internal nodes get the sentinel -99; only leaf values are used as labels\n\n    if w_list is None: # if the node hyperplanes (w, b) are not provided, generate them randomly\n        w_list = np.random.standard_normal((num_internal_nodes, dim))\n        w_list = w_list/np.linalg.norm(w_list, axis=1)[:, None] # unit-norm w vectors\n        b_list = np.zeros((num_internal_nodes))\n\n    # np.random.random_sample returns floats in the half-open interval [0.0, 1.0);\n    # to sample Unif[a, b), b > a, scale the output: (b - a) * random_sample() + a.\n\n#     data_x = np.random.random_sample((num_data, dim))*2 - 1. # generate the data in the range -1 to +1\n#     relevant_stats = data_x @ w_list.T + b_list # x.w^T + b for every (datapoint, node) pair (num_data x num_nodes); > 0 means go down the right subtree, < 0 the left subtree\n#     curr_index = np.zeros(shape=(num_data), dtype=int) # current node index of each datapoint on its path from root to leaf, e.g. from the root (0) a point moves to node 2 if it goes right and to node 1 if it goes left; from node 2 a right move leads to node 6\n\n    data_x = np.random.standard_normal((num_data, dim))\n    data_x /= np.sqrt(np.sum(data_x**2, axis=1, keepdims=True))\n    relevant_stats = data_x @ w_list.T + b_list\n    curr_index = np.zeros(shape=(num_data), dtype=int)\n    \n    for level in range(num_levels):\n        nodes_curr_level = list(range(2**level - 1, 2**(level+1) - 1))\n        for el in nodes_curr_level:\n#             b_list[el]=-1*np.median(relevant_stats[curr_index==el,el])\n            relevant_stats[:,el] += b_list[el]\n        decision_variable = np.choose(curr_index, relevant_stats.T) # pick each datapoint's value at its current node\n\n        # Go down-right if wx+b>0, down-left otherwise,\n        # i.e. 0 -> 1 if w[0].x+b[0] <= 0 and 0 -> 2 otherwise.\n        curr_index = (curr_index+1)*2 - (1-(decision_variable > 0)) # update curr_index based on decision_variable\n        \n\n    bound_dist = np.min(np.abs(relevant_stats), axis=1) # minimum absolute node value per datapoint, i.e. its distance to the closest decision hyperplane
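\n    # Tree indexing: with num_levels=2 there are 3 internal nodes (0, 1, 2) and 4 leaves (3..6); a point at node i moves to\n    # child 2*i+1 if w[i].x + b[i] <= 0 and to child 2*i+2 otherwise, so after num_levels steps curr_index holds a leaf index\n    # and vals[curr_index] below gives the datapoint's label.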
\n    thres = threshold # NOTE: 'threshold' (the pruning margin) is not defined in this function; it is expected to be set as a notebook-level global in another cell\n    labels = vals[curr_index] # label of each datapoint after traversing the whole tree\n\n    data_x_pruned = data_x[bound_dist>thres] # keep a margin around every hyperplane so the 0/1 regions are cleanly separated (classification)\n    # removes all datapoints that pass (almost) exactly through a node hyperplane\n    labels_pruned = labels[bound_dist>thres]\n    relevant_stats = np.sign(data_x_pruned @ w_list.T + b_list) # only the sign (+1/-1) at each node is needed to decide which side a point falls on\n    nodes_active = np.zeros((len(data_x_pruned),  num_internal_nodes+num_leaf_nodes), dtype=np.int32) # whether each node is active (on the path) for each datapoint\n\n    for node in range(num_internal_nodes+num_leaf_nodes):\n        if node==0:\n            stats[node]=len(relevant_stats) # the root sees every datapoint\n            nodes_active[:,0]=1 # every datapoint is active at the root\n            continue\n        parent = (node-1)//2\n        nodes_active[:,node]=nodes_active[:,parent]\n        right_child = node-(parent*2)-1 # 0 means left child, 1 means right child (node 1 has children 3 and 4)\n        # determine whether this node is the right or the left child of its parent\n        if right_child==1:\n            nodes_active[:,node] *= relevant_stats[:,parent]>0 # the right child is active where the parent's value is > 0\n        if right_child==0:\n            nodes_active[:,node] *= relevant_stats[:,parent]<0 # the left child is active where the parent's value is < 0\n        stats = nodes_active.sum(axis=0) # number of datapoints active at each node (all at the root, then split left/right down the tree)\n    return ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)\n\nclass Dataset_syn:\n    def __init__(self, dataset, data_path='./DATA'):\n        if dataset ==\"syn\":\n            # train_data, vali_data, test_data and their label arrays are expected to be created as notebook-level globals in another cell\n            self.X_train = train_data\n            self.y_train = train_data_labels\n            self.X_valid = vali_data\n            self.y_valid = vali_data_labels\n            self.X_test = test_data\n            self.y_test = test_data_labels\n        self.data_path = data_path\n        self.dataset = dataset","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"'''\n# Use the data in DLGN and its variations from https://arxiv.org/abs/2010.04627\n'''\n# Imports for Latent Tree data\n\nimport random\nimport requests\nimport os\nimport torch\nfrom tqdm import tqdm\nimport numpy as np\nimport gzip\nimport shutil\nimport tarfile\nimport bz2\nimport pandas as pd\nimport warnings\n\nfrom pathlib import Path\nfrom sklearn.datasets import load_svmlight_file\nfrom sklearn.datasets import make_swiss_roll\nfrom sklearn.preprocessing import QuantileTransformer\n\nfrom category_encoders import LeaveOneOutEncoder\nfrom category_encoders.ordinal import OrdinalEncoder\nimport zipfile\nimport urllib.request\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import train_test_split, GridSearchCV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neural_network import MLPClassifier\nfrom sklearn.tree import 
DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier\n\nfrom sklearn.neighbors import NearestCentroid\nfrom scipy.io import arff\n# !pip install numpy==1.22.0  # Install a compatible version of NumPy\n# !pip install scipy  # Install or update SciPy\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\nfrom tqdm import tqdm\nimport copy\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import pairwise_distances\nfrom mpl_toolkits.axes_grid1 import make_axes_locatable\nfrom itertools import product as cartesian_prod\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import pairwise_distances\n\nfrom sklearn import tree\nfrom sklearn import cluster, mixture\n\n\nnp.set_printoptions(precision=4)\n\n\n#@title Importing Packages\nimport os\nimport random\nimport pandas as pd\n\nimport torchvision\nimport torchvision.transforms as transforms\n\nimport time\nimport sys\n\ndevice = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\nprint(device)\n\n\n\ndef preprocess_data_adult(data_path):\n    # Read the data into a DataFrame\n    columns = [\n        \"age\", \"workclass\", \"fnlwgt\", \"education\", \"education-num\", \"marital-status\",\n        \"occupation\", \"relationship\", \"race\", \"sex\", \"capital-gain\", \"capital-loss\",\n        \"hours-per-week\", \"native-country\", \"income\"\n    ]\n    df = pd.read_csv(data_path, names=columns, na_values=[\" ?\"])\n\n    # Drop rows with missing values\n    df.dropna(inplace=True)\n\n    # Convert categorical features using Label Encoding\n    categorical_columns = [\"workclass\", \"education\", \"marital-status\", \"occupation\", \"relationship\", \"race\", \"sex\", \"native-country\"]\n    label_encoders = {}\n    for col in categorical_columns:\n        le = LabelEncoder()\n        df[col] = le.fit_transform(df[col])\n        label_encoders[col] = le\n\n    # Encode the target variable\n    df[\"income\"] = df[\"income\"].apply(lambda x: 1 if x == \" >50K\" else 0)\n\n    return df\n\ndef preprocess_data_bank_marketing(data):\n    # Convert categorical features using Label Encoding\n    label_encoders = {}\n    for col in data.select_dtypes(include=['object']).columns:\n        le = LabelEncoder()\n        data[col] = le.fit_transform(data[col])\n        label_encoders[col] = le\n\n    return data\n\ndef preprocess_data_credit_card_defaults(data):\n    # Convert categorical features using one-hot encoding\n    data = pd.get_dummies(data, columns=[\"SEX\", \"EDUCATION\", \"MARRIAGE\"], drop_first=True)\n\n    # Standardize numerical features\n    scaler = StandardScaler()\n    data[[\"LIMIT_BAL\", \"AGE\", \"PAY_0\", \"PAY_2\", \"PAY_3\", \"PAY_4\", \"PAY_5\", \"PAY_6\", \"BILL_AMT1\",\n          \"BILL_AMT2\", \"BILL_AMT3\", \"BILL_AMT4\", \"BILL_AMT5\", \"BILL_AMT6\", \"PAY_AMT1\", \"PAY_AMT2\",\n          \"PAY_AMT3\", \"PAY_AMT4\", \"PAY_AMT5\", \"PAY_AMT6\"]] = scaler.fit_transform(\n        data[[\"LIMIT_BAL\", \"AGE\", \"PAY_0\", \"PAY_2\", \"PAY_3\", \"PAY_4\", \"PAY_5\", \"PAY_6\", \"BILL_AMT1\",\n               \"BILL_AMT2\", \"BILL_AMT3\", \"BILL_AMT4\", \"BILL_AMT5\", \"BILL_AMT6\", \"PAY_AMT1\", \"PAY_AMT2\",\n               \"PAY_AMT3\", \"PAY_AMT4\", \"PAY_AMT5\", \"PAY_AMT6\"]])\n\n    return data\n\n\n\ndef fetch_ADULT(data_dir=\"./ADULT_DATA\"):\n    print(\"---------------------ADULT--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not 
os.path.exists(data_dir):\n        os.makedirs(data_dir)\n        \n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/2/adult.zip\"\n    zip_file_path = os.path.join(data_dir, \"adult.zip\")\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n\n    # Preprocess the data\n    train_data_path = os.path.join(data_dir, \"adult.data\")\n#     test_data_path = os.path.join(data_dir, \"adult.test\")\n   \n    df_train = preprocess_data_adult(train_data_path)\n#     df_test = preprocess_data_adult(test_data_path)\n\n    # Split the data into train, validation, and test sets\n    X = df_train.drop(\"income\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df_train[\"income\"]\n    \n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n#     X_test = df_test.drop(\"income\", axis=1)\n#     y_test = df_test[\"income\"]\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents using shutil.rmtree()\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train, X_valid=X_val.astype('float32'), y_valid=y_val, X_test=X_test.astype('float32'), y_test=y_test\n    )\n\ndef fetch_bank_marketing(data_dir=\"./BANK\"):\n    print(\"---------------------BANK--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/222/bank+marketing.zip\"\n    zip_file_path = os.path.join(data_dir, \"bank_marketing.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n    \n    zip_file_path_bank_add = os.path.join(data_dir, \"bank-additional.zip\")\n    with zipfile.ZipFile(zip_file_path_bank_add, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n\n    # Get the extracted directory path\n    extracted_dir = os.path.join(data_dir, \"bank-additional\")\n\n    # Read the dataset\n    data = pd.read_csv(os.path.join(extracted_dir, \"bank-additional-full.csv\"), sep=';')\n\n    # Preprocess the data\n    data = preprocess_data_bank_marketing(data)\n\n    # Split the data into train, validation, and test sets\n    X = data.drop(\"y\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"y\"]\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and 
its contents\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,X_test=X_test.astype('float32'), y_test=y_test, X_valid = X_val.astype('float32'), y_valid = y_val\n    )\n\ndef fetch_credit_card_defaults(data_dir=\"./CREDIT\"):\n    print(\"---------------------CREDIT--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    !pip install xlrd\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/350/default+of+credit+card+clients.zip\"\n    zip_file_path = os.path.join(data_dir, \"credit_card_defaults.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n\n#     # Get the extracted directory path\n#     extracted_dir = os.path.join(data_dir, \"default+of+credit+card+clients\")\n\n    # Read the dataset\n    data = pd.read_excel(os.path.join(data_dir, \"default of credit card clients.xls\"), skiprows=1)\n\n    # Preprocess the data\n    data = preprocess_data_credit_card_defaults(data)\n\n    # Split the data into train, validation, and test sets\n    X = data.drop(\"default payment next month\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"default payment next month\"]\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train, X_valid=X_val.astype('float32'), y_valid=y_val , X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_gamma_telescope(data_dir=\"./TELESCOPE\"):\n    print(\"---------------------TELESCOPE--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/159/magic+gamma+telescope.zip\"\n    zip_file_path = os.path.join(data_dir, \"magic_gamma_telescope.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n    \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"magic04.data\")\n    columns = [\n        \"fLength\", \"fWidth\", \"fSize\", \"fConc\", \"fConc1\", \"fAsym\", \"fM3Long\",\n        \"fM3Trans\", \"fAlpha\", \"fDist\", \"class\"\n    ]\n    data = pd.read_csv(data_path, header=None, names=columns)\n    \n    # Convert the class labels to binary format (g = gamma, h = hadron)\n    data[\"class\"] = data[\"class\"].map({\"g\": 1, \"h\": 0})\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"class\"]\n    \n    # Split the data into train, test, and 
validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_rice_dataset(data_dir=\"./RICE\"):\n    print(\"---------------------RICE--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/545/rice+cammeo+and+osmancik.zip\"\n    zip_file_path = os.path.join(data_dir, \"rice_dataset.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    arff_file_name = os.path.join(data_dir, \"Rice_Cammeo_Osmancik.arff\")\n\n    \n    # Load the ARFF file using SciPy\n    data, meta = arff.loadarff(arff_file_name)\n    \n    df = pd.DataFrame(data)\n    print(\"df\",df)\n    df[\"Class\"] = df[\"Class\"].map({b'Cammeo': 1, b'Osmancik': 0})\n    \n    # Split the data into features (X) and target (y)\n    X = df.drop(\"Class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[\"Class\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_german_credit_data(data_dir=\"./GERMAN\"):\n    print(\"---------------------GERMAN--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"http://archive.ics.uci.edu/static/public/144/statlog+german+credit+data.zip\"\n    zip_file_path = os.path.join(data_dir, \"german_credit_data.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        
zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"german.data\")\n\n    columns = [\n        \"checking_account_status\", \"duration_months\", \"credit_history\", \"purpose\",\n        \"credit_amount\", \"savings_account_bonds\", \"employment\", \"installment_rate\",\n        \"personal_status_sex\", \"other_debtors_guarantors\", \"present_residence\",\n        \"property\", \"age\", \"other_installment_plans\", \"housing\", \"existing_credits\",\n        \"job\", \"num_dependents\", \"own_telephone\", \"foreign_worker\", \"class\"\n    ]\n    data = pd.read_csv(data_path, sep=' ', header=None, names=columns)\n    \n    # Convert the class labels to binary format (1 = Good, 2 = Bad)\n    data[\"class\"] = data[\"class\"].map({1: 1, 2: 0})\n    \n    # Handle null values (replace with appropriate values)\n    data.fillna(method='ffill', inplace=True)  # Forward fill\n    \n    # Convert categorical variables to dummy variables\n    categorical_columns = [\n        \"checking_account_status\", \"credit_history\", \"purpose\", \"savings_account_bonds\",\n        \"employment\", \"personal_status_sex\", \"other_debtors_guarantors\", \"property\",\n        \"other_installment_plans\", \"housing\", \"job\", \"own_telephone\", \"foreign_worker\"\n    ]\n    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"class\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_spambase_dataset(data_dir=\"./SPAM\"):\n    print(\"---------------------SPAM--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"http://archive.ics.uci.edu/static/public/94/spambase.zip\"\n    zip_file_path = os.path.join(data_dir, \"spambase.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"spambase.data\")\n\n    columns = [\n        f\"f{i}\" for i in range(57)\n    ] + [\"spam\"]\n    data = pd.read_csv(data_path, header=None, names=columns)\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"spam\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"spam\"]\n    \n    # 
Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_accelerometer_gyro_dataset(data_dir=\"./GYRO\"):\n    print(\"---------------------GYRO--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/755/accelerometer+gyro+mobile+phone+dataset.zip\"\n    zip_file_path = os.path.join(data_dir, \"accelerometer_gyro_dataset.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"accelerometer_gyro_mobile_phone_dataset.csv\")\n    \n    data = pd.read_csv(data_path)\n    \n    # Convert categorical column to numeric (e.g., label encoding)\n    data[\"timestamp\"] = data[\"timestamp\"].astype(\"category\").cat.codes\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"Activity\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"Activity\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_swarm_behaviour(data_dir=\"./SWARM\"):\n    print(\"---------------------SWARM--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/524/swarm+behaviour.zip\"\n    zip_file_path = os.path.join(data_dir, \"swarm_behaviour.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with 
zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"Swarm Behavior Data/Grouped.csv\")\n    \n    data = pd.read_csv(data_path)\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"Class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"Class\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir) \n    return data_splits\n\n\ndef fetch_openml_credit_data(data_dir=\"./OpenML_Credit\"):\n    print(\"---------------------OpenML_Credit DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103185/credit.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"credit.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n    df[last_column] = df[last_column].astype(int)\n    \n#     print(\"df\",df)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_electricity_data(data_dir=\"./OpenML_Electricity\"):\n    print(\"---------------------OpenML_Electricity DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = 
\"https://api.openml.org/data/v1/download/22103245/electricity.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"electricity.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n    df[last_column] = df[last_column].map({b'DOWN': 0, b'UP': 1})\n    \n#     print(\"df\",df)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_covertype_data(data_dir=\"./OpenML_Covertype\"):\n    print(\"---------------------OpenML_Covertype DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103246/covertype.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"covertype.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n    df[last_column] = df[last_column].astype(int)\n    \n#     print(\"df\",df)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_pol_data(data_dir=\"./OpenML_Pol\"):\n    
print(\"---------------------OpenML_Pol DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103247/pol.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"pol.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    \n    df[last_column] = df[last_column].map({b'N':0,b'P':1})\n    \n    \n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_house_16H_data(data_dir=\"./OpenML_House_16H\"):\n    print(\"---------------------OpenML_House_16H DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103248/house_16H.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"house_16H.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    df[last_column] = df[last_column].map({b'N':0,b'P':1})\n    \n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        
X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_MiniBooNE_data(data_dir=\"./OpenML_MiniBooNE\"):\n    print(\"---------------------OpenML_MiniBooNE DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103253/MiniBooNE.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"MiniBooNE.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    \n    df[last_column] = df[last_column].map({b'False':0,b'True':1})\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_eye_movements_data(data_dir=\"./OpenML_Eye_movements\"):\n    print(\"---------------------OpenML_Eye_movements DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103255/eye_movements.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"eye_movements.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = 
(y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_Diabetes130US_data(data_dir=\"./OpenML_Diabetes130US\"):\n    print(\"---------------------OpenML_Diabetes130US DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111908/Diabetes130US.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"Diabetes130US.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n    df[last_column] = df[last_column].astype(int)\n    \n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_jannis_data(data_dir=\"./OpenML_Jannis\"):\n    print(\"---------------------OpenML_Jannis DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111907/jannis.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"jannis.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, 
random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_Bioresponse_data(data_dir=\"./OpenML_Bioresponse\"):\n    print(\"---------------------OpenML_Bioresponse DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111905/Bioresponse.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"Bioresponse.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_california_data(data_dir=\"./OpenML_California\"):\n    print(\"---------------------OpenML_California DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111914/california.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"california.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = 
scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_heloc_data(data_dir=\"./OpenML_Heloc\"):\n    print(\"---------------------OpenML_Heloc DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111912/heloc.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"heloc.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n#**class Dataset:**\n\nREAL_DATASETS = {\n#     'A9A': fetch_A9A,\n#     'EPSILON': fetch_EPSILON,\n#     'PROTEIN': fetch_PROTEIN,\n#     'YEAR': fetch_YEAR,\n#     'MICROSOFT': fetch_MICROSOFT,\n#     'YAHOO': fetch_YAHOO,\n#     'CLICK': fetch_CLICK,\n#     'GLASS': fetch_GLASS,\n#     'COVTYPE': fetch_COVTYPE,\n#     'ALOI': fetch_ALOI,\n#     'DIGITS': fetch_DIGITS,\n#     'MUSH': fetch_MUSHROOMS,\n#     'TTT': fetch_TICTACTOE,\n    ####### 10 latest UCI datasets ########\n    'ADULT': fetch_ADULT,\n    'bank_marketing': fetch_bank_marketing,\n    'credit_card_defaults': fetch_credit_card_defaults,\n    'gamma_telescope': fetch_gamma_telescope,\n    'rice_dataset': fetch_rice_dataset,\n    'german_credit_data': fetch_german_credit_data,\n    'spambase_dataset': fetch_spambase_dataset,\n    
'accelerometer_gyro_dataset': fetch_accelerometer_gyro_dataset,\n    'swarm_behaviour': fetch_swarm_behaviour,\n#     'HIGGS': fetch_HIGGS,\n    ######## OpenML Tabular Datasets ##########\n    'OpenML_Credit': fetch_openml_credit_data,\n    'OpenML_Electricity': fetch_openml_electricity_data,\n    'OpenML_Covertype': fetch_openml_covertype_data,\n    'OpenML_Pol': fetch_openml_pol_data,\n    'OpenML_House_16H': fetch_openml_house_16H_data,\n    'OpenML_MiniBooNE': fetch_openml_MiniBooNE_data,\n    'OpenML_Eye_movements': fetch_openml_eye_movements_data,\n    'OpenML_Diabetes130US': fetch_openml_Diabetes130US_data,\n    'OpenML_Jannis': fetch_openml_jannis_data,\n    'OpenML_Bioresponse': fetch_openml_Bioresponse_data,\n    'OpenML_California': fetch_openml_california_data,\n    'OpenML_Heloc': fetch_openml_heloc_data\n}\n\nTOY_DATASETS = [\n    'xor',\n    'reg-xor',\n    'swissroll',\n]\n\nclass Dataset:\n    def __init__(self, dataset, data_path='./DATA', normalize=False, normalize_target=False, quantile_transform=False, quantile_noise=1e-3, in_features=None, out_features=None, flatten=False, **kwargs):\n        \"\"\"\n        Dataset is a dataclass that contains all training and evaluation data required for an experiment\n        :param dataset: a pre-defined dataset name (see DATASETS) or a custom dataset\n            Your dataset should be at (or will be downloaded into) {data_path}/{dataset}\n        :param data_path: a shared data folder path where the dataset is stored (or will be downloaded into)\n        :param normalize: standardize features by removing the mean and scaling to unit variance\n        :param quantile_transform: whether tranform the feature distributions into normals, using a quantile transform\n        :param quantile_noise: magnitude of the quantile noise\n        :param in_features: which features to use as inputs\n        :param out_features: which features to reconstruct as output\n        :param flatten: whether flattening instances to vectors\n        :param kwargs: depending on the dataset, you may select train size, test size or other params\n        \"\"\"\n\n        if dataset in REAL_DATASETS:\n            data_dict = REAL_DATASETS[dataset](Path(data_path) / dataset, **kwargs)\n\n            self.X_train = data_dict['X_train']\n            self.y_train = data_dict['y_train']\n            self.X_valid = data_dict['X_valid']\n            self.y_valid = data_dict['y_valid']\n            self.X_test = data_dict['X_test']\n            self.y_test = data_dict['y_test']\n\n            if flatten:\n                self.X_train, self.X_valid, self.X_test = self.X_train.reshape(len(self.X_train), -1), self.X_valid.reshape(len(self.X_valid), -1), self.X_test.reshape(len(self.X_test), -1)\n\n            if normalize:\n\n                print(\"Normalize dataset\")\n                axis = [0] + [i + 2 for i in range(self.X_train.ndim - 2)]\n                self.mean = np.mean(self.X_train, axis=tuple(axis), dtype=np.float32)\n                self.std = np.std(self.X_train, axis=tuple(axis), dtype=np.float32)\n\n                # if constants, set std to 1\n                self.std[self.std == 0.] 
= 1.\n\n                if dataset not in ['ALOI']:\n                    self.X_train = (self.X_train - self.mean) / self.std\n                    self.X_valid = (self.X_valid - self.mean) / self.std\n                    self.X_test = (self.X_test - self.mean) / self.std\n\n            if quantile_transform:\n                quantile_train = np.copy(self.X_train)\n                if quantile_noise:\n                    stds = np.std(quantile_train, axis=0, keepdims=True)\n                    noise_std = quantile_noise / np.maximum(stds, quantile_noise)\n                    quantile_train += noise_std * np.random.randn(*quantile_train.shape)\n\n                qt = QuantileTransformer(output_distribution='normal').fit(quantile_train)\n                self.X_train = qt.transform(self.X_train)\n                self.X_valid = qt.transform(self.X_valid)\n                self.X_test = qt.transform(self.X_test)\n\n            if normalize_target:\n\n                print(\"Normalize target value\")\n                self.mean_y = np.mean(self.y_train, axis=0, dtype=np.float32)\n                self.std_y = np.std(self.y_train, axis=0, dtype=np.float32)\n\n                # if constants, set std to 1\n                if self.std_y == 0.:\n                    self.std_y = 1.\n\n                self.y_train = (self.y_train - self.mean_y) / self.std_y\n                self.y_valid = (self.y_valid - self.mean_y) / self.std_y\n                self.y_test = (self.y_test - self.mean_y) / self.std_y\n\n            if in_features is not None:\n                self.X_train_in, self.X_valid_in, self.X_test_in = self.X_train[:, in_features], self.X_valid[:, in_features], self.X_test[:, in_features]\n\n            if out_features is not None:\n                self.X_train_out, self.X_valid_out, self.X_test_out = self.X_train[:, out_features], self.X_valid[:, out_features], self.X_test[:, out_features]\n\n        elif dataset in TOY_DATASETS:\n            data_dict = toy_dataset(distr=dataset, **kwargs)\n\n            self.X = data_dict['X']\n            self.Y = data_dict['Y']\n            if 'labels' in data_dict:\n                self.labels = data_dict['labels']\n\n        self.data_path = data_path\n        self.dataset = dataset\n\nclass TorchDataset(torch.utils.data.Dataset):\n\n    def __init__(self, *data, **options):\n        \n        n_data = len(data)\n        if n_data == 0:\n            raise ValueError(\"At least one set required as input\")\n\n        self.data = data\n        means = options.pop('means', None)\n        stds = options.pop('stds', None)\n        self.transform = options.pop('transform', None)\n        self.test = options.pop('test', False)\n        \n        if options:\n            raise TypeError(\"Invalid parameters passed: %s\" % str(options))\n         \n        if means is not None:\n            assert stds is not None, \"must specify both <means> and <stds>\"\n\n            self.normalize = lambda data: [(d - m) / s for d, m, s in zip(data, means, stds)]\n\n        else:\n            self.normalize = lambda data: data\n\n    def __len__(self):\n        return len(self.data[0])\n\n    def __getitem__(self, idx):\n        data = self.normalize([s[idx] for s in self.data])\n        if self.transform:\n\n            if self.test:\n                data = sum([[self.transform.test_transform(d)] * 2 for d in data], [])\n            else:\n                data = sum([self.transform(d) for d in data], [])\n            \n        return 
data","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"**TAO**","metadata":{}},{"cell_type":"code","source":"import random\nfrom copy import deepcopy\nfrom queue import deque\n\nimport numpy as np\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn import datasets\nfrom sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.metrics import get_scorer\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text\nfrom sklearn.utils import check_X_y\n\nimport pandas as pd\nfrom sklearn.base import ClassifierMixin\nfrom sklearn.utils.validation import check_X_y, check_array\nfrom sklearn.utils.multiclass import check_classification_targets\nimport scipy.sparse\n\ndef check_fit_arguments(model, X, y, feature_names):\n    \"\"\"Process arguments for fit and predict methods.\n    \"\"\"\n    if isinstance(model, ClassifierMixin):\n        model.classes_, y = np.unique(y, return_inverse=True)  # deals with str inputs\n        check_classification_targets(y)\n\n    if feature_names is None:\n        if isinstance(X, pd.DataFrame):\n            model.feature_names_ = X.columns\n        elif isinstance(X, list):\n            model.feature_names_ = ['X' + str(i) for i in range(len(X[0]))]\n        else:\n            model.feature_names_ = ['X' + str(i) for i in range(X.shape[1])]\n    else:\n        model.feature_names_ = feature_names\n    if scipy.sparse.issparse(X):\n        X = X.toarray()\n    X, y = check_X_y(X, y)\n    _, model.n_features_in_ = X.shape\n    assert len(model.feature_names_) == model.n_features_in_, 'feature_names should be same size as X.shape[1]'\n    y = y.astype(float)\n    return X, y, model.feature_names_\n\ndef check_fit_X(X):\n    \"\"\"Process X argument for fit and predict methods.\n    \"\"\"\n    if scipy.sparse.issparse(X):\n        X = X.toarray()\n    X = check_array(X)\n    return X\n\n\nclass TaoTree(BaseEstimator):\n\n    def __init__(self, model_type: str = 'CART',\n                 n_iters: int = 20,\n                 model_args: dict = {'max_leaf_nodes': 15},\n                 randomize_tree=False,\n                 update_scoring='accuracy',\n                 min_node_samples_tao=3,\n                 min_leaf_samples_tao=2,\n                 node_model='stump',\n                 node_model_args: dict = {},\n                 reg_param: float = 1e-3,\n                 weight_errors: bool = False,\n                 verbose: int = 0,\n                 ):\n        \"\"\"TAO: Alternating optimization of decision trees, with application to learning sparse oblique trees (Neurips 2018)\n        https://proceedings.neurips.cc/paper/2018/hash/185c29dc24325934ee377cfda20e414c-Abstract.html\n        Note: this implementation learns single-feature splits rather than oblique trees.\n\n        Currently supports\n        - given a CART tree, posthoc improve it with TAO\n            - also works with HSTreeCV\n\n        Todo\n        - update bottom to top otherwise input points don't get updated\n        - update leaf nodes\n        - support regression\n        - support FIGS\n        - support error-weighting\n        - support oblique trees\n            - support generic models at decision node\n            - support pruning (e.g. 
if weights -> 0, then remove a node)\n        - support classifiers in leaves\n\n        Parameters\n        ----------\n\n        model_type: str\n            'CART' or 'FIGS'\n\n        n_iters\n            Number of iterations to run TAO\n\n        model_args\n            Arguments to pass to the model\n\n        randomize_tree\n            Whether to randomize the tree before each iteration\n\n        update_scoring: str\n            Scoring metric (passed to sklearn's get_scorer) used to accept or reject each node update\n\n        min_node_samples_tao: int\n            Minimum number of samples in a node to apply TAO\n\n        min_leaf_samples_tao: int\n            Minimum number of samples in a leaf to update its value (regression only)\n\n        node_model: str\n            'stump' or 'linear'\n\n        node_model_args: dict\n            Arguments to pass to the node-wise model\n\n        reg_param\n            Regularization parameter for node-wise linear model (if node_model is 'linear')\n\n        weight_errors: bool\n            Whether to weight samples by how much switching their child assignment changes the error\n\n        verbose: int\n            Verbosity level\n        \"\"\"\n        super().__init__()\n        self.model_type = model_type\n        self.n_iters = n_iters\n        self.model_args = model_args\n        self.randomize_tree = randomize_tree\n        self.update_scoring = update_scoring\n        self.min_node_samples_tao = min_node_samples_tao\n        self.min_leaf_samples_tao = min_leaf_samples_tao\n        self.node_model = node_model\n        self.node_model_args = node_model_args\n        self.reg_param = reg_param\n        self.weight_errors = weight_errors\n        self.verbose = verbose\n        self._init_prediction_task()  # decides between regressor and classifier\n\n    def _init_prediction_task(self):\n        \"\"\"\n        TaoTreeRegressor and TaoTreeClassifier may override this method\n        to alter the prediction task. When using this class directly,\n        the prediction task defaults to classification.\n        \"\"\"\n        self.prediction_task = 'classification'\n\n    def fit(self, X, y=None, feature_names=None, sample_weight=None):\n        \"\"\"\n        Params\n        ------\n        sample_weight: array-like of shape (n_samples,), default=None\n            Sample weights. 
If None, then samples are equally weighted.\n            Splits that would create child nodes with net zero or negative weight\n            are ignored while searching for a split in each node.\n        \"\"\"\n        X, y, feature_names = check_fit_arguments(self, X, y, feature_names)\n        if isinstance(self, RegressorMixin):\n            raise Warning('TAO Regression is not yet tested')\n        X, y = check_X_y(X, y)\n        y = y.astype(float)\n        if feature_names is not None:\n            self.feature_names_ = feature_names\n        if self.model_type == 'CART':\n            if isinstance(self, ClassifierMixin):\n                self.model = DecisionTreeClassifier(**self.model_args)\n            elif isinstance(self, RegressorMixin):\n                self.model = DecisionTreeRegressor(**self.model_args)\n            self.model.fit(X, y, sample_weight=sample_weight)\n            if self.verbose>1:\n                print(export_text(self.model))\n            # plot_tree(self.model)\n            # plt.savefig('/Users/chandan/Desktop/tree.png', dpi=300)\n            # plt.show()\n\n        if self.randomize_tree:\n            np.random.shuffle(self.model.tree_.feature)  # shuffle CART features\n            # np.random.shuffle(self.model.tree_.threshold)\n            for i in range(self.model.tree_.node_count):  # split on feature medians\n                self.model.tree_.threshold[i] = np.median(\n                    X[:, self.model.tree_.feature[i]])\n        if self.verbose:\n            print('starting score', self.model.score(X, y))\n        for i in range(self.n_iters):\n            num_updates = self._tao_iter_cart(X, y, self.model.tree_, sample_weight=sample_weight)\n            if num_updates == 0:\n                break\n\n        return self\n\n    def _tao_iter_cart(self, X, y, tree, X_score=None, y_score=None, sample_weight=None):\n        \"\"\"Updates tree by applying the tao algorithm to the tree\n        Params\n        ------\n        X: array-like of shape (n_samples, n_features)\n            The input samples.\n        y: array-like of shape (n_samples,)\n            The target values.\n        model: DecisionTreeClassifier.tree_ or DecisionTreeRegressor.tree_\n            The model to be post-hoc improved\n        \"\"\"\n\n        # Tree properties\n        children_left = tree.children_left\n        children_right = tree.children_right\n        feature = tree.feature\n        threshold = tree.threshold\n        value = tree.value\n        \n#         print(\"X\",X)\n#         print(\"y\",y)\n#         print(\"tree\",tree)\n        \n#         print(\"children_left\",children_left)\n#         print(\"children_right\",children_right)\n#         print(\"feature\",feature)\n#         print(\"threshold\",threshold)\n#         print(\"value\",value)\n\n        # For each node, store the path to that node #######################################################\n        indexes_with_prefix_paths = []  # data structure with (index, path_to_node_index)\n        # e.g. 
if if node 3 is the left child of node 1 which is the right child of node 0\n        # then we get (3, [(0, R), (1, L)])\n\n        # start with the root node id (0) and its depth (0)\n        queue = deque()\n        queue.append((0, []))\n        while len(queue) > 0:\n            node_id, path_to_node_index = queue.popleft()\n            indexes_with_prefix_paths.append((node_id, path_to_node_index))\n\n            # If a split node, append left and right children and depth to queue\n            if children_left[node_id] != children_right[node_id]:\n                queue.append((children_left[node_id], path_to_node_index + [(node_id, 'L')]))\n                queue.append((children_right[node_id], path_to_node_index + [(node_id, 'R')]))\n#         print(indexes_with_prefix_paths)\n\n        num_updates = 0\n\n        # Reversing BFS queue presents nodes bottom -> top one level at a time\n        for (node_id, path_to_node_index) in reversed(indexes_with_prefix_paths):\n            # For each each node, try a TAO update\n#             print('node_id', node_id, path_to_node_index)\n\n            # Compute the points being input to the node ######################################\n            def filter_points_by_path(X, y, path_to_node_index):\n                \"\"\"Returns the points in X that are in the path to the node\"\"\"\n                for node_id, direction in path_to_node_index:\n                    idxs = X[:, feature[node_id]] <= threshold[node_id]  ##### Change\n                    if direction == 'R':\n                        idxs = ~idxs\n                    # print('idxs', idxs.size, idxs.sum())\n                    X = X[idxs]\n                    y = y[idxs]\n                return X, y\n\n            X_node, y_node = filter_points_by_path(X, y, path_to_node_index)\n\n            if sample_weight is not None:\n                sample_weight_node = filter_points_by_path(X, sample_weight, path_to_node_index)[1]\n            else:\n                sample_weight_node = np.ones(y_node.size)\n\n            # Skip over leaf nodes and nodes with too few samples ######################################\n            if children_left[node_id] == children_right[node_id]:  # is leaf node\n                if isinstance(self, RegressorMixin) and X_node.shape[0] >= self.min_leaf_samples_tao:\n                    # old_score = self.model.score(X, y)\n                    value[node_id] = np.mean(y_node)\n                    \"\"\"\n                    new_score = self.model.score(X, y)\n                    if new_score > old_score:\n                        print(f'\\tLeaf improved score from {old_score:0.3f} to {new_score:0.3f}')\n                    if new_score < old_score:\n                        print(f'\\tLeaf reduced score from {old_score:0.3f} to {new_score:0.3f}')\n                        # raise ValueError('Leaf update reduced score')\n                    \"\"\"\n                # print('\\tshapes', X_node.shape, y_node.shape)\n                # print('\\tvals:', value[node_id][0][0], np.mean(y_node))\n                # assert value[node_id][0][0] == np.mean(y_node), 'unless tree changed, vals should be leaf means'\n                continue\n            elif X_node.shape[0] < self.min_node_samples_tao:\n                continue\n\n            # Compute the outputs for these points if they go left or right ######################################\n            def predict_from_node(X, node_id):\n                \"\"\"Returns predictions for X starting at node node_id\"\"\"\n\n           
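     # The nested helper below routes each row of X down the current subtree\n                # rooted at node_id and returns the subtree's prediction for that row\n                # (argmax over the stored class counts for classifiers, the stored\n                # leaf value for regressors).  Its outputs for the left and right\n                # child are used just after to build the binary pseudo-targets\n                # (0 = send the sample left, 1 = send it right) that the TAO node\n                # update then fits with a per-feature stump or logistic model.\n           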
     def predict_from_node(x, node_id):\n                    \"\"\"Returns predictions for x starting at node node_id\"\"\"\n                    if children_left[node_id] == children_right[node_id]:\n                        if isinstance(self, RegressorMixin):\n                            return value[node_id]\n                        if isinstance(self, ClassifierMixin):\n                            return np.argmax(value[node_id])  # note value stores counts for each class\n                    if x[feature[node_id]] <= threshold[node_id]: ##### Change\n                        return predict_from_node(x, children_left[node_id])\n                    else:\n                        return predict_from_node(x, children_right[node_id])\n\n                preds = np.zeros(X.shape[0])\n                for i in range(X.shape[0]):\n                    preds[i] = predict_from_node(X[i], node_id)\n                return preds\n\n            y_node_left = predict_from_node(X_node, children_left[node_id])\n            y_node_right = predict_from_node(X_node, children_right[node_id])\n            if node_id == 0:  # root node\n                assert np.all(np.logical_or(self.model.predict(X_node) == y_node_left,\n                                            self.model.predict(\n                                                X_node) == y_node_right)), \\\n                    'actual predictions should match either predict_from_node left or right'\n\n            # Decide on prediction target (want to go left (0) / right (1) when advantageous)\n            # TAO paper binarizes these (e.g. predict 0 or 1 depending on which of these is better)\n            y_node_absolute_errors = np.abs(np.vstack((y_node - y_node_left,\n                                                       y_node - y_node_right))).T\n\n            # screen out indexes where going left/right has no effect\n            idxs_relevant = y_node_absolute_errors[:, 0] != y_node_absolute_errors[:, 1]\n            if idxs_relevant.sum() <= 1:  # nothing to change\n                if self.verbose:\n                    print('no errors to change')\n                continue\n            # assert np.all((self.model.predict(X) != y)[idxs_relevant]), 'relevant indexes should be errors'\n            y_node_target = np.argmin(y_node_absolute_errors, axis=1)\n            y_node_target = y_node_target[idxs_relevant]\n\n            # here, we optionally weight these errors by the size of the error\n            # if we want this to work for classification, must switch to predict_proba\n            # if self.prediction_task == 'regression':\n            # weight by the difference in error ###############################################################\n            if self.weight_errors:\n                sample_weight_node *= np.abs(y_node_absolute_errors[:, 1] - y_node_absolute_errors[:, 0])\n            sample_weight_node_target = sample_weight_node[idxs_relevant]\n            X_node = X_node[idxs_relevant]\n\n            # Fit a 1-variable binary classification model on these outputs ######################################\n            # Note: this could be customized (e.g. 
for sparse oblique trees)\n            best_score = -np.inf\n            best_feat_num = None\n            for feat_num in range(X.shape[1]):\n                if isinstance(self, ClassifierMixin):\n                    if self.node_model == 'linear':\n                        m = LogisticRegression(**self.node_model_args)\n                    elif self.node_model == 'stump':\n                        m = DecisionTreeClassifier(max_depth=1, **self.node_model_args)\n                if isinstance(self, RegressorMixin):\n                    if self.node_model == 'linear':\n                        m = LinearRegression(**self.node_model_args)\n                    elif self.node_model == 'stump':\n                        m = DecisionTreeRegressor(max_depth=1, **self.node_model_args)\n                X_node_single_feat = X_node[:, feat_num: feat_num + 1]\n                m.fit(X_node_single_feat, y_node_target, sample_weight=sample_weight_node_target)\n                score = m.score(X_node_single_feat, y_node_target, sample_weight=sample_weight_node_target)\n                if score > best_score:\n                    best_score = score\n                    best_feat_num = feat_num\n                    best_model = deepcopy(m)\n                    if self.node_model == 'linear':\n                        best_threshold = -best_model.intercept_ / best_model.coef_[0]\n                    elif self.node_model == 'stump':\n                        best_threshold = best_model.tree_.threshold[0]\n            # print((feature[node_id], threshold[node_id]), '\\n->',\n            #       (best_feat_num, best_threshold))\n\n            # Update the node with the new feature / threshold ######################################\n            old_feat_num = feature[node_id]\n            old_threshold = threshold[node_id]\n            # print(X.sum(), y.sum())\n\n            if X_score is None:\n                X_score = X\n            if y_score is None:\n                y_score = y\n\n            scorer = get_scorer(self.update_scoring)\n\n            old_score = scorer(self.model, X_score, y_score)\n\n            feature[node_id] = best_feat_num\n            threshold[node_id] = best_threshold\n            new_score = scorer(self.model, X_score, y_score)\n\n            # debugging\n            if self.verbose > 1:\n                if old_score == new_score:\n                    print('\\tno change', best_feat_num, old_feat_num)\n                print(f'\\tscore_total {old_score:0.4f} -> {new_score:0.4f}')\n            if old_score >= new_score:\n                feature[node_id] = old_feat_num\n                threshold[node_id] = old_threshold\n            else:\n                # (Track if any updates were necessary)\n                num_updates += 1\n                if self.verbose > 0:\n                    print(f'Improved score from {old_score:0.4f} to {new_score:0.4f}')\n\n            # debugging snippet (if score_m_new > score_m_old, then new_score should be > old_score, but it isn't!!!!)\n            if self.verbose > 1:\n                \"\"\"\n                X_node_single_feat = X_node[:, best_feat_num: best_feat_num + 1]\n                score_m_new = best_model.score(X_node_single_feat, y_node_target, sample_weight=sample_weight)\n                best_model.tree_.feature[0] = old_feat_num\n                best_model.tree_.threshold[0] = old_threshold\n                X_node_single_feat = X_node[:, old_feat_num: old_feat_num + 1]\n                score_m_old = best_model.score(X_node_single_feat, 
y_node_target, sample_weight=sample_weight)\n                print('\\t\\t', f'score_local {score_m_old:0.4f} -> {score_m_new:0.4f}')\n                \"\"\"\n\n        return num_updates\n\n    def predict(self, X):\n        return self.model.predict(X)\n\n    def predict_proba(self, X):\n        return self.model.predict_proba(X)\n\n    def score(self, X, y):\n        return self.model.score(X, y)\n\n\nclass TaoTreeRegressor(TaoTree, RegressorMixin):\n    pass\n\nclass TaoTreeClassifier(TaoTree, ClassifierMixin):\n    pass\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"**Change the below cell to run on Synthetic or Tabular dataset on different hyperparameters**","metadata":{}},{"cell_type":"code","source":"DATA_NAME=\"syn\" #\"Tabular\" use \"Tabular\" to run on tabular datasets","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"if DATA_NAME == \"syn\":\n    np.random.seed(13)\n    random.seed(13)\n    seed=365\n    num_levels=4\n    threshold = 0 #data seperation distance\n\n\n\n    data_configs = [\n        {\"input_dim\": 20, \"num_data\": 40000},\n        {\"input_dim\": 100, \"num_data\": 60000},\n        {\"input_dim\": 500, \"num_data\": 100000}\n    ]\n\n    # Code block to run for each dictionary\n    for config in data_configs:\n        input_dim = config[\"input_dim\"]\n        num_data = config[\"num_data\"]\n\n        print(f\"Running code for input_dim={input_dim}, num_data={num_data}\")\n\n        ((data_x, labels), (w_list, b_list, vals), stats) = data_gen_decision_tree(\n                                                    dim=input_dim, seed=seed, num_levels=num_levels,\n                                                    num_data=num_data)\n        seed_set=seed\n        w_list_old = np.array(w_list)\n        b_list_old = np.array(b_list)\n        print(sum(labels==1))\n        print(sum(labels==0))\n    #     print(labels.shape)\n    #     print(vals)\n    #     print(stats)\n        print(\"Seed= \",seed_set)\n        num_data = len(data_x)\n        num_train= num_data//2\n        num_vali = num_data//4\n        num_test = num_data//4\n        train_data = data_x[:num_train,:]\n        train_data_labels = labels[:num_train]\n\n        vali_data = data_x[num_train:num_train+num_vali,:]\n        vali_data_labels = labels[num_train:num_train+num_vali]\n\n        test_data = data_x[num_train+num_vali :,:]\n        test_data_labels = labels[num_train+num_vali :]\n\n        m = TaoTreeClassifier(randomize_tree=False, weight_errors=False,\n                              node_model='linear', model_args={'max_depth': 5},\n                              verbose=1)\n\n    #     ###Testing###\n        DATA_NAME = \"syn\"\n        data = Dataset_syn(DATA_NAME)\n        print(data.X_train.shape)\n        print(data.y_train.shape)\n\n        m.fit(data.X_train, data.y_train)\n        print('Train acc', np.mean(m.predict(data.X_train) == data.y_train))\n        print('Test acc', np.mean(m.predict(data.X_test) == data.y_test))\n\nelse:\n#     DATA_NAME=[\"ADULT\",\"bank_marketing\",\"credit_card_defaults\",\"gamma_telescope\",\"rice_dataset\",\"german_credit_data\",\"spambase_dataset\",\"accelerometer_gyro_dataset\",\"swarm_behaviour\"]#,\"HIGGS\"]\n    
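# Every name below must be a key of REAL_DATASETS; the matching fetcher downloads\n    # and preprocesses that dataset into train/valid/test splits.\n    # Minimal single-dataset sketch (illustrative only; mirrors the loop below):\n    #     data = Dataset('OpenML_Electricity')\n    #     m = TaoTreeClassifier(node_model='linear', model_args={'max_depth': 5})\n    #     m.fit(data.X_train, data.y_train)\n    #     print('Test acc', np.mean(m.predict(data.X_test) == data.y_test))\n    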
DATA_NAME=[\"OpenML_Credit\",\"OpenML_Electricity\",\"OpenML_Pol\",\"OpenML_House_16H\",\"OpenML_MiniBooNE\",\"OpenML_Eye_movements\",\"OpenML_Diabetes130US\",\"OpenML_Jannis\",\"OpenML_Bioresponse\",\"OpenML_California\",\"OpenML_Heloc\",\"OpenML_Covertype\",\"bank_marketing\",\"credit_card_defaults\",\"gamma_telescope\",\"rice_dataset\",\"german_credit_data\",\"spambase_dataset\",\"accelerometer_gyro_dataset\",\"swarm_behaviour\"]#,\"HIGGS\"]\n\n#     DATA_NAME=[\"ADULT\"]\n    for data_name in DATA_NAME:\n        data = Dataset(data_name)\n    #     print(data)\n        print(\"=========================================================================================\")\n        m = TaoTreeClassifier(randomize_tree=False, weight_errors=False,\n                              node_model='linear', model_args={'max_depth': 5},\n                              verbose=1)\n        print(data.X_train.shape)\n        print(data.y_train.shape)\n\n        m.fit(data.X_train, data.y_train)\n        print('Train acc', np.mean(m.predict(data.X_train) == data.y_train))\n        print('Test acc', np.mean(m.predict(data.X_test) == data.y_test))\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}