{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30699,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"**Imports**","metadata":{}},{"cell_type":"code","source":"'''\n# Use the data in DLGN and its variations from https://arxiv.org/abs/2010.04627\n'''\n#Imports for Latent Tree data\n\nimport random\nimport requests\nimport os\nimport torch\nfrom tqdm import tqdm\nimport numpy as np\nimport gzip\nimport shutil\nimport tarfile\nimport bz2\nimport pandas as pd\nimport gzip\nimport shutil\nimport warnings\n\nfrom pathlib import Path\nfrom sklearn.datasets import load_svmlight_file\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.datasets import make_swiss_roll\nfrom sklearn.preprocessing import QuantileTransformer\n\nfrom category_encoders import LeaveOneOutEncoder\nfrom category_encoders.ordinal import OrdinalEncoder\nimport os\nimport zipfile\nimport shutil\nimport urllib.request\nimport numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import train_test_split, GridSearchCV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neural_network import MLPClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier\n\nfrom sklearn.neighbors import NearestCentroid\nfrom scipy.io import arff\n# !pip install numpy==1.22.0  # Install a compatible version of NumPy\n# !pip install scipy  # Install or update SciPy\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\nfrom tqdm import tqdm\nimport copy\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom sklearn.metrics import pairwise_distances\nfrom mpl_toolkits.axes_grid1 import make_axes_locatable\nfrom itertools import product as cartesian_prod\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import pairwise_distances\n\nfrom sklearn import tree\nfrom sklearn import cluster, mixture\n\n\nnp.set_printoptions(precision=4)\n\n\n#@title Importing Packages\nimport os\nimport random\nimport pandas as pd\nfrom copy import deepcopy\nimport torchvision\nimport torchvision.transforms as transforms\n\nimport time\nimport sys\n\ndevice = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\nprint(device)","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:09.482760Z","iopub.execute_input":"2024-04-26T08:23:09.483487Z","iopub.status.idle":"2024-04-26T08:23:09.497587Z","shell.execute_reply.started":"2024-04-26T08:23:09.483454Z","shell.execute_reply":"2024-04-26T08:23:09.496559Z"},"trusted":true},"execution_count":45,"outputs":[{"name":"stdout","text":"cuda:0\n","output_type":"stream"}]},{"cell_type":"markdown","source":"**Download dataset**","metadata":{}},{"cell_type":"code","source":"#**from src.utils import download**\ndef download(url, filename, delete_if_interrupted=True, chunk_size=4096):\n    
\"\"\" saves file from url to filename with a fancy progressbar \"\"\"\n    try:\n        with open(filename, \"wb\") as f:\n            print(\"Downloading {} > {}\".format(url, filename))\n            response = requests.get(url, stream=True)\n            total_length = response.headers.get('content-length')\n\n            if total_length is None:  # no content length header\n                f.write(response.content)\n            else:\n                total_length = int(total_length)\n                with tqdm(total=total_length) as progressbar:\n                    for data in response.iter_content(chunk_size=chunk_size):\n                        if data:  # filter-out keep-alive chunks\n                            f.write(data)\n                            progressbar.update(len(data))\n\n    except Exception as e:\n        if delete_if_interrupted:\n            print(\"Removing incomplete download {}.\".format(filename))\n            os.remove(filename)\n        raise e\n    return filename\n\n#**from src.clus_datasets import ***\n#**CLUSTERING DATASET FETCHERS**\n\n# -------------------------------------------------------- CLUSTERING DATASET FETCHERS\n\ndef fetch_GLASS(path, valid_size=0.2, test_size=0.2, seed=None):\n\n    path = Path(path)\n    data_path = path / 'glass.data'\n\n    if not data_path.exists():\n        path.mkdir(parents=True)\n\n        download('https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data', data_path)\n\n    data = np.genfromtxt(data_path, delimiter=',')\n    \n    X, Y = (data[:, 1:-1]).astype(np.float32), (data[:, -1] - 1).astype(int)\n\n    X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=test_size, random_state=seed)\n\n    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=valid_size / (1 - test_size), random_state=seed)\n\n    return dict(\n        X_train=X_train, y_train=y_train, X_valid=X_val, y_valid=y_val, X_test=X_test, y_test=y_test\n    )\n\ndef fetch_COVTYPE(path, valid_size=0.2, test_size=0.2, seed=None):\n\n    path = Path(path)\n    data_path = path / 'covtype.data'\n\n    if not data_path.exists():\n        path.mkdir(parents=True)\n        archive_path = path / 'covtype.data.gz'\n        download('https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz', archive_path)\n\n        with gzip.open(archive_path, 'rb') as f_in:\n\n            with open(data_path, 'wb') as f_out:\n                shutil.copyfileobj(f_in, f_out)\n\n    data = np.genfromtxt(data_path, delimiter=',')\n    \n    X, Y = (data[:, :-1]).astype(np.float32), (data[:, -1] - 1).astype(int)\n\n    X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=test_size, random_state=seed)\n\n    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=valid_size / (1 - test_size), random_state=seed)\n\n    return dict(\n        X_train=X_train, y_train=y_train, X_valid=X_val, y_valid=y_val, X_test=X_test, y_test=y_test\n    )\n\ndef fetch_ALOI(path, valid_size=0.2, test_size=0.2, seed=None):\n\n    from PIL import Image\n    from tqdm import tqdm\n\n    path = Path(path)\n    data_path = path / 'aloi_red4'\n    npz_path = path / 'aloi_red4.npz'\n\n    if not data_path.exists():\n\n        path.mkdir(parents=True, exist_ok=True)\n\n        for data_type in [\"ill\", \"col\", \"view\", \"stereo\"]:\n            \n            archive_path = path / f'aloi_red4_{data_type}.tar'\n\n            
download(f'http://aloi.science.uva.nl/tars/aloi_red4_{data_type}.tar', archive_path)\n\n            with tarfile.open(archive_path, 'r') as f_in:\n\n                f_in.extractall(path=data_path)\n\n    if not npz_path.exists():\n        \n        X = np.empty((110250, 3, 144, 192), dtype=np.uint8)\n        Y = np.empty(110250, dtype=np.uint16)\n\n        # loop over classes\n        i = 0\n        for c in tqdm(range(1000), desc='Converting ALOI png to npy'):\n\n            c_path = data_path / \"png4\" / str(c + 1) \n\n            # loop over class instances\n            for i_path in c_path.glob('*.png'):\n                \n                im_frame = Image.open(i_path)\n                X[i] = np.transpose(np.array(im_frame), (2, 0, 1))\n                Y[i] = c\n                i += 1\n\n        np.savez_compressed(npz_path, X=X, Y=Y)\n    \n    else:\n\n        data = np.load(npz_path)\n        X, Y = data['X'], data['Y']\n\n    X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=test_size, random_state=seed)\n    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=valid_size / (1 - test_size), random_state=seed)\n\n    return dict(\n        X_train=X_train, y_train=y_train, X_valid=X_val, y_valid=y_val, X_test=X_test, y_test=y_test\n    )\n\ndef fetch_DIGITS(path, valid_size=0.2, test_size=0.2, seed=None):\n\n    from sklearn.datasets import load_digits\n\n    X, Y = load_digits(return_X_y=True)\n    X, Y = X.reshape(-1, 1, 8, 8).astype(np.uint8), Y.astype(int)\n\n    X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=test_size, random_state=seed)\n\n    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=valid_size / (1 - test_size), random_state=seed)\n\n    return dict(\n        X_train=X_train, y_train=y_train, X_valid=X_val, y_valid=y_val, X_test=X_test, y_test=y_test\n    )\n\n#**from src.tabular_datasets import ***\n\ndef fetch_A9A(path, train_size=None, valid_size=None, test_size=None, **kwargs):\n    train_path = os.path.join(path, 'a9a')\n    test_path = os.path.join(path, 'a9a.t')\n    if not all(os.path.exists(fname) for fname in (train_path, test_path)):\n        os.makedirs(path, exist_ok=True)\n        download(\"https://www.dropbox.com/s/9cqdx166iwonrj9/a9a?dl=1\", train_path)\n        download(\"https://www.dropbox.com/s/sa0ds895c0v4xc6/a9a.t?dl=1\", test_path)\n\n    X_train, y_train = load_svmlight_file(train_path, dtype=np.float32, n_features=123)\n    X_test, y_test = load_svmlight_file(test_path, dtype=np.float32, n_features=123)\n    X_train, X_test = X_train.toarray(), X_test.toarray()\n    y_train[y_train == -1] = 0\n    y_test[y_test == -1] = 0\n    y_train, y_test = y_train.astype(int), y_test.astype(int)\n\n    if all(sizes is None for sizes in (train_size, valid_size, test_size)):\n        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')\n        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')\n        if not all(os.path.exists(fname) for fname in (train_idx_path, valid_idx_path)):\n            download(\"https://www.dropbox.com/s/xy4wwvutwikmtha/stratified_train_idx.txt?dl=1\", train_idx_path)\n            download(\"https://www.dropbox.com/s/nthpxofymrais5s/stratified_test_idx.txt?dl=1\", valid_idx_path)\n        train_idx = pd.read_csv(train_idx_path, header=None)[0].values\n        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values\n    else:\n        assert 
train_size, \"please provide either train_size or none of sizes\"\n        if valid_size is None:\n            valid_size = len(X_train) - train_size\n            assert valid_size > 0\n        if train_size + valid_size > len(X_train):\n            warnings.warn('train_size + valid_size = {} exceeds dataset size: {}.'.format(\n                train_size + valid_size, len(X_train)), Warning)\n        if test_size is not None:\n            warnings.warn('Test set is fixed for this dataset.', Warning)\n\n        shuffled_indices = np.random.permutation(np.arange(len(X_train)))\n        train_idx = shuffled_indices[:train_size]\n        valid_idx = shuffled_indices[train_size: train_size + valid_size]    \n\n    return dict(\n        X_train=X_train[train_idx], y_train=y_train[train_idx],\n        X_valid=X_train[valid_idx], y_valid=y_train[valid_idx],\n        X_test=X_test, y_test=y_test\n    )\n\ndef fetch_EPSILON(path, train_size=None, valid_size=None, test_size=None, **kwargs):\n    train_path = os.path.join(path, 'epsilon_normalized')\n    test_path = os.path.join(path, 'epsilon_normalized.t')\n    if not all(os.path.exists(fname) for fname in (train_path, test_path)):\n        os.makedirs(path, exist_ok=True)\n        train_archive_path = os.path.join(path, 'epsilon_normalized.bz2')\n        test_archive_path = os.path.join(path, 'epsilon_normalized.t.bz2')\n        if not all(os.path.exists(fname) for fname in (train_archive_path, test_archive_path)):\n            download(\"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.bz2\", train_archive_path)\n            download(\"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.t.bz2\", test_archive_path)\n        print(\"unpacking dataset\")\n        for file_name, archive_name in zip((train_path, test_path), (train_archive_path, test_archive_path)):\n            zipfile = bz2.BZ2File(archive_name)\n            with open(file_name, 'wb') as f:\n                f.write(zipfile.read())\n\n    print(\"reading dataset (it may take a long time)\")\n    X_train, y_train = load_svmlight_file(train_path, dtype=np.float32, n_features=2000)\n    X_test, y_test = load_svmlight_file(test_path, dtype=np.float32, n_features=2000)\n    X_train, X_test = X_train.toarray(), X_test.toarray()\n    y_train, y_test = y_train.astype(np.int), y_test.astype(np.int)\n    y_train[y_train == -1] = 0\n    y_test[y_test == -1] = 0\n\n    if all(sizes is None for sizes in (train_size, valid_size, test_size)):\n        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')\n        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')\n        if not all(os.path.exists(fname) for fname in (train_idx_path, valid_idx_path)):\n            download(\"https://www.dropbox.com/s/wxgm94gvm6d3xn5/stratified_train_idx.txt?dl=1\", train_idx_path)\n            download(\"https://www.dropbox.com/s/fm4llo5uucdglti/stratified_valid_idx.txt?dl=1\", valid_idx_path)\n        train_idx = pd.read_csv(train_idx_path, header=None)[0].values\n        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values\n    else:\n        assert train_size, \"please provide either train_size or none of sizes\"\n        if valid_size is None:\n            valid_size = len(X_train) - train_size\n            assert valid_size > 0\n        if train_size + valid_size > len(X_train):\n            warnings.warn('train_size + valid_size = {} exceeds dataset size: {}.'.format(\n                train_size + valid_size, 
len(X_train)), Warning)\n        if test_size is not None:\n            warnings.warn('Test set is fixed for this dataset.', Warning)\n\n        shuffled_indices = np.random.permutation(np.arange(len(X_train)))\n        train_idx = shuffled_indices[:train_size]\n        valid_idx = shuffled_indices[train_size: train_size + valid_size]\n\n    return dict(\n        X_train=X_train[train_idx], y_train=y_train[train_idx],\n        X_valid=X_train[valid_idx], y_valid=y_train[valid_idx],\n        X_test=X_test, y_test=y_test\n    )\n\n\ndef fetch_PROTEIN(path, train_size=None, valid_size=None, test_size=None, **kwargs):\n    \"\"\"\n    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#protein\n    \"\"\"\n    train_path = os.path.join(path, 'protein')\n    test_path = os.path.join(path, 'protein.t')\n    if not all(os.path.exists(fname) for fname in (train_path, test_path)):\n        os.makedirs(path, exist_ok=True)\n        download(\"https://www.dropbox.com/s/pflp4vftdj3qzbj/protein.tr?dl=1\", train_path)\n        download(\"https://www.dropbox.com/s/z7i5n0xdcw57weh/protein.t?dl=1\", test_path)\n    for fname in (train_path, test_path):\n        raw = open(fname).read().replace(' .', '0.')\n        with open(fname, 'w') as f:\n            f.write(raw)\n\n    X_train, y_train = load_svmlight_file(train_path, dtype=np.float32, n_features=357)\n    X_test, y_test = load_svmlight_file(test_path, dtype=np.float32, n_features=357)\n    X_train, X_test = X_train.toarray(), X_test.toarray()\n    y_train, y_test = y_train.astype(int), y_test.astype(int)\n\n    if all(sizes is None for sizes in (train_size, valid_size, test_size)):\n        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')\n        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')\n        if not all(os.path.exists(fname) for fname in (train_idx_path, valid_idx_path)):\n            download(\"https://www.dropbox.com/s/wq2v9hl1wxfufs3/small_stratified_train_idx.txt?dl=1\", train_idx_path)\n            download(\"https://www.dropbox.com/s/7o9el8pp1bvyy22/small_stratified_valid_idx.txt?dl=1\", valid_idx_path)\n        train_idx = pd.read_csv(train_idx_path, header=None)[0].values\n        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values\n    else:\n        assert train_size, \"please provide either train_size or none of sizes\"\n        if valid_size is None:\n            valid_size = len(X_train) - train_size\n            assert valid_size > 0\n        if train_size + valid_size > len(X_train):\n            warnings.warn('train_size + valid_size = {} exceeds dataset size: {}.'.format(\n                train_size + valid_size, len(X_train)), Warning)\n        if test_size is not None:\n            warnings.warn('Test set is fixed for this dataset.', Warning)\n\n        shuffled_indices = np.random.permutation(np.arange(len(X_train)))\n        train_idx = shuffled_indices[:train_size]\n        valid_idx = shuffled_indices[train_size: train_size + valid_size]\n\n    return dict(\n        X_train=X_train[train_idx], y_train=y_train[train_idx],\n        X_valid=X_train[valid_idx], y_valid=y_train[valid_idx],\n        X_test=X_test, y_test=y_test\n    )\n\n\ndef fetch_YEAR(path, train_size=None, valid_size=None, test_size=51630, **kwargs):\n    data_path = os.path.join(path, 'data.csv')\n    if not os.path.exists(data_path):\n        os.makedirs(path, exist_ok=True)\n        download('https://www.dropbox.com/s/l09pug0ywaqsy0e/YearPredictionMSD.txt?dl=1', data_path)\n    
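# YearPredictionMSD: column 0 is the release year (target); columns 1-90 are audio features\n    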
n_features = 91\n    types = {i: (np.float32 if i != 0 else int) for i in range(n_features)}\n    data = pd.read_csv(data_path, header=None, dtype=types)\n    data_train, data_test = data.iloc[:-test_size], data.iloc[-test_size:]\n\n    X_train, y_train = data_train.iloc[:, 1:].values, data_train.iloc[:, 0].values\n    X_test, y_test = data_test.iloc[:, 1:].values, data_test.iloc[:, 0].values\n\n    if all(sizes is None for sizes in (train_size, valid_size)):\n        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')\n        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')\n        if not all(os.path.exists(fname) for fname in (train_idx_path, valid_idx_path)):\n            download(\"https://www.dropbox.com/s/00u6cnj9mthvzj1/stratified_train_idx.txt?dl=1\", train_idx_path)\n            download(\"https://www.dropbox.com/s/420uhjvjab1bt7k/stratified_valid_idx.txt?dl=1\", valid_idx_path)\n        train_idx = pd.read_csv(train_idx_path, header=None)[0].values\n        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values\n    else:\n        assert train_size, \"please provide either train_size or none of sizes\"\n        if valid_size is None:\n            valid_size = len(X_train) - train_size\n            assert valid_size > 0\n        if train_size + valid_size > len(X_train):\n            warnings.warn('train_size + valid_size = {} exceeds dataset size: {}.'.format(\n                train_size + valid_size, len(X_train)), Warning)\n\n        shuffled_indices = np.random.permutation(np.arange(len(X_train)))\n        train_idx = shuffled_indices[:train_size]\n        valid_idx = shuffled_indices[train_size: train_size + valid_size]\n\n    return dict(\n        X_train=X_train[train_idx], y_train=y_train[train_idx],\n        X_valid=X_train[valid_idx], y_valid=y_train[valid_idx],\n        X_test=X_test, y_test=y_test,\n    )\n\n\ndef fetch_HIGGS(path, train_size=None, valid_size=None, test_size=5 * 10 ** 5, **kwargs):\n    data_path = os.path.join(path, 'higgs.csv')\n    if not os.path.exists(data_path):\n        os.makedirs(path, exist_ok=True)\n        archive_path = os.path.join(path, 'HIGGS.csv.gz')\n        download('https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz', archive_path)\n        with gzip.open(archive_path, 'rb') as f_in:\n            with open(data_path, 'wb') as f_out:\n                shutil.copyfileobj(f_in, f_out)\n    n_features = 29\n    types = {i: (np.float32 if i != 0 else int) for i in range(n_features)}\n    data = pd.read_csv(data_path, header=None, dtype=types)\n    data_train, data_test = data.iloc[:-test_size], data.iloc[-test_size:]\n\n    X_train, y_train = data_train.iloc[:, 1:].values, data_train.iloc[:, 0].values\n    X_test, y_test = data_test.iloc[:, 1:].values, data_test.iloc[:, 0].values\n\n    if all(sizes is None for sizes in (train_size, valid_size)):\n        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')\n        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')\n        if not all(os.path.exists(fname) for fname in (train_idx_path, valid_idx_path)):\n            download(\"https://www.dropbox.com/s/i2uekmwqnp9r4ix/stratified_train_idx.txt?dl=1\", train_idx_path)\n            download(\"https://www.dropbox.com/s/wkbk74orytmb2su/stratified_valid_idx.txt?dl=1\", valid_idx_path)\n        train_idx = pd.read_csv(train_idx_path, header=None)[0].values\n        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values\n    else:\n        assert 
train_size, \"please provide either train_size or none of sizes\"\n        if valid_size is None:\n            valid_size = len(X_train) - train_size\n            assert valid_size > 0\n        if train_size + valid_size > len(X_train):\n            warnings.warn('train_size + valid_size = {} exceeds dataset size: {}.'.format(\n                train_size + valid_size, len(X_train)), Warning)\n\n        shuffled_indices = np.random.permutation(np.arange(len(X_train)))\n        train_idx = shuffled_indices[:train_size]\n        valid_idx = shuffled_indices[train_size: train_size + valid_size]\n\n    return dict(\n        X_train=X_train[train_idx], y_train=y_train[train_idx],\n        X_valid=X_train[valid_idx], y_valid=y_train[valid_idx],\n        X_test=X_test, y_test=y_test,\n    )\n\n\ndef fetch_MICROSOFT(path, **kwargs):\n    train_path = os.path.join(path, 'msrank_train.tsv')\n    test_path = os.path.join(path, 'msrank_test.tsv')\n    if not all(os.path.exists(fname) for fname in (train_path, test_path)):\n        os.makedirs(path, exist_ok=True)\n        download(\"https://www.dropbox.com/s/izpty5feug57kqn/msrank_train.tsv?dl=1\", train_path)\n        download(\"https://www.dropbox.com/s/tlsmm9a6krv0215/msrank_test.tsv?dl=1\", test_path)\n\n        for fname in (train_path, test_path):\n            raw = open(fname).read().replace('\\\\t', '\\t')\n            with open(fname, 'w') as f:\n                f.write(raw)\n\n    data_train = pd.read_csv(train_path, header=None, skiprows=1, sep='\\t')\n    data_test = pd.read_csv(test_path, header=None, skiprows=1, sep='\\t')\n\n    train_idx_path = os.path.join(path, 'train_idx.txt')\n    valid_idx_path = os.path.join(path, 'valid_idx.txt')\n    if not all(os.path.exists(fname) for fname in (train_idx_path, valid_idx_path)):\n        download(\"https://www.dropbox.com/s/pba6dyibyogep46/train_idx.txt?dl=1\", train_idx_path)\n        download(\"https://www.dropbox.com/s/yednqu9edgdd2l1/valid_idx.txt?dl=1\", valid_idx_path)\n    train_idx = pd.read_csv(train_idx_path, header=None)[0].values\n    valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values\n\n    X_train, y_train, query_train = data_train.iloc[train_idx, 2:].values, data_train.iloc[train_idx, 0].values, data_train.iloc[train_idx, 1].values\n    X_valid, y_valid, query_valid = data_train.iloc[valid_idx, 2:].values, data_train.iloc[valid_idx, 0].values, data_train.iloc[valid_idx, 1].values\n    X_test, y_test, query_test = data_test.iloc[:, 2:].values, data_test.iloc[:, 0].values, data_test.iloc[:, 1].values\n\n    return dict(\n        X_train=X_train.astype(np.float32), y_train=y_train.astype(np.int64), query_train=query_train,\n        X_valid=X_valid.astype(np.float32), y_valid=y_valid.astype(np.int64), query_valid=query_valid,\n        X_test=X_test.astype(np.float32), y_test=y_test.astype(np.int64), query_test=query_test,\n    )\n\n\ndef fetch_YAHOO(path, *args):\n    train_path = os.path.join(path, 'yahoo_train.tsv')\n    valid_path = os.path.join(path, 'yahoo_valid.tsv')\n    test_path = os.path.join(path, 'yahoo_test.tsv')\n    if not all(os.path.exists(fname) for fname in (train_path, valid_path, test_path)):\n        os.makedirs(path, exist_ok=True)\n        train_archive_path = os.path.join(path, 'yahoo_train.tsv.gz')\n        valid_archive_path = os.path.join(path, 'yahoo_valid.tsv.gz')\n        test_archive_path = os.path.join(path, 'yahoo_test.tsv.gz')\n        if not all(os.path.exists(fname) for fname in (train_archive_path, valid_archive_path, 
test_archive_path)):\n            download(\"https://www.dropbox.com/s/7rq3ki5vtxm6gzx/yahoo_set_1_train.gz?dl=1\", train_archive_path)\n            download(\"https://www.dropbox.com/s/3ai8rxm1v0l5sd1/yahoo_set_1_validation.gz?dl=1\", valid_archive_path)\n            download(\"https://www.dropbox.com/s/3d7tdfb1an0b6i4/yahoo_set_1_test.gz?dl=1\", test_archive_path)\n\n        for file_name, archive_name in zip((train_path, valid_path, test_path), (train_archive_path, valid_archive_path, test_archive_path)):\n            with gzip.open(archive_name, 'rb') as f_in:\n                with open(file_name, 'wb') as f_out:\n                    shutil.copyfileobj(f_in, f_out)\n\n        for fname in (train_path, valid_path, test_path):\n            raw = open(fname).read().replace('\\\\t', '\\t')\n            with open(fname, 'w') as f:\n                f.write(raw)\n\n    data_train = pd.read_csv(train_path, header=None, skiprows=1, sep='\\t')\n    data_valid = pd.read_csv(valid_path, header=None, skiprows=1, sep='\\t')\n    data_test = pd.read_csv(test_path, header=None, skiprows=1, sep='\\t')\n\n    X_train, y_train, query_train = data_train.iloc[:, 2:].values, data_train.iloc[:, 0].values, data_train.iloc[:, 1].values\n    X_valid, y_valid, query_valid = data_valid.iloc[:, 2:].values, data_valid.iloc[:, 0].values, data_valid.iloc[:, 1].values\n    X_test, y_test, query_test = data_test.iloc[:, 2:].values, data_test.iloc[:, 0].values, data_test.iloc[:, 1].values\n\n    return dict(\n        X_train=X_train.astype(np.float32), y_train=y_train, query_train=query_train,\n        X_valid=X_valid.astype(np.float32), y_valid=y_valid, query_valid=query_valid,\n        X_test=X_test.astype(np.float32), y_test=y_test, query_test=query_test,\n    )\n\n\ndef fetch_CLICK(path, valid_size=100_000, seed=None, **kwargs):\n    # based on: https://www.kaggle.com/slamnz/primer-airlines-delay\n    csv_path = os.path.join(path, 'click.csv')\n    if not os.path.exists(csv_path):\n        os.makedirs(path, exist_ok=True)\n        download('https://www.dropbox.com/s/w43ylgrl331svqc/click.csv?dl=1', csv_path)\n\n    data = pd.read_csv(csv_path, index_col=0)\n    X, y = data.drop(columns=['target']), data['target']\n    X_train, X_test = X[:-100_000].copy(), X[-100_000:].copy()\n    y_train, y_test = y[:-100_000].copy(), y[-100_000:].copy()\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n\n    cat_features = ['url_hash', 'ad_id', 'advertiser_id', 'query_id',\n                    'keyword_id', 'title_id', 'description_id', 'user_id']\n\n    X_train, X_val, y_train, y_val = train_test_split(\n        X_train, y_train, test_size=valid_size, random_state=seed)\n\n    cat_encoder = LeaveOneOutEncoder()\n    cat_encoder.fit(X_train[cat_features], y_train)\n    X_train[cat_features] = cat_encoder.transform(X_train[cat_features])\n    X_val[cat_features] = cat_encoder.transform(X_val[cat_features])\n    X_test[cat_features] = cat_encoder.transform(X_test[cat_features])\n    return dict(\n        X_train=X_train.values.astype('float32'), y_train=y_train,\n        X_valid=X_val.values.astype('float32'), y_valid=y_val,\n        X_test=X_test.values.astype('float32'), y_test=y_test\n    )\n\ndef fetch_MUSHROOMS(path, valid_size=0.2, test_size=0.2, seed=None):\n\n    path = Path(path)\n    data_path = path / 'agaricus-lepiota.data'\n\n    if not data_path.exists():\n        path.mkdir(parents=True, exist_ok=True)\n\n        
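# agaricus-lepiota.data: 22 categorical attributes, class (edible/poisonous) in column 0\n        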
download('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', data_path)\n\n    data = pd.read_csv(data_path, names=np.arange(23))\n    encoder = OrdinalEncoder(return_df=False)\n    data = encoder.fit_transform(data)\n    \n    X, Y = (data[:, 1:]).astype(np.float32), (data[:, 0] - 1).astype(int)\n\n    X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=test_size, random_state=seed)\n\n    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=valid_size / (1 - test_size), random_state=seed)\n\n    return dict(\n        X_train=X_train, y_train=y_train, X_valid=X_val, y_valid=y_val, X_test=X_test, y_test=y_test\n    )\n\ndef fetch_TICTACTOE(path, valid_size=0.2, test_size=0.2, seed=None):\n\n    path = Path(path)\n    data_path = path / 'tic-tac-toe.data'\n\n    if not data_path.exists():\n        path.mkdir(parents=True, exist_ok=True)\n\n        download('https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data', data_path)\n\n    data = pd.read_csv(data_path, names=np.arange(10))\n    encoder = OrdinalEncoder(return_df=False)\n    data = encoder.fit_transform(data)\n    \n    X, Y = (data[:, :-1]).astype(np.float32), (data[:, -1] - 1).astype(int)\n\n    X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=test_size, random_state=seed)\n\n    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=valid_size / (1 - test_size), random_state=seed)\n\n    return dict(\n        X_train=X_train, y_train=y_train, X_valid=X_val, y_valid=y_val, X_test=X_test, y_test=y_test\n    )\n\n#**from src.toy_datasets import ***\n\n# ------------------------------------------------------------ TOY DATASETS\n\ndef toy_dataset(n=1000, distr=\"xor\", dim=2):\n\n    if distr == \"xor\":\n        \n        X = np.random.uniform(low=tuple([-1.] * dim), high=tuple([1.] * dim), size=(n, dim))\n        Y = (X[:,0] * X[:,1] >= 0).astype(int)\n\n        return dict(X=X.astype(np.float32), Y=Y)\n\n    elif distr == \"reg-xor\":\n        \n        X = np.random.uniform(low=tuple([-1.] * dim), high=tuple([1.] * dim), size=(n, dim))\n        labels = (X[:,0] * X[:,1] >= 0)\n\n        Y = np.empty(n)\n        Y[labels] = np.random.normal(0.8, 0.1, np.sum(labels))\n        Y[~labels] = np.random.normal(0.2, 0.1, np.sum(~labels))\n\n        return dict(X=X.astype(np.float32), Y=Y.astype(np.float32), labels=labels)\n\n    elif distr == \"swissroll\":\n\n        n2 = n // 2\n\n        X1,_ = make_swiss_roll(n_samples=n2, noise=0)\n        Y1 = np.ones(n2)\n\n        X2 = np.random.uniform(low=tuple([-1.] * dim), high=tuple([1.] 
* dim), size=(n2, dim))\n        Y2 = np.zeros(n2)\n\n        X = np.r_[X1[:,::2] / 15, X2]\n        Y = np.r_[Y1, Y2]\n\n        return dict(X=X.astype(np.float32), Y=Y)\n\n    else:\n        raise NotImplementedError","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:10.700789Z","iopub.execute_input":"2024-04-26T08:23:10.701165Z","iopub.status.idle":"2024-04-26T08:23:10.821647Z","shell.execute_reply.started":"2024-04-26T08:23:10.701121Z","shell.execute_reply":"2024-04-26T08:23:10.820699Z"},"trusted":true},"execution_count":46,"outputs":[]},{"cell_type":"markdown","source":"**Preprocessing**","metadata":{}},{"cell_type":"code","source":"def preprocess_data_adult(data_path):\n    # Read the data into a DataFrame\n    columns = [\n        \"age\", \"workclass\", \"fnlwgt\", \"education\", \"education-num\", \"marital-status\",\n        \"occupation\", \"relationship\", \"race\", \"sex\", \"capital-gain\", \"capital-loss\",\n        \"hours-per-week\", \"native-country\", \"income\"\n    ]\n    df = pd.read_csv(data_path, names=columns, na_values=[\" ?\"])\n\n    # Drop rows with missing values\n    df.dropna(inplace=True)\n\n    # Convert categorical features using Label Encoding\n    categorical_columns = [\"workclass\", \"education\", \"marital-status\", \"occupation\", \"relationship\", \"race\", \"sex\", \"native-country\"]\n    label_encoders = {}\n    for col in categorical_columns:\n        le = LabelEncoder()\n        df[col] = le.fit_transform(df[col])\n        label_encoders[col] = le\n\n    # Encode the target variable\n    df[\"income\"] = df[\"income\"].apply(lambda x: 1 if x == \" >50K\" else 0)\n\n    return df\n\ndef preprocess_data_bank_marketing(data):\n    # Convert categorical features using Label Encoding\n    label_encoders = {}\n    for col in data.select_dtypes(include=['object']).columns:\n        le = LabelEncoder()\n        data[col] = le.fit_transform(data[col])\n        label_encoders[col] = le\n\n    return data\n\ndef preprocess_data_credit_card_defaults(data):\n    # Convert categorical features using one-hot encoding\n    data = pd.get_dummies(data, columns=[\"SEX\", \"EDUCATION\", \"MARRIAGE\"], drop_first=True)\n\n    # Standardize numerical features\n    scaler = StandardScaler()\n    data[[\"LIMIT_BAL\", \"AGE\", \"PAY_0\", \"PAY_2\", \"PAY_3\", \"PAY_4\", \"PAY_5\", \"PAY_6\", \"BILL_AMT1\",\n          \"BILL_AMT2\", \"BILL_AMT3\", \"BILL_AMT4\", \"BILL_AMT5\", \"BILL_AMT6\", \"PAY_AMT1\", \"PAY_AMT2\",\n          \"PAY_AMT3\", \"PAY_AMT4\", \"PAY_AMT5\", \"PAY_AMT6\"]] = scaler.fit_transform(\n        data[[\"LIMIT_BAL\", \"AGE\", \"PAY_0\", \"PAY_2\", \"PAY_3\", \"PAY_4\", \"PAY_5\", \"PAY_6\", \"BILL_AMT1\",\n               \"BILL_AMT2\", \"BILL_AMT3\", \"BILL_AMT4\", \"BILL_AMT5\", \"BILL_AMT6\", \"PAY_AMT1\", \"PAY_AMT2\",\n               \"PAY_AMT3\", \"PAY_AMT4\", \"PAY_AMT5\", \"PAY_AMT6\"]])\n\n    return data\n","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:11.332876Z","iopub.execute_input":"2024-04-26T08:23:11.333325Z","iopub.status.idle":"2024-04-26T08:23:11.344846Z","shell.execute_reply.started":"2024-04-26T08:23:11.333288Z","shell.execute_reply":"2024-04-26T08:23:11.343785Z"},"trusted":true},"execution_count":47,"outputs":[]},
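{"cell_type":"markdown","source":"**Preprocessing sanity check** (a minimal sketch on a hypothetical two-column frame, not a real dataset: it shows that the label encoders above turn object columns into integer codes)","metadata":{}},{"cell_type":"code","source":"# Hypothetical mini-frame; both columns have dtype object, so\n# preprocess_data_bank_marketing should label-encode each of them.\n_demo = pd.DataFrame({\"job\": [\"admin.\", \"technician\", \"admin.\"],\n                      \"y\": [\"yes\", \"no\", \"yes\"]})\nprint(preprocess_data_bank_marketing(_demo))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"**Fetch Tabular dataset**","metadata":{}},{"cell_type":"code","source":"def fetch_ADULT(data_dir=\"./ADULT_DATA\"):\n    print(\"---------------------ADULT--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not 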
os.path.exists(data_dir):\n        os.makedirs(data_dir)\n        \n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/2/adult.zip\"\n    zip_file_path = os.path.join(data_dir, \"adult.zip\")\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n\n    # Preprocess the data\n    train_data_path = os.path.join(data_dir, \"adult.data\")\n#     test_data_path = os.path.join(data_dir, \"adult.test\")\n   \n    df_train = preprocess_data_adult(train_data_path)\n#     df_test = preprocess_data_adult(test_data_path)\n\n    # Split the data into train, validation, and test sets\n    X = df_train.drop(\"income\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df_train[\"income\"]\n    \n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n#     X_test = df_test.drop(\"income\", axis=1)\n#     y_test = df_test[\"income\"]\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents using shutil.rmtree()\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train, X_valid=X_val.astype('float32'), y_valid=y_val, X_test=X_test.astype('float32'), y_test=y_test\n    )\n\ndef fetch_bank_marketing(data_dir=\"./BANK\"):\n    print(\"---------------------BANK--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/222/bank+marketing.zip\"\n    zip_file_path = os.path.join(data_dir, \"bank_marketing.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n    \n    zip_file_path_bank_add = os.path.join(data_dir, \"bank-additional.zip\")\n    with zipfile.ZipFile(zip_file_path_bank_add, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n\n    # Get the extracted directory path\n    extracted_dir = os.path.join(data_dir, \"bank-additional\")\n\n    # Read the dataset\n    data = pd.read_csv(os.path.join(extracted_dir, \"bank-additional-full.csv\"), sep=';')\n\n    # Preprocess the data\n    data = preprocess_data_bank_marketing(data)\n\n    # Split the data into train, validation, and test sets\n    X = data.drop(\"y\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"y\"]\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and 
its contents\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,X_test=X_test.astype('float32'), y_test=y_test, X_valid = X_val.astype('float32'), y_valid = y_val\n    )\n\ndef fetch_credit_card_defaults(data_dir=\"./CREDIT\"):\n    print(\"---------------------CREDIT--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    !pip install xlrd\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/350/default+of+credit+card+clients.zip\"\n    zip_file_path = os.path.join(data_dir, \"credit_card_defaults.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n\n#     # Get the extracted directory path\n#     extracted_dir = os.path.join(data_dir, \"default+of+credit+card+clients\")\n\n    # Read the dataset\n    data = pd.read_excel(os.path.join(data_dir, \"default of credit card clients.xls\"), skiprows=1)\n\n    # Preprocess the data\n    data = preprocess_data_credit_card_defaults(data)\n\n    # Split the data into train, validation, and test sets\n    X = data.drop(\"default payment next month\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"default payment next month\"]\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train, X_valid=X_val.astype('float32'), y_valid=y_val , X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_gamma_telescope(data_dir=\"./TELESCOPE\"):\n    print(\"---------------------TELESCOPE--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/159/magic+gamma+telescope.zip\"\n    zip_file_path = os.path.join(data_dir, \"magic_gamma_telescope.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n    \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"magic04.data\")\n    columns = [\n        \"fLength\", \"fWidth\", \"fSize\", \"fConc\", \"fConc1\", \"fAsym\", \"fM3Long\",\n        \"fM3Trans\", \"fAlpha\", \"fDist\", \"class\"\n    ]\n    data = pd.read_csv(data_path, header=None, names=columns)\n    \n    # Convert the class labels to binary format (g = gamma, h = hadron)\n    data[\"class\"] = data[\"class\"].map({\"g\": 1, \"h\": 0})\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"class\"]\n    \n    # Split the data into train, test, and 
validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_rice_dataset(data_dir=\"./RICE\"):\n    print(\"---------------------RICE--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/545/rice+cammeo+and+osmancik.zip\"\n    zip_file_path = os.path.join(data_dir, \"rice_dataset.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    arff_file_name = os.path.join(data_dir, \"Rice_Cammeo_Osmancik.arff\")\n\n    \n    # Load the ARFF file using SciPy\n    data, meta = arff.loadarff(arff_file_name)\n    \n    df = pd.DataFrame(data)\n    print(\"df\",df)\n    df[\"Class\"] = df[\"Class\"].map({b'Cammeo': 1, b'Osmancik': 0})\n    \n    # Split the data into features (X) and target (y)\n    X = df.drop(\"Class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[\"Class\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_german_credit_data(data_dir=\"./GERMAN\"):\n    print(\"---------------------GERMAN--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"http://archive.ics.uci.edu/static/public/144/statlog+german+credit+data.zip\"\n    zip_file_path = os.path.join(data_dir, \"german_credit_data.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        
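# the statlog archive ships german.data (symbolic attributes) plus documentation (assumption: layout unchanged)\n        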
zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"german.data\")\n\n    columns = [\n        \"checking_account_status\", \"duration_months\", \"credit_history\", \"purpose\",\n        \"credit_amount\", \"savings_account_bonds\", \"employment\", \"installment_rate\",\n        \"personal_status_sex\", \"other_debtors_guarantors\", \"present_residence\",\n        \"property\", \"age\", \"other_installment_plans\", \"housing\", \"existing_credits\",\n        \"job\", \"num_dependents\", \"own_telephone\", \"foreign_worker\", \"class\"\n    ]\n    data = pd.read_csv(data_path, sep=' ', header=None, names=columns)\n    \n    # Convert the class labels to binary format (1 = Good, 2 = Bad)\n    data[\"class\"] = data[\"class\"].map({1: 1, 2: 0})\n    \n    # Handle null values (forward fill)\n    data.ffill(inplace=True)\n    \n    # Convert categorical variables to dummy variables\n    categorical_columns = [\n        \"checking_account_status\", \"credit_history\", \"purpose\", \"savings_account_bonds\",\n        \"employment\", \"personal_status_sex\", \"other_debtors_guarantors\", \"property\",\n        \"other_installment_plans\", \"housing\", \"job\", \"own_telephone\", \"foreign_worker\"\n    ]\n    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"class\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_spambase_dataset(data_dir=\"./SPAM\"):\n    print(\"---------------------SPAM--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"http://archive.ics.uci.edu/static/public/94/spambase.zip\"\n    zip_file_path = os.path.join(data_dir, \"spambase.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"spambase.data\")\n\n    columns = [\n        f\"f{i}\" for i in range(57)\n    ] + [\"spam\"]\n    data = pd.read_csv(data_path, header=None, names=columns)\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"spam\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"spam\"]\n    \n    # 
Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_accelerometer_gyro_dataset(data_dir=\"./GYRO\"):\n    print(\"---------------------GYRO--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/755/accelerometer+gyro+mobile+phone+dataset.zip\"\n    zip_file_path = os.path.join(data_dir, \"accelerometer_gyro_dataset.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"accelerometer_gyro_mobile_phone_dataset.csv\")\n    \n    data = pd.read_csv(data_path)\n    \n    # Convert categorical column to numeric (e.g., label encoding)\n    data[\"timestamp\"] = data[\"timestamp\"].astype(\"category\").cat.codes\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"Activity\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"Activity\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir)\n    \n    return data_splits\n\ndef fetch_swarm_behaviour(data_dir=\"./SWARM\"):\n    print(\"---------------------SWARM--------------------------------------\")\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    # URL of the dataset zip file\n    url = \"https://archive.ics.uci.edu/static/public/524/swarm+behaviour.zip\"\n    zip_file_path = os.path.join(data_dir, \"swarm_behaviour.zip\")\n\n    # Download the zip file\n    urllib.request.urlretrieve(url, zip_file_path)\n\n    # Extract the zip file\n    with 
zipfile.ZipFile(zip_file_path, \"r\") as zip_ref:\n        zip_ref.extractall(data_dir)\n        \n    # Load the data from CSV\n    data_path = os.path.join(data_dir, \"Swarm Behavior Data/Grouped.csv\")\n    \n    data = pd.read_csv(data_path)\n    \n    # Split the data into features (X) and target (y)\n    X = data.drop(\"Class\", axis=1)\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = data[\"Class\"]\n    \n    # Split the data into train, test, and validation sets\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)\n    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)\n    \n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_valid = (y_valid.values.reshape(-1) == 1).astype('int64')\n    \n    # Create a dictionary to store the data splits\n    data_splits = {\n        \"X_train\": X_train.astype('float32'), \"y_train\": y_train,\n        \"X_valid\": X_valid.astype('float32'), \"y_valid\": y_valid,\n        \"X_test\": X_test.astype('float32'), \"y_test\": y_test\n    }\n    \n    # Remove the zip file\n    os.remove(zip_file_path)\n    # Remove the extracted directory and its contents\n    shutil.rmtree(data_dir) \n    return data_splits\n\ndef fetch_openml_credit_data(data_dir=\"./OpenML_Credit\"):\n    print(\"---------------------OpenML_Credit DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103185/credit.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"credit.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n    df[last_column] = df[last_column].astype(int)\n    \n#     print(\"df\",df)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_electricity_data(data_dir=\"./OpenML_Electricity\"):\n    print(\"---------------------OpenML_Electricity DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = 
\"https://api.openml.org/data/v1/download/22103245/electricity.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"electricity.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n    df[last_column] = df[last_column].map({b'DOWN': 0, b'UP': 1})\n    \n#     print(\"df\",df)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_covertype_data(data_dir=\"./OpenML_Covertype\"):\n    print(\"---------------------OpenML_Covertype DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103246/covertype.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"covertype.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n    df[last_column] = df[last_column].astype(int)\n    \n#     print(\"df\",df)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_pol_data(data_dir=\"./OpenML_Pol\"):\n    
print(\"---------------------OpenML_Pol DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103247/pol.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"pol.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    \n    df[last_column] = df[last_column].map({b'N':0,b'P':1})\n    \n    \n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_house_16H_data(data_dir=\"./OpenML_House_16H\"):\n    print(\"---------------------OpenML_House_16H DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103248/house_16H.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"house_16H.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    df[last_column] = df[last_column].map({b'N':0,b'P':1})\n    \n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        
X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_MiniBooNE_data(data_dir=\"./OpenML_MiniBooNE\"):\n    print(\"---------------------OpenML_MiniBooNE DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103253/MiniBooNE.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"MiniBooNE.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    \n    df[last_column] = df[last_column].map({b'False':0,b'True':1})\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_eye_movements_data(data_dir=\"./OpenML_Eye_movements\"):\n    print(\"---------------------OpenML_Eye_movements DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22103255/eye_movements.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"eye_movements.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n\n#     print(\"df\",df)\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = 
(y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_Diabetes130US_data(data_dir=\"./OpenML_Diabetes130US\"):\n    print(\"---------------------OpenML_Diabetes130US DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111908/Diabetes130US.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"Diabetes130US.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n    df[last_column] = df[last_column].astype(int)\n    \n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_jannis_data(data_dir=\"./OpenML_Jannis\"):\n    print(\"---------------------OpenML_Jannis DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111907/jannis.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"jannis.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, 
random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_Bioresponse_data(data_dir=\"./OpenML_Bioresponse\"):\n    print(\"---------------------OpenML_Bioresponse DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111905/Bioresponse.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"Bioresponse.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_california_data(data_dir=\"./OpenML_California\"):\n    print(\"---------------------OpenML_California DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111914/california.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"california.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = 
scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n\n\ndef fetch_openml_heloc_data(data_dir=\"./OpenML_Heloc\"):\n    print(\"---------------------OpenML_Heloc DATASET--------------------------------------\")\n    # Create the data directory if it doesn't exist\n    if not os.path.exists(data_dir):\n        os.makedirs(data_dir)\n\n    data_url = \"https://api.openml.org/data/v1/download/22111912/heloc.arff\"\n    # Download the ARFF file\n    arff_file_path = os.path.join(data_dir, \"heloc.arff\")\n    urllib.request.urlretrieve(data_url, arff_file_path)\n\n    # Load ARFF file into DataFrame\n    data, meta = arff.loadarff(arff_file_path)\n    df = pd.DataFrame(data)\n    # Convert target variable to int\n    last_column = df.columns[-1]\n#     print(\"df\",df)\n\n    df[last_column] = df[last_column].astype(int)\n\n    # Split the data into train, validation, and test sets\n    X = df.drop(last_column, axis=1)  # Assuming \"SeriousDlqin2yrs\" is the target variable\n    scaler = StandardScaler()\n    X = scaler.fit_transform(X)\n    y = df[last_column]\n\n    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)\n    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42)\n\n#     y_train = y_train.astype('int64')\n#     y_test = y_test.astype('int64')\n#     y_val = y_val.astype('int64')\n\n    y_train = (y_train.values.reshape(-1) == 1).astype('int64')\n    y_test = (y_test.values.reshape(-1) == 1).astype('int64')\n    y_val = (y_val.values.reshape(-1) == 1).astype('int64')\n\n    # Remove the ARFF file\n    os.remove(arff_file_path)\n\n    # Remove the data directory\n    shutil.rmtree(data_dir)\n\n    return dict(\n        X_train=X_train.astype('float32'), y_train=y_train,\n        X_valid=X_val.astype('float32'), y_valid=y_val,\n        X_test=X_test.astype('float32'), y_test=y_test\n    )\n","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:12.457311Z","iopub.execute_input":"2024-04-26T08:23:12.457680Z","iopub.status.idle":"2024-04-26T08:23:12.696111Z","shell.execute_reply.started":"2024-04-26T08:23:12.457640Z","shell.execute_reply":"2024-04-26T08:23:12.695136Z"},"trusted":true},"execution_count":48,"outputs":[]},{"cell_type":"code","source":"#**class Dataset:**\n\nREAL_DATASETS = {\n    'A9A': fetch_A9A,\n    'EPSILON': fetch_EPSILON,\n    'PROTEIN': fetch_PROTEIN,\n    'YEAR': fetch_YEAR,\n    'MICROSOFT': fetch_MICROSOFT,\n    'YAHOO': fetch_YAHOO,\n    'CLICK': fetch_CLICK,\n    'GLASS': fetch_GLASS,\n    'COVTYPE': fetch_COVTYPE,\n    'ALOI': fetch_ALOI,\n    'DIGITS': fetch_DIGITS,\n    'MUSH': fetch_MUSHROOMS,\n    'TTT': fetch_TICTACTOE,\n    ####### 10 latest UCI 
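\n# Example usage (a minimal sketch): each fetcher downloads from api.openml.org,\n# builds the splits, then deletes its temporary directory. Uncomment to smoke-test:\n#\n# splits = fetch_openml_pol_data()\n# for k in ['X_train', 'X_valid', 'X_test']:\n#     print(k, splits[k].shape, splits[k.replace('X', 'y')].shape)\n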
","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:12.457311Z","iopub.execute_input":"2024-04-26T08:23:12.457680Z","iopub.status.idle":"2024-04-26T08:23:12.696111Z","shell.execute_reply.started":"2024-04-26T08:23:12.457640Z","shell.execute_reply":"2024-04-26T08:23:12.695136Z"},"trusted":true},"execution_count":48,"outputs":[]},{"cell_type":"code","source":"#**class Dataset:**\n\nREAL_DATASETS = {\n    'A9A': fetch_A9A,\n    'EPSILON': fetch_EPSILON,\n    'PROTEIN': fetch_PROTEIN,\n    'YEAR': fetch_YEAR,\n    'MICROSOFT': fetch_MICROSOFT,\n    'YAHOO': fetch_YAHOO,\n    'CLICK': fetch_CLICK,\n    'GLASS': fetch_GLASS,\n    'COVTYPE': fetch_COVTYPE,\n    'ALOI': fetch_ALOI,\n    'DIGITS': fetch_DIGITS,\n    'MUSH': fetch_MUSHROOMS,\n    'TTT': fetch_TICTACTOE,\n    ####### 10 latest UCI datasets ########\n    'ADULT': fetch_ADULT,\n    'bank_marketing': fetch_bank_marketing,\n    'credit_card_defaults': fetch_credit_card_defaults,\n    'gamma_telescope': fetch_gamma_telescope,\n    'rice_dataset': fetch_rice_dataset,\n    'german_credit_data': fetch_german_credit_data,\n    'spambase_dataset': fetch_spambase_dataset,\n    'accelerometer_gyro_dataset': fetch_accelerometer_gyro_dataset,\n    'swarm_behaviour': fetch_swarm_behaviour,\n    'HIGGS': fetch_HIGGS,\n    ######## OpenML Tabular Datasets ##########\n    'OpenML_Credit': fetch_openml_credit_data,\n    'OpenML_Electricity': fetch_openml_electricity_data,\n    'OpenML_Covertype': fetch_openml_covertype_data,\n    'OpenML_Pol': fetch_openml_pol_data,\n    'OpenML_House_16H': fetch_openml_house_16H_data,\n    'OpenML_MiniBooNE': fetch_openml_MiniBooNE_data,\n    'OpenML_Eye_movements': fetch_openml_eye_movements_data,\n    'OpenML_Diabetes130US': fetch_openml_Diabetes130US_data,\n    'OpenML_Jannis': fetch_openml_jannis_data,\n    'OpenML_Bioresponse': fetch_openml_Bioresponse_data,\n    'OpenML_California': fetch_openml_california_data,\n    'OpenML_Heloc': fetch_openml_heloc_data\n}\n\nTOY_DATASETS = [\n    'xor',\n    'reg-xor',\n    'swissroll',\n]\n\nclass Dataset:\n    def __init__(self, dataset, data_path='./DATA', normalize=False, normalize_target=False, quantile_transform=False, quantile_noise=1e-3, in_features=None, out_features=None, flatten=False, **kwargs):\n        \"\"\"\n        Dataset is a data container that holds all training and evaluation data required for an experiment\n        :param dataset: a pre-defined dataset name (see REAL_DATASETS) or a custom dataset\n            Your dataset should be at (or will be downloaded into) {data_path}/{dataset}\n        :param data_path: a shared data folder path where the dataset is stored (or will be downloaded into)\n        :param normalize: standardize features by removing the mean and scaling to unit variance\n        :param normalize_target: standardize the target value in the same way\n        :param quantile_transform: whether to transform the feature distributions into normal distributions, using a quantile transform\n        :param quantile_noise: magnitude of the quantile noise\n        :param in_features: which features to use as inputs\n        :param out_features: which features to reconstruct as output\n        :param flatten: whether to flatten instances to vectors\n        :param kwargs: depending on the dataset, you may select train size, test size or other params\n        \"\"\"\n\n        if dataset in REAL_DATASETS:\n            data_dict = REAL_DATASETS[dataset](Path(data_path) / dataset, **kwargs)\n\n            self.X_train = data_dict['X_train']\n            self.y_train = data_dict['y_train']\n            self.X_valid = data_dict['X_valid']\n            self.y_valid = data_dict['y_valid']\n            self.X_test = data_dict['X_test']\n            self.y_test = data_dict['y_test']\n\n            if flatten:\n                self.X_train, self.X_valid, self.X_test = self.X_train.reshape(len(self.X_train), -1), self.X_valid.reshape(len(self.X_valid), -1), self.X_test.reshape(len(self.X_test), -1)\n\n            if normalize:\n                print(\"Normalize dataset\")\n                axis = [0] + [i + 2 for i in range(self.X_train.ndim - 2)]\n                self.mean = np.mean(self.X_train, axis=tuple(axis), dtype=np.float32)\n                self.std = np.std(self.X_train, axis=tuple(axis), dtype=np.float32)\n\n                # if a feature is constant, set its std to 1 to avoid division by zero\n                self.std[self.std == 0.] = 1.\n\n                if dataset not in ['ALOI']:\n                    self.X_train = (self.X_train - self.mean) / self.std\n                    self.X_valid = (self.X_valid - self.mean) / self.std\n                    self.X_test = (self.X_test - self.mean) / self.std\n\n            if quantile_transform:\n                quantile_train = np.copy(self.X_train)\n                if quantile_noise:\n                    stds = np.std(quantile_train, axis=0, keepdims=True)\n                    noise_std = quantile_noise / np.maximum(stds, quantile_noise)\n                    quantile_train += noise_std * np.random.randn(*quantile_train.shape)\n\n                qt = QuantileTransformer(output_distribution='normal').fit(quantile_train)\n                self.X_train = qt.transform(self.X_train)\n                self.X_valid = qt.transform(self.X_valid)\n                self.X_test = qt.transform(self.X_test)\n\n            if normalize_target:\n                print(\"Normalize target value\")\n                self.mean_y = np.mean(self.y_train, axis=0, dtype=np.float32)\n                self.std_y = np.std(self.y_train, axis=0, dtype=np.float32)\n\n                # if the target is constant, set its std to 1 to avoid division by zero\n                if self.std_y == 0.:\n                    self.std_y = 1.\n\n                self.y_train = (self.y_train - self.mean_y) / self.std_y\n                self.y_valid = (self.y_valid - self.mean_y) / self.std_y\n                self.y_test = (self.y_test - self.mean_y) / self.std_y\n\n            if in_features is not None:\n                self.X_train_in, self.X_valid_in, self.X_test_in = self.X_train[:, in_features], self.X_valid[:, in_features], self.X_test[:, in_features]\n\n            if out_features is not None:\n                self.X_train_out, self.X_valid_out, self.X_test_out = self.X_train[:, out_features], self.X_valid[:, out_features], self.X_test[:, out_features]\n\n        elif dataset in TOY_DATASETS:\n            data_dict = toy_dataset(distr=dataset, **kwargs)\n\n            self.X = data_dict['X']\n            self.Y = data_dict['Y']\n            if 'labels' in data_dict:\n                self.labels = data_dict['labels']\n\n        self.data_path = data_path\n        self.dataset = dataset\n\n\nclass TorchDataset(torch.utils.data.Dataset):\n\n    def __init__(self, *data, **options):\n        n_data = len(data)\n        if n_data == 0:\n            raise ValueError(\"At least one set required as input\")\n\n        self.data = data\n        means = options.pop('means', None)\n        stds = options.pop('stds', None)\n        self.transform = options.pop('transform', None)\n        self.test = options.pop('test', False)\n\n        if options:\n            raise TypeError(\"Invalid parameters passed: %s\" % str(options))\n\n        if means is not None:\n            assert stds is not None, \"must specify both <means> and <stds>\"\n            self.normalize = lambda data: [(d - m) / s for d, m, s in zip(data, means, stds)]\n        else:\n            self.normalize = lambda data: data\n\n    def __len__(self):\n        return len(self.data[0])\n\n    def __getitem__(self, idx):\n        data = self.normalize([s[idx] for s in self.data])\n        if self.transform:\n            if self.test:\n                data = sum([[self.transform.test_transform(d)] * 2 for d in data], [])\n            else:\n                data = sum([self.transform(d) for d in data], [])\n        return data
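\n\n# Example usage (a minimal sketch, mirroring the training cell below): wrap a\n# fetched Dataset's arrays in TorchDataset and hand them to a DataLoader.\n#\n# data = Dataset('OpenML_Heloc', normalize=True)\n# loader = torch.utils.data.DataLoader(TorchDataset(data.X_train, data.y_train), batch_size=256, shuffle=True)\n# xb, yb = next(iter(loader))\n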
","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:12.741135Z","iopub.execute_input":"2024-04-26T08:23:12.741455Z","iopub.status.idle":"2024-04-26T08:23:12.770638Z","shell.execute_reply.started":"2024-04-26T08:23:12.741431Z","shell.execute_reply":"2024-04-26T08:23:12.769630Z"},"trusted":true},"execution_count":49,"outputs":[]},{"cell_type":"markdown","source":"**DLGN Model**\n\nThe DLGN keeps two parallel stacks: a gating network that produces pre-activation gate scores, and a linear value network whose layer outputs are modulated by the soft gates sigmoid(beta * gate_score).","metadata":{}},{"cell_type":"code","source":"def set_npseed(seed):\n    np.random.seed(seed)\n\n\ndef set_torchseed(seed):\n    torch.manual_seed(seed)\n    torch.cuda.manual_seed(seed)\n    torch.cuda.manual_seed_all(seed)\n    torch.backends.cudnn.deterministic = True\n    torch.backends.cudnn.benchmark = False\n\n\nclass DLGN_FC(nn.Module):\n    def __init__(self, input_dim=None, output_dim=None, num_hidden_nodes=[], beta=30, dlgn_mode='dlgn_sf', mode='pwc'):\n        super(DLGN_FC, self).__init__()\n        self.num_hidden_layers = len(num_hidden_nodes)\n        self.beta = beta  # soft-gating parameter: larger beta -> harder gates\n        self.dlgn_mode = dlgn_mode\n        self.mode = mode\n        self.num_nodes = [input_dim] + num_hidden_nodes + [output_dim]\n        self.gating_layers = nn.ModuleList()\n        self.value_layers = nn.ModuleList()\n\n        for i in range(self.num_hidden_layers+1):\n            if i != self.num_hidden_layers:\n                if self.dlgn_mode == 'dlgn_sf':\n                    # shallow gating: every gating layer reads the raw input\n                    temp = nn.Linear(self.num_nodes[0], self.num_nodes[i+1], bias=False)\n                else:\n                    temp = nn.Linear(self.num_nodes[i], self.num_nodes[i+1], bias=False)\n                self.gating_layers.append(temp)\n            temp = nn.Linear(self.num_nodes[i], self.num_nodes[i+1], bias=False)\n            self.value_layers.append(temp)\n\n    def set_parameters_with_mask(self, to_copy, parameter_masks):\n        # self and to_copy are DLGN_FC objects with the same architecture\n        # parameter_masks is compatible with dict(to_copy.named_parameters())\n        for (name, copy_param) in to_copy.named_parameters():\n            copy_param = copy_param.clone().detach()\n            orig_param = self.state_dict()[name]\n            if name in parameter_masks:\n                param_mask = parameter_masks[name] > 0\n                orig_param[param_mask] = copy_param[param_mask]\n            else:\n                # no mask for this parameter: copy the whole tensor in place\n                orig_param.copy_(copy_param)\n\n    def return_gating_functions(self):\n        effective_weights = []\n        for i in range(self.num_hidden_layers):\n            curr_weight = self.gating_layers[i].weight.detach().clone()\n            curr_weight /= torch.norm(curr_weight, dim=1, keepdim=True)\n            if self.dlgn_mode == 'dlgn_sf':\n                effective_weights.append(curr_weight)\n            else:\n                if i == 0:\n                    effective_weights.append(curr_weight)\n                else:\n                    # compose with the previous layers to express each gate as a linear function of the input\n                    effective_weights.append(torch.matmul(curr_weight, effective_weights[-1]))\n        # effective_weights is a list of size num_hidden_layers\n        return effective_weights\n\n    def forward(self, x):\n        gate_scores = [x]\n        # run on whatever device the model parameters live on\n        device = next(self.parameters()).device\n        if self.mode == 'pwc':\n            values = [torch.ones(x.shape).to(device)]\n        else:\n            values = [x.to(device)]\n\n        for i in range(self.num_hidden_layers):\n            if self.dlgn_mode == 'dlgn_sf':\n                gate_scores.append(x @ self.gating_layers[i].weight.T)\n            else:\n                gate_scores.append(self.gating_layers[i](gate_scores[-1].to(device)))\n            # soft gate in [0, 1]; the value path is purely linear otherwise\n            curr_gate_on_off = torch.sigmoid(self.beta * gate_scores[-1])\n            values.append(self.value_layers[i](values[-1]) * curr_gate_on_off)\n        values.append(self.value_layers[self.num_hidden_layers](values[-1]))\n        # values is a list of size 1+num_hidden_layers+1\n        # gate_scores is a list of size 1+num_hidden_layers\n        return values, gate_scores
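\n\n# A minimal smoke test (a sketch; the shapes are illustrative, not from the\n# experiments below): each hidden value is modulated by sigmoid(beta * gate_score).\n#\n# net = DLGN_FC(input_dim=3, output_dim=1, num_hidden_nodes=[8, 8], beta=30, dlgn_mode='dlgn_sf')\n# values, gate_scores = net(torch.randn(4, 3))\n# print(values[-1].shape)   # torch.Size([4, 1])\n# print(len(gate_scores))   # 1 + num_hidden_layers = 3\n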
","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:13.804853Z","iopub.execute_input":"2024-04-26T08:23:13.805186Z","iopub.status.idle":"2024-04-26T08:23:13.824500Z","shell.execute_reply.started":"2024-04-26T08:23:13.805163Z","shell.execute_reply":"2024-04-26T08:23:13.823476Z"},"trusted":true},"execution_count":50,"outputs":[]},{"cell_type":"code","source":"#from src.optimization import train_stochastic, evaluate\n\ndef train_stochastic(dataloader, model, optimizer, criterion, criterion2, epoch, parameter_mask=dict(), monitor=None, prog_bar=True):\n    \"\"\" One epoch of gradient-masked training: criterion is the training loss,\n    criterion2 the 0-1 loss reported in the progress bar. \"\"\"\n    model.train()\n\n    last_iter = epoch * len(dataloader)\n    train_obj = 0.\n    total_losses = 0.\n    if prog_bar:\n        pbar = tqdm(dataloader)\n    else:\n        pbar = dataloader\n\n    for i, batch in enumerate(pbar):\n        optimizer.zero_grad()\n\n        t_x, t_y = batch\n        t_x = t_x.to(device)\n        t_y = t_y.to(device)\n        if len(t_x) > 1:\n            if t_y.dim() > 2:  # predictors support only flattened output atm\n                t_y = t_y.view(len(t_y), -1).to(device)\n\n            values, gate_scores = model(t_x)\n            y_pred = torch.sigmoid(values[-1])\n            y_pred = y_pred.squeeze()\n            loss = criterion(y_pred, t_y) / len(t_x)\n            train_obj += loss.cpu().detach().numpy()\n\n            # also track the 0-1 loss for reporting\n            y_pred1 = y_pred.clone()\n            y_pred1[y_pred > 0.5] = 1\n            y_pred1[y_pred <= 0.5] = 0\n            y_pred1 = y_pred1.detach()\n            loss2 = criterion2(y_pred1, t_y) / len(t_x)\n            total_losses += loss2.cpu().detach()\n\n            if prog_bar:\n                pbar.set_description(\"avg train loss %f\" % (total_losses / (i + 1)))\n            loss.backward()\n            # mask the gradients; a missing key means the parameter is updated freely\n            for name, param in model.named_parameters():\n                if name in parameter_mask:\n                    param.grad *= parameter_mask[name].to(device)\n\n            optimizer.step()\n\n            if monitor:\n                monitor.write(model, i + last_iter, report_tree=True, train={\"Loss\": loss.cpu().detach()})\n
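\n# Example (a sketch): parameter_mask entries multiply gradients elementwise, so an\n# all-zeros mask freezes a tensor and an all-ones mask leaves it trainable.\n# E.g., to train only the value network of a DLGN_FC model:\n#\n# mask = {name: (torch.ones_like(p) if 'value' in name else torch.zeros_like(p))\n#         for name, p in model.named_parameters()}\n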
\ndef evaluate(dataloader, model, criteria, epoch=None, monitor=None):\n\n    model.eval()\n    total_losses = {k: 0. for k in criteria.keys()}\n    num_points = 0\n    for batch in dataloader:\n\n        t_x, t_y = batch\n        t_x = t_x.to(device)\n        t_y = t_y.to(device)\n\n        if t_y.dim() > 2:  # predictors support only flattened output atm\n            t_y = t_y.view(len(t_y), -1)\n\n        num_points += len(t_x)\n\n        values, gate_scores = model(t_x)\n\n        # threshold the sigmoid output at 0.5 to get hard predictions\n        y_pred = torch.sigmoid(values[-1])\n        y_pred = y_pred.squeeze()\n        y_pred[y_pred > 0.5] = 1\n        y_pred[y_pred <= 0.5] = 0\n        y_pred = y_pred.detach()\n\n        for k in criteria.keys():\n            loss = criteria[k](y_pred, t_y)\n            total_losses[k] += loss.cpu().detach()\n\n    if monitor:\n        monitor.write(model, epoch, val={k: loss / num_points for k, loss in total_losses.items()})\n\n    return {k: loss.cpu().numpy() / num_points for k, loss in total_losses.items()}","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:14.248926Z","iopub.execute_input":"2024-04-26T08:23:14.249737Z","iopub.status.idle":"2024-04-26T08:23:14.266894Z","shell.execute_reply.started":"2024-04-26T08:23:14.249703Z","shell.execute_reply":"2024-04-26T08:23:14.266008Z"},"trusted":true},"execution_count":51,"outputs":[]},{"cell_type":"code","source":"def train_dlgn_classification(DLGN_obj, trainloader, valloader, num_epoch=1, parameter_mask=dict()):\n    # DLGN_obj is the initial network\n    # parameter_mask is a dictionary compatible with dict(DLGN_obj.named_parameters());\n    # if a key corresponding to a named_parameter is not present it is assumed to be\n    # all ones (i.e. that parameter will be updated)\n    # NOTE: seed, lr, optimizer_name and testloader are read from the notebook scope\n\n    set_torchseed(seed)\n    DLGN_obj.to(device)\n\n    # training loss: BCE; evaluation criterion: error rate\n    loss = nn.BCELoss(reduction=\"sum\")\n    criterion = lambda x, y: loss(x.float(), y.float())\n    eval_criterion = lambda x, y: (x != y).sum()\n\n    if optimizer_name == 'SGD':\n        optimizer = optim.SGD(DLGN_obj.parameters(), lr=lr)\n    elif optimizer_name == 'Adam':\n        optimizer = optim.Adam(DLGN_obj.parameters(), lr=lr)\n\n    losses = []\n    DLGN_obj_store = []\n    test_losses, train_times, test_times = [], [], []\n\n    # fall back to the initial network if validation never improves\n    DLGN_obj_return = deepcopy(DLGN_obj)\n    best_val_loss = float(\"inf\")\n    best_e = -1\n    no_improv = 0\n    t0 = time.time()\n    for e in range(num_epoch):\n        train_stochastic(trainloader, DLGN_obj, optimizer, criterion, eval_criterion, epoch=e, parameter_mask=parameter_mask, monitor=None, prog_bar=False)\n\n        val_loss = evaluate(valloader, DLGN_obj, {'ER': eval_criterion}, epoch=e, monitor=None)\n        no_improv += 1\n        if val_loss[\"ER\"] < best_val_loss:\n            best_val_loss = val_loss[\"ER\"]\n            best_e = e\n            no_improv = 0\n            DLGN_obj_return = deepcopy(DLGN_obj)\n\n        # early stopping with a patience of num_epoch // 5\n        if no_improv == num_epoch // 5:\n            break\n    t1 = time.time()\n\n    print(\"best validation error rate (epoch {}): {}\\n\".format(best_e, best_val_loss))\n\n    
model = DLGN_obj_return\n    t2 = time.time()\n    test_loss = evaluate(testloader, model, {'ER': eval_criterion})\n    t3 = time.time()\n    print(\"test error rate (model of epoch {}): {}\\n\".format(best_e, test_loss['ER']))\n    print(\"test accuracy (model of epoch {}): {}\\n\".format(best_e, 1 - test_loss['ER']))\n\n    return DLGN_obj_return, DLGN_obj_store","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:14.685141Z","iopub.execute_input":"2024-04-26T08:23:14.685481Z","iopub.status.idle":"2024-04-26T08:23:14.698305Z","shell.execute_reply.started":"2024-04-26T08:23:14.685453Z","shell.execute_reply":"2024-04-26T08:23:14.697201Z"},"trusted":true},"execution_count":52,"outputs":[]},{"cell_type":"markdown","source":"**Change this cell to run different configs**","metadata":{}},{"cell_type":"code","source":"weight_decay = 0.0\n\noutput_dim = 1  # number of output units (binary classification)\nmode = 'pwc'  # value-network input mode of DLGN_FC\ndlgn_mode = 'dlgn'  # use 'dlgn_sf' for the DLGN-SF model\nmodel_name = 'DLGN'  # DLGN/DeepnonLinearModel\nbeta = 30\noptimizer_name = 'Adam'\n\nseed = 365\nlr = 0.001\nBATCH_SIZE = 512\n\nnum_hidden_nodes_list = [[100,100,100,100]]  # e.g. [[20,20,20],[50,50,50],[100,100,100],[10,10,10,10],[50,50,50,50],[100,100,100,100,100]]\nlr_list = [0.001]\nnum_epoch = 500\n# DATA_NAME = [\"OpenML_Credit\",\"OpenML_Electricity\",\"OpenML_Covertype\",\"OpenML_Pol\",\"OpenML_House_16H\",\"OpenML_MiniBooNE\",\"OpenML_Eye_movements\",\"OpenML_Diabetes130US\",\"OpenML_Jannis\",\"OpenML_Bioresponse\",\"OpenML_California\",\"OpenML_Heloc\"]\n# DATA_NAME = [\"bank_marketing\",\"credit_card_defaults\",\"gamma_telescope\",\"rice_dataset\",\"german_credit_data\",\"spambase_dataset\",\"accelerometer_gyro_dataset\",\"swarm_behaviour\",\"HIGGS\"]\nDATA_NAME = [\"bank_marketing\"]\n
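\n# Example alternative config (a sketch): run the DLGN-SF variant on an OpenML task\n# by editing the assignments above, e.g.:\n# dlgn_mode = 'dlgn_sf'\n# num_hidden_nodes_list = [[50, 50, 50]]\n# DATA_NAME = [\"OpenML_Electricity\"]\n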
\nfor data_name in DATA_NAME:\n    for num_hidden_nodes in num_hidden_nodes_list:\n        num_hidden_layers = len(num_hidden_nodes)\n        for lr in lr_list:\n            print(\"LR value:\", lr)\n            print(\"=================\", num_hidden_layers, num_hidden_nodes, \"=========================\")\n            data = Dataset(data_name, normalize=True)\n            print('classes', np.unique(data.y_test))\n            set_npseed(seed)\n            set_torchseed(seed)\n\n            save_dir = Path(\"./results/tabular-quantile/\") / data_name / \"seed={}\".format(seed)\n            save_dir.mkdir(parents=True, exist_ok=True)\n\n            input_dim = data.X_train.shape[1]\n            trainloader = torch.utils.data.DataLoader(TorchDataset(data.X_train, data.y_train), batch_size=BATCH_SIZE, shuffle=True)\n            valloader = torch.utils.data.DataLoader(TorchDataset(data.X_valid, data.y_valid), batch_size=BATCH_SIZE, shuffle=False)\n            testloader = torch.utils.data.DataLoader(TorchDataset(data.X_test, data.y_test), batch_size=BATCH_SIZE, shuffle=False)\n\n            DLGN_init = DLGN_FC(input_dim=input_dim, output_dim=1, num_hidden_nodes=num_hidden_nodes, beta=beta, dlgn_mode=dlgn_mode, mode=mode)\n\n            # all-ones masks: update every gating and value layer\n            train_parameter_masks = dict()\n            for name, parameter in DLGN_init.named_parameters():\n                if \"val\" in name:\n                    train_parameter_masks[name] = torch.ones_like(parameter)\n                if \"gat\" in name:\n                    train_parameter_masks[name] = torch.ones_like(parameter)\n                    # train_parameter_masks[name][:num_neurons_set] *= 0.  # freeze a subset of gates\n                train_parameter_masks[name] = train_parameter_masks[name].to(device)\n\n            DLGN_obj_return, DLGN_obj_store = train_dlgn_classification(DLGN_obj=deepcopy(DLGN_init), trainloader=trainloader, valloader=valloader, num_epoch=num_epoch, parameter_mask=train_parameter_masks)\n\n            torch.cuda.empty_cache()\n\n        ## For single run above\n\n        '''\n        ## For multiple runs\n\n        lr_store = [0.001, 0.0001, 0.00001]\n        num_hidden_nodes_store = [[2,2,2],[5,5,5],[10,10,10],[20,20,20],[50,50,50],[100,100,100],[200,200,200],[500,500,500]]\n\n        for lr in lr_store:\n            for num_hidden_nodes in num_hidden_nodes_store:\n                print(\"=========================================================================================\")\n                print(\"LR value:\", lr)\n                print(\"num_hidden_nodes:\", num_hidden_nodes)\n                print(\"=========================================================================================\")\n                DLGN_init = DLGN_FC(input_dim=input_dim, output_dim=1, num_hidden_nodes=num_hidden_nodes, beta=beta)\n                DLGN_obj_return, DLGN_obj_store = train_model_func(DLGN_init, trainloader, valloader, num_epoch, NPF_freeze, NPV_freeze)\n                del DLGN_init\n                del DLGN_obj_store\n        '''","metadata":{"execution":{"iopub.status.busy":"2024-04-26T08:23:40.717114Z","iopub.execute_input":"2024-04-26T08:23:40.717497Z","iopub.status.idle":"2024-04-26T08:24:34.834720Z","shell.execute_reply.started":"2024-04-26T08:23:40.717466Z","shell.execute_reply":"2024-04-26T08:24:34.833834Z"},"trusted":true},"execution_count":53,"outputs":[{"name":"stdout","text":"LR value: 0.001\n================= 4 [100, 100, 100, 100] =========================\n---------------------BANK--------------------------------------\nNormalize dataset\nclasses [0 1]\nbest validation error rate (epoch 8): 0.08497208060208788\n\ntest error rate (model of epoch 8): 0.08642874484098081\n\ntest accuracy (model of epoch 8): 0.9135712551590192\n\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}