{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30699,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"**Import**","metadata":{}},{"cell_type":"code","source":"#@title Imports\n%reset -f\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\nfrom tqdm import tqdm\nimport copy\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom mpl_toolkits.axes_grid1 import make_axes_locatable\nfrom itertools import product as cartesian_prod\nfrom sklearn.metrics import pairwise_distances\n\nfrom sklearn import tree\nfrom sklearn import cluster, mixture\nimport zipfile\nimport shutil\nimport urllib.request\nimport numpy as np\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler, LabelEncoder\nfrom sklearn.metrics import accuracy_score\nfrom sklearn.model_selection import train_test_split, GridSearchCV\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.svm import SVC\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neural_network import MLPClassifier\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom scipy.io import arff\n\n\nnp.set_printoptions(precision=4)\n\n\n#@title Importing Packages\nimport os\nimport random\nfrom copy import deepcopy\nimport torchvision\nimport torchvision.transforms as 
#Device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)


#Synthetic Dataset
def set_npseed(seed):
    """Seed NumPy's global random number generator."""
    np.random.seed(seed)


def set_torchseed(seed):
    """Seed PyTorch (CPU and CUDA) and switch cuDNN to deterministic mode."""
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


#classification data

def data_gen_decision_tree(num_data=1000, dim=2, seed=0, w_list=None, b_list=None, vals=None,
                           num_levels=2, threshold=None):
    """Sample unit-norm points and label them with a complete oblique decision tree.

    The tree has ``2**num_levels - 1`` internal nodes and ``2**num_levels`` leaves,
    numbered heap-style (children of node ``i`` are ``2*i + 1`` and ``2*i + 2``).
    Internal node ``i`` sends a point ``x`` to its right child when
    ``w_list[i] @ x + b_list[i] > 0`` and to its left child otherwise.

    Parameters
    ----------
    num_data : int
        Number of points drawn before pruning.
    dim : int
        Input dimension.
    seed : int
        Seed passed to ``set_npseed`` before any sampling.
    w_list : array-like of shape (num_internal_nodes, dim), optional
        Hyperplane normals. When omitted, random unit-norm rows are drawn.
    b_list : array-like of shape (num_internal_nodes,), optional
        Hyperplane offsets. Defaults to zeros. It is honoured whether or not
        ``w_list`` is given.
    vals : array-like, optional
        Node labels. Either one entry per node (internal entries are never read) or
        one entry per leaf, in which case internal entries are filled with -99.
        Defaults to ``node_id % 2`` on leaves and -99 on internal nodes.
    num_levels : int
        Depth of the tree; must be at least 1.
    threshold : float, optional
        Points whose smallest ``|w @ x + b|`` over *all* internal nodes is not
        strictly greater than this value are dropped. When omitted, a module-level
        ``threshold`` variable is used if one exists (the notebook's historical
        behaviour), otherwise 0.

    Returns
    -------
    ((data_x, labels), (w_list, b_list, vals), stats)
        The kept points and their labels, the tree parameters actually used, and
        ``stats[i]`` = number of kept points whose root-to-leaf path visits node ``i``.

    Raises
    ------
    ValueError
        If ``num_levels < 1``.
    """
    if num_levels < 1:
        raise ValueError("num_levels must be >= 1")

    set_npseed(seed=seed)

    num_internal_nodes = 2**num_levels - 1
    num_leaf_nodes = 2**num_levels
    num_nodes = num_internal_nodes + num_leaf_nodes

    if threshold is None:
        # Backward compatibility: older notebook cells configure the margin through a
        # global named `threshold` instead of passing it in.
        threshold = globals().get('threshold', 0.0)

    if vals is None:
        # Leaves alternate 1/0 by node id; internal nodes carry a sentinel.
        vals = np.arange(0, num_nodes, 1, dtype=np.int32) % 2
        vals[:num_internal_nodes] = -99
    else:
        vals = np.asarray(vals)
        if len(vals) == num_leaf_nodes:
            # Leaf-only labels: pad so that vals can be indexed by global node id.
            vals = np.concatenate([np.full(num_internal_nodes, -99), vals])

    # NOTE: the w draw must stay ahead of the data draw so that a given seed keeps
    # producing the same dataset as before.
    if w_list is None:
        w_list = np.random.standard_normal((num_internal_nodes, dim))
        w_list = w_list / np.linalg.norm(w_list, axis=1)[:, None]  # unit-norm normals
    else:
        w_list = np.asarray(w_list, dtype=float)
    if b_list is None:
        b_list = np.zeros(num_internal_nodes)
    else:
        b_list = np.asarray(b_list, dtype=float)

    # Points uniformly distributed on the unit sphere.
    data_x = np.random.standard_normal((num_data, dim))
    data_x /= np.sqrt(np.sum(data_x**2, axis=1, keepdims=True))

    # margins[n, i] = w_i . x_n + b_i. The bias is applied exactly once so that routing,
    # pruning and the node statistics all describe the same tree.
    margins = data_x @ w_list.T + b_list

    # Route every point from the root down to a leaf.
    rows = np.arange(num_data)
    curr_index = np.zeros(num_data, dtype=int)
    for _ in range(num_levels):
        # Fancy indexing (rather than np.choose) has no limit on the number of nodes.
        goes_right = margins[rows, curr_index] > 0
        curr_index = 2 * curr_index + 1 + goes_right

    labels = vals[curr_index]

    # Drop points lying within `threshold` of any hyperplane.
    bound_dist = np.min(np.abs(margins), axis=1)
    keep = bound_dist > threshold
    data_x_pruned = data_x[keep]
    labels_pruned = labels[keep]

    # A node is active for a point when its parent is active and the point took the
    # branch leading to it (even ids are right children in heap numbering).
    right_of = margins[keep] > 0
    nodes_active = np.zeros((len(data_x_pruned), num_nodes), dtype=np.int32)
    nodes_active[:, 0] = 1
    for node in range(1, num_nodes):
        parent = (node - 1) // 2
        took_branch = right_of[:, parent] if node % 2 == 0 else ~right_of[:, parent]
        nodes_active[:, node] = nodes_active[:, parent] * took_branch
    stats = nodes_active.sum(axis=0)

    return ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)
def evaluate_algorithm(train_x=None, train_y=None, test_x=None, test_y=None, algorithms=None):
    """Grid-search a set of scikit-learn classifiers and report CV / test accuracy.

    For each ``(name, estimator)`` pair a 3-fold ``GridSearchCV`` is run on the
    training data; the refit best estimator is then scored on the test data.

    Parameters
    ----------
    train_x, train_y, test_x, test_y : array-like, optional
        Training / test features and labels. Any argument left as ``None`` falls back
        to the notebook global of the same role (``train_data``,
        ``train_data_labels``, ``test_data``, ``test_data_labels``), which keeps the
        original zero-argument call working.
    algorithms : list of (str, estimator), optional
        Classifiers to evaluate. Defaults to SVM, a ReLU MLP, a decision tree and a
        random forest. A name without a predefined grid is evaluated at the
        estimator's current parameters only.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with columns ``Algorithm``, ``Best Val Accuracy``
        (mean cross-validation accuracy of the best configuration) and
        ``Test Accuracy``. The same table is also printed.
    """
    # Fall back to the notebook-level splits when no data is passed explicitly.
    if train_x is None:
        train_x = train_data
    if train_y is None:
        train_y = train_data_labels
    if test_x is None:
        test_x = test_data
    if test_y is None:
        test_y = test_data_labels

    # Algorithms to evaluate
    if algorithms is None:
        algorithms = [
            ("SVM", SVC()),
            ("Relu Neural Networks", MLPClassifier(
                solver='adam',  # Specify 'adam' optimizer
                hidden_layer_sizes=(100, 50, 25),
                activation='relu',
                learning_rate_init=0.001,
                max_iter=1000,
                random_state=42)),
            ("Decision Trees", DecisionTreeClassifier()),
            ("Random Forest", RandomForestClassifier())
        ]

    # Hyperparameter grids, keyed by algorithm name.
    param_grids = {
        "Logistic Regression": {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'max_iter': [100, 1000]
        },
        "SVM": {
            # `degree` is only read by the 'poly' kernel, which is not searched here;
            # sweeping it merely repeated every candidate four times.
            'C': [0.1, 0.5, 1, 2, 5],
            'kernel': ['linear', 'rbf', 'sigmoid'],
        },
        "Naive Bayes": {
            'var_smoothing': [1e-08, 1e-07, 1e-05, 1e-04, 1e-02]
        },
        "K-NN": {
            'n_neighbors': [1, 3, 5, 7, 9, 15],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        },
        "Relu Neural Networks": {
            'hidden_layer_sizes': [(100, 100, 100), (500, 500, 500), (500, 500, 500, 500), (1000, 1000, 1000, 1000)],
            'learning_rate_init': [0.001, 0.005, 0.0001],
            'max_iter': [200, 500, 1000],
        },
        "Decision Trees": {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_depth': [3, 4, 5, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2']
        },
        "Random Forest": {
            'n_estimators': [50, 100],
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 4, 5, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            # 'auto' was an alias of 'sqrt' for classifiers and is rejected by newer
            # scikit-learn releases, so it is not searched.
            'max_features': ['sqrt', 'log2']
        },
        "Nearest Centroid": {
            'metric': ['euclidean']
        },
    }

    results = []

    # Iterate over each algorithm
    for algo_name, algo in algorithms:
        print(f"Running {algo_name}...")

        param_grid = param_grids.get(algo_name)
        if param_grid is None:
            # No predefined grid: evaluate the estimator's current settings only.
            # GridSearchCV needs a sequence per parameter, hence the one-element lists.
            # 'priors' is excluded from the parameter grid.
            param_grid = {k: [v] for k, v in algo.get_params().items() if k != 'priors'}

        clf = GridSearchCV(algo, param_grid, cv=3, verbose=1, n_jobs=-1)
        clf.fit(train_x, train_y)

        best_acc = clf.best_score_
        best_params = clf.best_params_
        print(best_params)

        # GridSearchCV (refit=True by default) has already retrained the best
        # configuration on the full training set, so reuse it instead of fitting again.
        model = clf.best_estimator_

        # Predict on the test set
        y_pred = model.predict(test_x)
        test_acc = accuracy_score(test_y, y_pred)
        print(test_acc)

        # Store the results
        results.append({"Algorithm": algo_name, "Best Val Accuracy": best_acc, "Test Accuracy": test_acc})

    # Create a results DataFrame
    results_df = pd.DataFrame(results)

    # Print the results table
    print("\nResults Table:")
    print(results_df)
    return results_df
# Edit the values in this cell to train on a different synthetic dataset / model setup.

# --- Dataset characteristics ---
seed = 365
num_levels = 4
threshold = 0  # data separation distance (read by the data generator)

# --- Training setup ---
optimizer_name = 'Adam'
modep = 'pwc'
output_dim = 1
num_epoch = 1500  # number of epochs to run

x_epoch = 1500
saved_epochs = list(range(0, 1501, 10))
weight_decay = 0.0
no_of_batches = 10  # [1,10,100]

input_dim = 20    # Synthetic data input dimension
num_data = 40000  # Total data points


print(f"Running code for input_dim={input_dim}, num_data={num_data}")

# --- Generate the synthetic dataset ---
((data_x, labels), (w_list, b_list, vals), stats) = data_gen_decision_tree(
    num_data=num_data,
    dim=input_dim,
    seed=seed,
    num_levels=num_levels,
)
seed_set = seed
# Keep independent copies of the generating tree's parameters.
w_list_old = np.array(w_list)
b_list_old = np.array(b_list)

# Class balance: count of label 1, then count of label 0.
print((labels == 1).sum())
print((labels == 0).sum())
print("Seed= ", seed_set)

# --- Split: first half train, next quarter validation, remainder test ---
num_data = len(data_x)  # points that survived pruning
num_train = num_data // 2
num_vali = num_data // 4
num_test = num_data // 4
split_points = [num_train, num_train + num_vali]

train_data, vali_data, test_data = np.split(data_x, split_points)
train_data_labels, vali_data_labels, test_data_labels = np.split(labels, split_points)
del split_points  # helper only; keep the cell's namespace as it was

evaluate_algorithm()