{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30699,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"**Import**","metadata":{}},{"cell_type":"code","source":"#Import\n%reset -f\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\nfrom tqdm import tqdm\nimport copy\nimport numpy as np\nnp.set_printoptions(precision=4)\nimport os\nimport random\nfrom copy import deepcopy\nimport torchvision\nimport torchvision.transforms as transforms","metadata":{"execution":{"iopub.status.busy":"2024-04-26T06:11:05.811736Z","iopub.execute_input":"2024-04-26T06:11:05.812626Z","iopub.status.idle":"2024-04-26T06:11:06.161796Z","shell.execute_reply.started":"2024-04-26T06:11:05.812595Z","shell.execute_reply":"2024-04-26T06:11:06.160849Z"},"trusted":true},"execution_count":14,"outputs":[]},{"cell_type":"markdown","source":"**Device**","metadata":{}},{"cell_type":"code","source":"#Device\ndevice = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\nprint(device)","metadata":{"execution":{"iopub.status.busy":"2024-04-26T06:11:06.357292Z","iopub.execute_input":"2024-04-26T06:11:06.357854Z","iopub.status.idle":"2024-04-26T06:11:06.362892Z","shell.execute_reply.started":"2024-04-26T06:11:06.357827Z","shell.execute_reply":"2024-04-26T06:11:06.361880Z"},"trusted":true},"execution_count":15,"outputs":[{"name":"stdout","text":"cuda:0\n","output_type":"stream"}]},{"cell_type":"markdown","source":"**Synthetic Dataset**","metadata":{}},{"cell_type":"code","source":"#Synthetic Dataset\ndef set_npseed(seed):\n    np.random.seed(seed)\n\n\ndef set_torchseed(seed):\n    torch.manual_seed(seed)\n    torch.cuda.manual_seed(seed)\n    torch.cuda.manual_seed_all(seed)\n    torch.backends.cudnn.deterministic = True\n    torch.backends.cudnn.benchmark = False\n\n\n#classification data\n\ndef data_gen_decision_tree(num_data=1000, dim=2, seed=0, w_list=None, b_list=None,vals=None, num_levels=2):        \n    set_npseed(seed=seed)\n\n    # Construct a complete decision tree with 2**num_levels-1 internal nodes,\n    # e.g. num_levels=2 means there are 3 internal nodes.\n    # w_list, b_list is a list of size equal to num_internal_nodes\n    # vals is a list of size equal to num_leaf_nodes, with values +1 or 0\n    num_internal_nodes = 2**num_levels - 1\n    num_leaf_nodes = 2**num_levels\n    stats = np.zeros(num_internal_nodes+num_leaf_nodes) #stores the num of datapoints at each node so at 0(root) all data points will be present\n\n    if vals is None: #when val i.e., labels are not provided make the labels dynamically\n        vals = np.arange(0,num_internal_nodes+num_leaf_nodes,1,dtype=np.int32)%2 #assign 0 or 1 label to the node based on whether its numbering is even or odd\n        vals[:num_internal_nodes] = -99 #we put -99 to the internal nodes as only the values of leaf nodes are counted\n\n    if w_list is None: #if the w values of the nodes (hyperplane eqn) are not provided then generate dynamically\n        w_list = np.random.standard_normal((num_internal_nodes, dim))\n        w_list = w_list/np.linalg.norm(w_list, axis=1)[:, None] #unit norm w vects\n        b_list = np.zeros((num_internal_nodes))\n\n    \n\n#     data_x = np.random.random_sample((num_data, dim))*2 - 1. #generate the datas in range -1 to +1\n#     relevant_stats = data_x @ w_list.T + b_list #stores the x.wT+b value of each nodes for all data points(num_data x num_nodes) to check if > 0 i.e will follow right sub tree route or <0 and will follow left sub tree route\n#     curr_index = np.zeros(shape=(num_data), dtype=int) #stores the curr index for each data point from root to leaf. So initially a datapoint starts from root but then it can go to right or left if it goes to right its curr index will become 2 from 0 else 1 from 0 then in next iteration from say 2 it goes to right then it will become 6\n\n    data_x = np.random.standard_normal((num_data, dim))\n    data_x /= np.sqrt(np.sum(data_x**2, axis=1, keepdims=True))\n    relevant_stats = data_x @ w_list.T + b_list\n    curr_index = np.zeros(shape=(num_data), dtype=int)\n    \n    for level in range(num_levels):\n        nodes_curr_level=list(range(2**level - 1,2**(level+1)-1  ))\n        for el in nodes_curr_level:\n#             b_list[el]=-1*np.median(relevant_stats[curr_index==el,el])\n            relevant_stats[:,el] += b_list[el]\n        decision_variable = np.choose(curr_index, relevant_stats.T) #based on the curr index will choose the corresponding node value of the datapoint\n\n        # Go down and right if wx+b>0 down and left otherwise.\n        # i.e. 0 -> 1 if w[0]x+b[0]<0 and 0->2 otherwise\n        curr_index = (curr_index+1)*2 - (1-(decision_variable > 0)) #update curr index based on the desc_variable\n        \n\n    bound_dist = np.min(np.abs(relevant_stats), axis=1) #finds the abs value of the minm node value of a datapoint. If some node value of a datapoint is 0 then that data point exactly passes through a hyperplane and we remove all such datapoints\n    thres = threshold\n    labels = vals[curr_index] #finally labels for each datapoint is assigned after traversing the whole tree\n\n    data_x_pruned = data_x[bound_dist>thres] #to distingush the hyperplanes seperately for 0 1 labels (classification)\n    #removes all the datapoints that passes through a node hyperplane\n    labels_pruned = labels[bound_dist>thres]\n    relevant_stats = np.sign(data_x_pruned @ w_list.T + b_list) #storing only +1 or -1 for a particular node if it is active or not\n    nodes_active = np.zeros((len(data_x_pruned),  num_internal_nodes+num_leaf_nodes), dtype=np.int32) #stores node actv or not for a data\n\n    for node in range(num_internal_nodes+num_leaf_nodes):\n        if node==0:\n            stats[node]=len(relevant_stats) #for root node all datapoints are present\n            nodes_active[:,0]=1 #root node all data points active status is +1\n            continue\n        parent = (node-1)//2\n        nodes_active[:,node]=nodes_active[:,parent]\n        right_child = node-(parent*2)-1 # 0 means left, 1 means right 1 has children 3,4\n        #finds if it is a right child or left of the parent\n        if right_child==1:\n            nodes_active[:,node] *= relevant_stats[:,parent]>0 #if parent node val was >0 then this right child of parent is active\n        if right_child==0:\n            nodes_active[:,node] *= relevant_stats[:,parent]<0 #else left is active\n        stats = nodes_active.sum(axis=0) #updates the status i.e., no of datapoints active in that node (root has all active then gradually divided in left right)\n    return ((data_x_pruned, labels_pruned), (w_list, b_list, vals), stats)","metadata":{"execution":{"iopub.status.busy":"2024-04-26T06:11:06.982445Z","iopub.execute_input":"2024-04-26T06:11:06.982774Z","iopub.status.idle":"2024-04-26T06:11:07.001878Z","shell.execute_reply.started":"2024-04-26T06:11:06.982748Z","shell.execute_reply":"2024-04-26T06:11:07.000831Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"markdown","source":"**DLGN and DLGN_SF Model**","metadata":{}},{"cell_type":"code","source":"#DLGN and DLGN_SF Model\nclass DLGN_FC(nn.Module):\n    def __init__(self, input_dim=None, output_dim=None, num_hidden_nodes=[], beta=30, dlgn_mode='dlgn_sf', mode='pwc'):\t\t\n        super(DLGN_FC, self).__init__()\n        self.num_hidden_layers = len(num_hidden_nodes)\n        self.beta=beta  # Soft gating parameter\n        self.dlgn_mode = dlgn_mode\n        self.mode = mode\n        self.num_nodes=[input_dim]+num_hidden_nodes+[output_dim]\n        self.gating_layers=nn.ModuleList()\n        self.value_layers=nn.ModuleList()\n\n        for i in range(self.num_hidden_layers+1):\n            if i!=self.num_hidden_layers:\n                if self.dlgn_mode == 'dlgn_sf':\n                    temp = nn.Linear(self.num_nodes[0], self.num_nodes[i+1], bias=False)\n                else :\n                    temp = nn.Linear(self.num_nodes[i], self.num_nodes[i+1], bias=False)\n                self.gating_layers.append(temp)\n            temp = nn.Linear(self.num_nodes[i], self.num_nodes[i+1], bias=False)\n            self.value_layers.append(temp)\n    \n    def set_parameters_with_mask(self, to_copy, parameter_masks):\n        # self and to_copy are DLGN_FC objects with same architecture\n        # parameter_masks is compatible with dict(to_copy.named_parameters())\n        for (name, copy_param) in to_copy.named_parameters():\n            copy_param = copy_param.clone().detach()\n            orig_param  = self.state_dict()[name]\n            if name in parameter_masks:\n                param_mask = parameter_masks[name]>0\n                orig_param[param_mask] = copy_param[param_mask]\n            else:\n                orig_param = copy_param.data.detach()\n\n    def return_gating_functions(self):\n        effective_weights = []\n        for i in range(self.num_hidden_layers):\n            curr_weight = self.gating_layers[i].weight.detach().clone()\n#             curr_weight /= torch.norm(curr_weight, dim=1, keepdim=True)\n            if self.dlgn_mode=='dlgn_sf':\n                effective_weights.append(curr_weight)\n            else:\n                if i==0:\n                    effective_weights.append(curr_weight)\n                else:\n                    effective_weights.append(torch.matmul(curr_weight,effective_weights[-1]))\n        return effective_weights\n        # effective_weights (and effective biases) is a list of size num_hidden_layers\n\n\n    def forward(self, x):\n        gate_scores=[x]\n\n        for el in self.parameters():\n            if el.is_cuda:\n                device = torch.device('cuda:0')\n                # device = torch.device('cpu')\n            else:\n                device = torch.device('cpu')\n        if self.mode=='pwc':\n            values=[torch.ones(x.shape).to(device)]\n        else:\n            values=[x.to(device)]\n\n        for i in range(self.num_hidden_layers):\n            if self.dlgn_mode=='dlgn_sf':\n                gate_scores.append( (x@self.gating_layers[i].weight.T) )\n            else:\n                gate_scores.append(self.gating_layers[i].to(device)(gate_scores[-1].to(device)))\n            curr_gate_on_off = torch.sigmoid(self.beta * gate_scores[-1])\n            values.append(self.value_layers[i](values[-1])*curr_gate_on_off)\n        values.append(self.value_layers[self.num_hidden_layers](values[-1]))\n        # Values is a list of size 1+num_hidden_layers+1\n        #gate_scores is a list of size 1+num_hidden_layers\n        return values,gate_scores","metadata":{"execution":{"iopub.status.busy":"2024-04-26T06:11:07.397466Z","iopub.execute_input":"2024-04-26T06:11:07.397803Z","iopub.status.idle":"2024-04-26T06:11:07.415554Z","shell.execute_reply.started":"2024-04-26T06:11:07.397775Z","shell.execute_reply":"2024-04-26T06:11:07.414596Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"markdown","source":"**Train DLGN Model**","metadata":{}},{"cell_type":"code","source":"#@title Train DLGN model\ndef train_dlgn (DLGN_obj, train_data_curr,vali_data_curr,test_data_curr,\n\t\t\t\ttrain_labels_curr,test_labels_curr,vali_labels_curr,num_epoch=1,\n\t\t\t\tparameter_mask=dict()):\n    # DLGN_obj is the initial network\n    # parameter_mask is a dictionary compatible with dict(DLGN_obj.named_parameters())\n    # if a key corresponding to a named_parameter is not present it is assumed to be all ones (i.e it will be updated)\n\n    # Assuming that we are on a CUDA machine, this should print a CUDA device:\n    # device='cpu'\n    # Speed up of a factor of over 40 by using GPU instead of CPU\n\n    print(\"train_data_curr inside train_dlgn:\",train_data_curr.shape)\n    set_torchseed(seed)\n\n    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n    # device = torch.device('cpu')\n    DLGN_obj.to(device)\n    criterion = nn.CrossEntropyLoss()\n#     optimizer = optim.SGD(DLGN_obj.parameters(), lr=lr)\n    optimizer = optim.Adam(DLGN_obj.parameters(), lr=lr)\n\n    train_data_torch = torch.Tensor(train_data_curr)\n    vali_data_torch = torch.Tensor(vali_data_curr)\n    test_data_torch = torch.Tensor(test_data_curr)\n\n    train_labels_torch = torch.tensor(train_labels_curr, dtype=torch.int64)\n    test_labels_torch = torch.tensor(test_labels_curr, dtype=torch.int64)\n    vali_labels_torch = torch.tensor(vali_labels_curr, dtype=torch.int64)\n\n    num_batches = no_of_batches\n    batch_size = len(train_data_curr)//num_batches\n    losses=[]\n    DLGN_obj_store = []\n    best_vali_error = len(vali_labels_curr)\n    debug_models= []\n    train_losses = []\n    running_loss = 0.7*num_batches # initial random loss = 0.7 \n\n    for epoch in tqdm(range(saved_epochs[-1]+1)):  # loop over the dataset multiple times\n        if epoch in saved_epochs:\n            DLGN_obj_copy = deepcopy(DLGN_obj)\n            DLGN_obj_copy.to(torch.device('cpu'))\n            DLGN_obj_store.append(DLGN_obj_copy)\n            train_outputs_values, train_outputs_gate_scores =DLGN_obj(torch.Tensor(train_data_curr).to(device))\n            train_preds = train_outputs_values[-1]\n            criterion = nn.CrossEntropyLoss()\n            outputs = torch.cat((-1*train_preds,train_preds), dim=1)\n            targets = torch.tensor(train_labels_curr, dtype=torch.int64).to(device)\n            train_loss = criterion(outputs, targets)\n\n            train_losses.append(train_loss)\n            if epoch%100 == 0:\n                print(train_loss)\n            if train_loss < 5e-6:\n                break\n            if np.isnan(train_loss.detach().cpu().numpy()):\n                break\n        running_loss = 0.0\n        for batch_start in range(0,len(train_data_curr),batch_size):\n            if (batch_start+batch_size)>len(train_data_curr):\n                break\n            optimizer.zero_grad()\n            inputs = train_data_torch[batch_start:batch_start+batch_size]\n            targets = train_labels_torch[batch_start:batch_start+batch_size].reshape(batch_size)\n            inputs = inputs.to(device)\n            targets = targets.to(device)\n            values,gate_scores = DLGN_obj(inputs)\n            outputs = torch.cat((-1*values[-1], values[-1]), dim=1)\n            loss = criterion(outputs, targets)\t\t\t\n            loss.backward()\n            for name,param in DLGN_obj.named_parameters():\n                parameter_mask[name] = parameter_mask[name].to(device)\n                param.grad *= parameter_mask[name]   \n                if \"gat\" in name and epoch>x_epoch:\n                    param.grad *= 0.\n            optimizer.step()\n            running_loss += loss.item()\n\n\n        train_outputs_values, train_outputs_gate_scores =DLGN_obj(torch.Tensor(train_data_curr).to(device))\n        train_preds = train_outputs_values[-1]\n        criterion = nn.CrossEntropyLoss()\n        outputs = torch.cat((-1*train_preds,train_preds), dim=1)\n        targets = torch.tensor(train_labels_curr, dtype=torch.int64).to(device)\n        train_loss = criterion(outputs, targets)\n\n        losses.append(train_loss.cpu().detach().clone().numpy())\n        inputs = vali_data_torch.to(device)\n        targets = vali_labels_torch.to(device)\n        values,gate_scores =DLGN_obj(inputs)\n        vali_preds = torch.cat((-1*values[-1], values[-1]), dim=1)\n        vali_preds = torch.argmax(vali_preds, dim=1)\n        vali_error= torch.sum(targets!=vali_preds)\n        if vali_error < best_vali_error:\n            DLGN_obj_return = deepcopy(DLGN_obj)\n            best_vali_error = vali_error\n            \n    DLGN_obj_return.to(torch.device('cpu'))\n    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n    # device = torch.device('cpu')\n    return train_losses, DLGN_obj_return, DLGN_obj_store, losses, debug_models","metadata":{"execution":{"iopub.status.busy":"2024-04-26T06:11:08.108318Z","iopub.execute_input":"2024-04-26T06:11:08.108717Z","iopub.status.idle":"2024-04-26T06:11:08.133767Z","shell.execute_reply.started":"2024-04-26T06:11:08.108686Z","shell.execute_reply":"2024-04-26T06:11:08.132611Z"},"trusted":true},"execution_count":18,"outputs":[]},{"cell_type":"markdown","source":"**Change this cell to run different configs**","metadata":{}},{"cell_type":"code","source":"#In this cell change the parameter values to train on different synthetic data with different models.\n#Dataset Characteristics\n\n'''\nFor SDI input_dim=20, num_data=40000\nDLGN Best Parameters:\nBeta     LR  Hidden Layers       Hidden Nodes            Test Accuracy\n3      0.020        5           [20, 20, 20, 20, 20]         0.9605\n\nDLGN-SF Best Parameters:\nBeta     LR  Hidden Layers        Hidden Nodes           Test Accuracy\n30     0.020        4           [10, 10, 10, 10]              0.9743\n'''\n\n'''\nFor SDII input_dim=100, num_data=60000\nDLGN Best Parameters:\nBeta     LR  Hidden Layers       Hidden Nodes         Test Accuracy\n10     0.010      4             [20, 20, 20, 20]        0.94247\n\nDLGN-SF Best Parameters:\nBeta     LR  Hidden Layers        Hidden Nodes          Test Accuracy\n3      0.020       4             [10, 10, 10, 10]        0.90293\n'''\n\n'''\nFor SDIII input_dim=500, num_data=100000\nDLGN Best Parameters:\nBeta     LR  Hidden Layers       Hidden Nodes         Test Accuracy\n10      0.001      3             [10, 10, 10]          0.65036\n\nDLGN-SF Best Parameters:\nBeta     LR  Hidden Layers        Hidden Nodes        Test Accuracy\n 3     0.010        3             [20, 20, 20]          0.63832\n'''\n\nseed=365\nnum_levels=4\nthreshold = 0 #data seperation distance\n\noptimizer_name ='Adam'\nmodep='pwc' \noutput_dim=1\nnum_epoch=1500 #number of epochs to run\n\nx_epoch = 1500\nsaved_epochs = list(range(0,1501,10))\nweight_decay=0.0\nno_of_batches=10 #[1,10,100]\n\ninput_dim = 20 #Synthetic data input dimension\nnum_data = 40000 #Total data points\n\n\n\nprint(f\"Running code for input_dim={input_dim}, num_data={num_data}\")\n\n((data_x, labels), (w_list, b_list, vals), stats) = data_gen_decision_tree(\n                                            dim=input_dim, seed=seed, num_levels=num_levels,\n                                            num_data=num_data)\nseed_set=seed\nw_list_old = np.array(w_list)\nb_list_old = np.array(b_list)\nprint(sum(labels==1))\nprint(sum(labels==0))\nprint(\"Seed= \",seed_set)\nnum_data = len(data_x)\nnum_train= num_data//2\nnum_vali = num_data//4\nnum_test = num_data//4\ntrain_data = data_x[:num_train,:]\ntrain_data_labels = labels[:num_train]\n\nvali_data = data_x[num_train:num_train+num_vali,:]\nvali_data_labels = labels[num_train:num_train+num_vali]\n\ntest_data = data_x[num_train+num_vali :,:]\ntest_data_labels = labels[num_train+num_vali :]\n\ndlgn_mode = 'dlgn_sf' #use dlgn_sf for running dlgn-sf model\nbeta = 10\nlr = 0.002\n\nnum_hidden_layers = 4\nnum_hidden_nodes = [20,20,20,20]\n\nprint(f\"Running code for num_hidden_layers={num_hidden_layers}, num_hidden_nodes={num_hidden_nodes}\")\n\nmax_no_of_nodes=max(num_hidden_nodes)\n\nset_torchseed(6675)\nDLGN_init= DLGN_FC(input_dim=input_dim, output_dim=1, num_hidden_nodes=num_hidden_nodes, beta=beta, dlgn_mode=dlgn_mode)\n\n\ntrain_parameter_masks=dict()\nfor name,parameter in DLGN_init.named_parameters():\n    if \"val\" in name:\n        train_parameter_masks[name]=torch.ones_like(parameter) #*0.001 # Updating all value network layers\n    if \"gat\" in name:\n        train_parameter_masks[name]=torch.ones_like(parameter)\n\n\n        # train_parameter_masks[name][:num_neurons_set] *= 0.\n    train_parameter_masks[name].to(device)\n\n\n\n\n\n\nset_torchseed(5000)\ntrain_losses, DLGN_obj_final, DLGN_obj_store, losses , debug_models= train_dlgn(train_data_curr=train_data,\n                                            vali_data_curr=vali_data,\n                                            test_data_curr=test_data,\n                                            train_labels_curr=train_data_labels,\n                                            vali_labels_curr=vali_data_labels,\n                                            test_labels_curr=test_data_labels,\n                                            DLGN_obj=deepcopy(DLGN_init),\n                                            parameter_mask=train_parameter_masks,\n                                            )\n\n\ntorch.cuda.empty_cache() \nlosses=np.array(losses)\n\n\ntest_outputs_values, test_outputs_gate_scores =DLGN_obj_final(torch.Tensor(test_data))\ntest_preds = test_outputs_values[-1]\ntest_preds = test_preds.detach().numpy()\nTest_error=np.sum(test_data_labels != (np.sign(test_preds[:,0])+1)//2)\nNum_test_data=len(test_data_labels)\nprint(\"Test_error=\",Test_error)\nprint(\"Num_test_data=\",Num_test_data)\nprint(\"Test Accuracy=\",1-Test_error/Num_test_data)\n","metadata":{"execution":{"iopub.status.busy":"2024-04-26T06:14:40.810969Z","iopub.execute_input":"2024-04-26T06:14:40.811345Z","iopub.status.idle":"2024-04-26T06:15:36.390754Z","shell.execute_reply.started":"2024-04-26T06:14:40.811315Z","shell.execute_reply":"2024-04-26T06:15:36.389275Z"},"trusted":true},"execution_count":20,"outputs":[{"name":"stdout","text":"Running code for input_dim=20, num_data=40000\n18749\n21251\nSeed=  365\nRunning code for num_hidden_layers=4, num_hidden_nodes=[20, 20, 20, 20]\ntrain_data_curr inside train_dlgn: (20000, 20)\n","output_type":"stream"},{"name":"stderr","text":"  0%|          | 3/1501 [00:00<01:01, 24.22it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.6932, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":"  7%|▋         | 105/1501 [00:03<00:51, 27.37it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.2603, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 14%|█▎        | 204/1501 [00:07<00:49, 26.40it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.1798, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 20%|██        | 304/1501 [00:11<00:42, 27.94it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.1358, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 27%|██▋       | 403/1501 [00:15<00:40, 26.92it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0948, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 34%|███▎      | 503/1501 [00:18<00:36, 27.69it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0620, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 40%|████      | 605/1501 [00:22<00:31, 28.26it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0373, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 47%|████▋     | 704/1501 [00:26<00:28, 27.50it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0179, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 53%|█████▎    | 803/1501 [00:29<00:25, 27.45it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.1195, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 60%|██████    | 905/1501 [00:33<00:21, 27.53it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0027, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 67%|██████▋   | 1003/1501 [00:36<00:17, 28.46it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 74%|███████▎  | 1105/1501 [00:40<00:14, 27.64it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 80%|████████  | 1205/1501 [00:44<00:10, 28.36it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 87%|████████▋ | 1305/1501 [00:48<00:06, 28.09it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(8.0779e-05, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":" 93%|█████████▎| 1403/1501 [00:51<00:03, 28.86it/s]","output_type":"stream"},{"name":"stdout","text":"tensor(3.9841e-05, device='cuda:0', grad_fn=<NllLossBackward0>)\n","output_type":"stream"},{"name":"stderr","text":"100%|██████████| 1501/1501 [00:55<00:00, 27.25it/s]\n","output_type":"stream"},{"name":"stdout","text":"tensor(2.0273e-05, device='cuda:0', grad_fn=<NllLossBackward0>)\nTest_error= 924\nNum_test_data= 10000\nTest Accuracy= 0.9076\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}