{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from torch.optim import SGD, RMSprop\n",
    "from torchvision import datasets, transforms\n",
    "# NOTE(review): torch.cuda.FloatTensor is a legacy tensor-type alias; no use of `dtype`\n",
    "# appears in the cells reviewed so far -- confirm it is still needed before removing it.\n",
    "dtype = torch.cuda.FloatTensor\n",
    "\n",
    "from src.components import *\n",
    "from src.optimizers import *\n",
    "from sklearn import preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torch._C.Generator at 0x7f2adb33ae10>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "# Seed the Python, NumPy and PyTorch random-number generators with the same value.\n",
    "SEED = 2\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)\n",
    "# Last expression of the cell: the torch.Generator it returns is what the cell displays.\n",
    "torch.manual_seed(SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Files already downloaded and verified\n",
      "Files already downloaded and verified\n"
     ]
    }
   ],
   "source": [
    "#https://towardsdatascience.com/handwritten-digit-mnist-pytorch-977b5338e627\n",
    "# Target device. GPU index 6 is preferred, but fall back to the first GPU, or to the CPU,\n",
    "# when that index does not exist on this machine, so the notebook can still run instead of\n",
    "# failing as soon as tensors are moved to a missing device.\n",
    "PREFERRED_GPU_INDEX = 6\n",
    "if torch.cuda.is_available() and torch.cuda.device_count() > PREFERRED_GPU_INDEX:\n",
    "    dev = torch.device('cuda', PREFERRED_GPU_INDEX)\n",
    "elif torch.cuda.is_available():\n",
    "    dev = torch.device('cuda', 0)\n",
    "else:\n",
    "    dev = torch.device('cpu')\n",
    "\n",
    "# Convert images to tensors, then normalize with mean 0.5 and std 0.5.\n",
    "# NOTE(review): only one mean/std value is given for 3-channel images; presumably it is\n",
    "# applied to every channel -- confirm this is the intended CIFAR10 normalization.\n",
    "transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5,), (0.5,))])\n",
    "\n",
    "# Download (if needed) and load the CIFAR10 train and test splits.\n",
    "trainset = datasets.CIFAR10('PATH_TO_STORE_TRAINSET', download=True, train=True, transform=transform)\n",
    "valset = datasets.CIFAR10('PATH_TO_STORE_TESTSET', download=True, train=False, transform=transform)\n",
    "\n",
    "# Materialize both splits as lists of (image, label) tensor pairs that already live on `dev`.\n",
    "trainset = [(x.to(dev), torch.tensor(y, device=dev)) for x,y in trainset]\n",
    "valset = [(x.to(dev), torch.tensor(y, device=dev)) for x,y in valset]\n",
    "\n",
    "# NOTE(review): the test loader is shuffled as well; kept as-is so the random-number stream\n",
    "# (and therefore the recorded results) is unchanged.\n",
    "trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)\n",
    "testloader = torch.utils.data.DataLoader(valset, batch_size=128, shuffle=True)\n",
    "N = len(trainset) #for training\n",
    "# N = len(valset) #for testing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([128, 3, 32, 32])\n",
      "torch.Size([128])\n"
     ]
    }
   ],
   "source": [
    "# Pull a single batch from the test loader to inspect the tensor shapes.\n",
    "dataiter = iter(testloader)\n",
    "# Use the built-in next(): the iterator's .next() method is not available in recent\n",
    "# PyTorch releases, while next(iterator) works on every version.\n",
    "images, labels = next(dataiter)\n",
    "\n",
    "print(images.shape)\n",
    "print(labels.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: run the model with SGD to find the MAP solution.\n",
    "# Step 2 (later): run it again with SGLD to obtain the posterior distribution.\n",
    "\n",
    "# Architecture keyword arguments. Every entry below is commented out, so this dict is empty.\n",
    "model_arch_args = {\n",
    "#     num_inputs=iter(trainloader).next()[0].shape[-1]*iter(trainloader).next()[0].shape[-2]*iter(trainloader).next()[0].shape[1],\n",
    "#     num_outputs=10,\n",
    "#     num_layers=2,\n",
    "#     hidden_sizes=[50,50],\n",
    "#     activation_func=nn.ReLU,#nn.Tanh, #nn.ReLU,\n",
    "#     chain_length=4000,\n",
    "#     stochastic_biases=False,\n",
    "#     prior_std = 0.3,\n",
    "#     output_distribution=\"categorical\",\n",
    "#     output_dist_const_params=dict(), #scale=1.0),\n",
    "}\n",
    "\n",
    "# Full keyword-argument set for the model: grouping and dropout options first,\n",
    "# followed by whatever model_arch_args contributes.\n",
    "sgd_model_args = {\n",
    "    'group_by_layers': False,\n",
    "    'use_random_groups': False,\n",
    "    'use_permuted_groups': False,\n",
    "    'max_groups': None,\n",
    "    'dropout_prob': None,\n",
    "    **model_arch_args,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate BayesianResNet20 with the keyword arguments collected in sgd_model_args.\n",
    "sgd_model = BayesianResNet20(**sgd_model_args)\n",
    "\n",
    "# Optimizer configuration: of the four optimizer flags only rmsprop is switched on.\n",
    "optimizer_settings = {\n",
    "    'update_determ': True,\n",
    "    'update_stoch': True,\n",
    "    'lr': 1e-3,  # alternatives left by the author: 1e-8, 1e-5\n",
    "    'rmsprop': True,\n",
    "    'sgd': False,\n",
    "    'sgld': False,\n",
    "    'psgld': False,\n",
    "}\n",
    "sgd_model.initialize_optimizer(**optimizer_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Persist the model keyword arguments to disk.\n",
    "# A context manager is used so the file is flushed and closed as soon as the dump finishes,\n",
    "# rather than leaving the handle open until it happens to be garbage-collected.\n",
    "with open(\"./resnet20_sgd_model_params.pickle\", \"wb\") as params_file:\n",
    "    pickle.dump(sgd_model_args, params_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Move the model to `dev` (the device selected in the data-loading cell).\n",
    "sgd_model = sgd_model.to(dev)\n",
    "\n",
    "# For every StochasticTensor in the model's tensor_dict, also move the loc and scale of its\n",
    "# prior distribution to `dev`.\n",
    "# NOTE(review): presumably these are not moved by sgd_model.to(dev) itself -- confirm.\n",
    "for _name, tensor in sgd_model.tensor_dict.items():\n",
    "    if not isinstance(tensor, StochasticTensor):\n",
    "        continue\n",
    "    prior = tensor.prior_dist\n",
    "    prior.loc = prior.loc.to(dev)\n",
    "    prior.scale = prior.scale.to(dev)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 1 / 2000, Loss: 4421362534.5, CrossEntropy: 1.7264758348464966, Accuracy: 0.350943094629156\n",
      "Iter 2 / 2000, Loss: 3291276845.5, CrossEntropy: 1.2746440172195435, Accuracy: 0.5322290601023018\n",
      "Iter 3 / 2000, Loss: 2680235659.75, CrossEntropy: 1.030050277709961, Accuracy: 0.6300431585677749\n",
      "Iter 4 / 2000, Loss: 2285055144.0, CrossEntropy: 0.8718842267990112, Accuracy: 0.6863890664961637\n",
      "Iter 5 / 2000, Loss: 1982837788.25, CrossEntropy: 0.7512186765670776, Accuracy: 0.733611732736573\n",
      "Iter 6 / 2000, Loss: 1760809752.5, CrossEntropy: 0.6623088121414185, Accuracy: 0.768454283887468\n",
      "Iter 7 / 2000, Loss: 1591924834.25, CrossEntropy: 0.5949235558509827, Accuracy: 0.7922634271099743\n",
      "Iter 8 / 2000, Loss: 1450012311.5, CrossEntropy: 0.5379517674446106, Accuracy: 0.8116008631713555\n",
      "Iter 9 / 2000, Loss: 1335786847.75, CrossEntropy: 0.49239733815193176, Accuracy: 0.8274976023017903\n",
      "Iter 10 / 2000, Loss: 1232274439.5, CrossEntropy: 0.4509517252445221, Accuracy: 0.8416520140664961\n",
      "Iter 11 / 2000, Loss: 1133404593.625, CrossEntropy: 0.4113626182079315, Accuracy: 0.8573329603580563\n",
      "Iter 12 / 2000, Loss: 1047738266.5, CrossEntropy: 0.3769586980342865, Accuracy: 0.8680226982097188\n",
      "Iter 13 / 2000, Loss: 964345700.375, CrossEntropy: 0.3438212275505066, Accuracy: 0.8799552429667519\n",
      "Iter 14 / 2000, Loss: 896155176.25, CrossEntropy: 0.31654056906700134, Accuracy: 0.8888147378516624\n",
      "Iter 15 / 2000, Loss: 812552368.5, CrossEntropy: 0.28320184350013733, Accuracy: 0.9012907608695653\n",
      "Iter 16 / 2000, Loss: 752092529.75, CrossEntropy: 0.2588290572166443, Accuracy: 0.9079763427109975\n",
      "Iter 17 / 2000, Loss: 697492774.8125, CrossEntropy: 0.23702989518642426, Accuracy: 0.9169876918158568\n",
      "Iter 18 / 2000, Loss: 637185400.3125, CrossEntropy: 0.21294960379600525, Accuracy: 0.9240489130434784\n",
      "Iter 19 / 2000, Loss: 595524482.375, CrossEntropy: 0.19628016650676727, Accuracy: 0.9307424872122763\n",
      "Iter 20 / 2000, Loss: 548104821.0625, CrossEntropy: 0.17719672620296478, Accuracy: 0.9372642263427109\n",
      "Iter 21 / 2000, Loss: 504946801.5, CrossEntropy: 0.1600395292043686, Accuracy: 0.943869884910486\n",
      "Iter 22 / 2000, Loss: 468633526.25, CrossEntropy: 0.1453671157360077, Accuracy: 0.9482336956521739\n",
      "Iter 23 / 2000, Loss: 447089944.3125, CrossEntropy: 0.1367846429347992, Accuracy: 0.9514585997442455\n",
      "Iter 24 / 2000, Loss: 415171909.3125, CrossEntropy: 0.12402287125587463, Accuracy: 0.9552429667519181\n",
      "Iter 25 / 2000, Loss: 385903064.40625, CrossEntropy: 0.11222892254590988, Accuracy: 0.959494884910486\n",
      "Iter 26 / 2000, Loss: 377662948.71875, CrossEntropy: 0.10895416140556335, Accuracy: 0.9604699488491049\n",
      "Iter 27 / 2000, Loss: 360847250.46875, CrossEntropy: 0.1022074967622757, Accuracy: 0.9634271099744245\n",
      "Iter 28 / 2000, Loss: 346743179.625, CrossEntropy: 0.09662183374166489, Accuracy: 0.9648377557544756\n",
      "Iter 29 / 2000, Loss: 330009482.75, CrossEntropy: 0.08984000980854034, Accuracy: 0.9685461956521739\n",
      "Iter 30 / 2000, Loss: 316161574.5625, CrossEntropy: 0.08436132967472076, Accuracy: 0.9703204923273657\n",
      "Iter 31 / 2000, Loss: 306310897.78125, CrossEntropy: 0.0803227350115776, Accuracy: 0.9717071611253197\n",
      "Iter 32 / 2000, Loss: 292295061.40625, CrossEntropy: 0.07473962008953094, Accuracy: 0.9732416879795397\n",
      "Iter 33 / 2000, Loss: 291603256.15625, CrossEntropy: 0.07444943487644196, Accuracy: 0.9732656649616368\n",
      "Iter 34 / 2000, Loss: 275917955.8125, CrossEntropy: 0.06822313368320465, Accuracy: 0.9755035166240409\n",
      "Iter 35 / 2000, Loss: 276053091.3125, CrossEntropy: 0.0683625191450119, Accuracy: 0.9770540281329922\n",
      "Iter 36 / 2000, Loss: 266327425.28125, CrossEntropy: 0.06431802362203598, Accuracy: 0.9769501278772379\n",
      "Iter 37 / 2000, Loss: 260121667.875, CrossEntropy: 0.06184309720993042, Accuracy: 0.9782688618925831\n",
      "Iter 38 / 2000, Loss: 253315805.3125, CrossEntropy: 0.05912578105926514, Accuracy: 0.9785366048593351\n",
      "Iter 39 / 2000, Loss: 249484735.90625, CrossEntropy: 0.05761607363820076, Accuracy: 0.9791120524296675\n",
      "Iter 40 / 2000, Loss: 246039297.3125, CrossEntropy: 0.05622851848602295, Accuracy: 0.9800791240409207\n",
      "Iter 41 / 2000, Loss: 244317410.3125, CrossEntropy: 0.05546925216913223, Accuracy: 0.9808543797953965\n",
      "Iter 42 / 2000, Loss: 245710550.875, CrossEntropy: 0.055982306599617004, Accuracy: 0.97965952685422\n",
      "Iter 43 / 2000, Loss: 231421565.875, CrossEntropy: 0.05026863515377045, Accuracy: 0.9825967071611253\n",
      "Iter 44 / 2000, Loss: 229206156.15625, CrossEntropy: 0.04955284297466278, Accuracy: 0.9820692135549872\n",
      "Iter 45 / 2000, Loss: 227273144.78125, CrossEntropy: 0.04865493252873421, Accuracy: 0.9829124040920717\n",
      "Iter 46 / 2000, Loss: 224256751.1875, CrossEntropy: 0.04742730036377907, Accuracy: 0.9830043158567775\n",
      "Iter 47 / 2000, Loss: 222954135.9375, CrossEntropy: 0.04688316211104393, Accuracy: 0.9837236253196932\n",
      "Iter 48 / 2000, Loss: 222560381.0, CrossEntropy: 0.046739473938941956, Accuracy: 0.9838315217391305\n",
      "Iter 49 / 2000, Loss: 220294686.46875, CrossEntropy: 0.04578021168708801, Accuracy: 0.9840632992327366\n",
      "Iter 50 / 2000, Loss: 212894531.71875, CrossEntropy: 0.04296747222542763, Accuracy: 0.9849184782608695\n",
      "Iter 51 / 2000, Loss: 212656436.0625, CrossEntropy: 0.04275834187865257, Accuracy: 0.9853820332480818\n",
      "Iter 52 / 2000, Loss: 198255830.625, CrossEntropy: 0.03697028383612633, Accuracy: 0.9865808823529412\n",
      "Iter 53 / 2000, Loss: 208691586.65625, CrossEntropy: 0.04115442559123039, Accuracy: 0.9856098145780052\n",
      "Iter 54 / 2000, Loss: 211260857.0, CrossEntropy: 0.04216769337654114, Accuracy: 0.9853101023017904\n",
      "Iter 55 / 2000, Loss: 197804472.96875, CrossEntropy: 0.036761440336704254, Accuracy: 0.9867806905370844\n",
      "Iter 56 / 2000, Loss: 197729380.78125, CrossEntropy: 0.03681652992963791, Accuracy: 0.9870644181585677\n",
      "Iter 57 / 2000, Loss: 197924414.0625, CrossEntropy: 0.03675810620188713, Accuracy: 0.9871922953964194\n",
      "Iter 58 / 2000, Loss: 198833153.46875, CrossEntropy: 0.03715282306075096, Accuracy: 0.9869685102301791\n",
      "Iter 59 / 2000, Loss: 192255137.65625, CrossEntropy: 0.03454923629760742, Accuracy: 0.9879275895140666\n",
      "Iter 60 / 2000, Loss: 189484099.8125, CrossEntropy: 0.03336172550916672, Accuracy: 0.9883511828644501\n",
      "Iter 61 / 2000, Loss: 194147869.46875, CrossEntropy: 0.03522541746497154, Accuracy: 0.9879795396419437\n",
      "Iter 62 / 2000, Loss: 186063282.6875, CrossEntropy: 0.03197618946433067, Accuracy: 0.988531010230179\n",
      "Iter 63 / 2000, Loss: 186727042.3125, CrossEntropy: 0.032274212688207626, Accuracy: 0.988387148337596\n",
      "Iter 64 / 2000, Loss: 190555983.5625, CrossEntropy: 0.033769942820072174, Accuracy: 0.9887468030690537\n",
      "Iter 65 / 2000, Loss: 182750847.34375, CrossEntropy: 0.03065439499914646, Accuracy: 0.9888387148337596\n",
      "Iter 66 / 2000, Loss: 183892485.59375, CrossEntropy: 0.031075630336999893, Accuracy: 0.9890105498721228\n",
      "Iter 67 / 2000, Loss: 183391490.1875, CrossEntropy: 0.0308770090341568, Accuracy: 0.9893981777493607\n",
      "Iter 68 / 2000, Loss: 180516620.34375, CrossEntropy: 0.02980487048625946, Accuracy: 0.9897738171355498\n",
      "Iter 69 / 2000, Loss: 178651518.875, CrossEntropy: 0.028984438627958298, Accuracy: 0.9907368925831203\n",
      "Iter 70 / 2000, Loss: 180353660.875, CrossEntropy: 0.029636643826961517, Accuracy: 0.9896379475703325\n",
      "Iter 71 / 2000, Loss: 179384827.4375, CrossEntropy: 0.02923939749598503, Accuracy: 0.9898297634271099\n",
      "Iter 72 / 2000, Loss: 179129720.15625, CrossEntropy: 0.029136313125491142, Accuracy: 0.9894581202046037\n",
      "Iter 73 / 2000, Loss: 173565492.375, CrossEntropy: 0.02689475007355213, Accuracy: 0.9902173913043478\n",
      "Iter 74 / 2000, Loss: 176310039.0, CrossEntropy: 0.027992989867925644, Accuracy: 0.9900775255754476\n",
      "Iter 75 / 2000, Loss: 171991526.6875, CrossEntropy: 0.02627274952828884, Accuracy: 0.9912963554987213\n",
      "Iter 76 / 2000, Loss: 174694102.75, CrossEntropy: 0.027347715571522713, Accuracy: 0.99065297314578\n",
      "Iter 77 / 2000, Loss: 171327336.21875, CrossEntropy: 0.0259824488312006, Accuracy: 0.9908248081841433\n",
      "Iter 78 / 2000, Loss: 166906181.28125, CrossEntropy: 0.0241966862231493, Accuracy: 0.9914162404092072\n",
      "Iter 79 / 2000, Loss: 166291391.90625, CrossEntropy: 0.023934587836265564, Accuracy: 0.992127557544757\n",
      "Iter 80 / 2000, Loss: 170670350.15625, CrossEntropy: 0.025677140802145004, Accuracy: 0.9908487851662404\n",
      "Iter 81 / 2000, Loss: 166542739.78125, CrossEntropy: 0.02401430904865265, Accuracy: 0.9919876918158568\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 82 / 2000, Loss: 168586408.75, CrossEntropy: 0.024958312511444092, Accuracy: 0.9919437340153453\n",
      "Iter 83 / 2000, Loss: 169264285.0625, CrossEntropy: 0.025086740031838417, Accuracy: 0.9913882672634271\n",
      "Iter 84 / 2000, Loss: 171243213.1875, CrossEntropy: 0.025881066918373108, Accuracy: 0.9908367966751919\n",
      "Iter 85 / 2000, Loss: 159958355.28125, CrossEntropy: 0.021395662799477577, Accuracy: 0.9928228900255756\n",
      "Iter 86 / 2000, Loss: 168245660.71875, CrossEntropy: 0.024689437821507454, Accuracy: 0.991656010230179\n",
      "Iter 87 / 2000, Loss: 164590853.65625, CrossEntropy: 0.02321067452430725, Accuracy: 0.991775895140665\n",
      "Iter 88 / 2000, Loss: 160675408.875, CrossEntropy: 0.021617017686367035, Accuracy: 0.9930866368286445\n",
      "Iter 89 / 2000, Loss: 159952843.28125, CrossEntropy: 0.021366922184824944, Accuracy: 0.9923753196930947\n",
      "Iter 90 / 2000, Loss: 160407824.5, CrossEntropy: 0.021522538736462593, Accuracy: 0.9921435421994885\n",
      "Iter 91 / 2000, Loss: 160553526.1875, CrossEntropy: 0.021535931155085564, Accuracy: 0.9930067135549873\n",
      "Iter 92 / 2000, Loss: 160019387.46875, CrossEntropy: 0.02131759002804756, Accuracy: 0.9923873081841432\n",
      "Iter 93 / 2000, Loss: 161719941.59375, CrossEntropy: 0.021983832120895386, Accuracy: 0.9921075767263428\n",
      "Iter 94 / 2000, Loss: 158577301.125, CrossEntropy: 0.020737284794449806, Accuracy: 0.9931745524296676\n",
      "Iter 95 / 2000, Loss: 159614659.65625, CrossEntropy: 0.02112733945250511, Accuracy: 0.9927269820971867\n",
      "Iter 96 / 2000, Loss: 161451520.875, CrossEntropy: 0.02185889333486557, Accuracy: 0.9923673273657289\n",
      "Iter 97 / 2000, Loss: 157610520.6875, CrossEntropy: 0.020314356312155724, Accuracy: 0.9928868286445013\n",
      "Iter 98 / 2000, Loss: 158579066.0625, CrossEntropy: 0.020708873867988586, Accuracy: 0.9929947250639386\n",
      "Iter 99 / 2000, Loss: 156122238.25, CrossEntropy: 0.01971273496747017, Accuracy: 0.9933144181585678\n",
      "Iter 100 / 2000, Loss: 159476406.6875, CrossEntropy: 0.021083667874336243, Accuracy: 0.9924112851662403\n",
      "Iter 101 / 2000, Loss: 163082968.96875, CrossEntropy: 0.022465113550424576, Accuracy: 0.9924872122762148\n",
      "Iter 102 / 2000, Loss: 150947828.65625, CrossEntropy: 0.017622344195842743, Accuracy: 0.9939538043478261\n",
      "Iter 103 / 2000, Loss: 155324399.90625, CrossEntropy: 0.019383499398827553, Accuracy: 0.9926550511508951\n",
      "Iter 104 / 2000, Loss: 156139157.84375, CrossEntropy: 0.019810019060969353, Accuracy: 0.993574168797954\n",
      "Iter 105 / 2000, Loss: 154612425.1875, CrossEntropy: 0.019088666886091232, Accuracy: 0.9934502877237851\n",
      "Iter 106 / 2000, Loss: 150112505.34375, CrossEntropy: 0.017330896109342575, Accuracy: 0.9941016624040921\n",
      "Iter 107 / 2000, Loss: 155349765.40625, CrossEntropy: 0.019333994016051292, Accuracy: 0.9928748401534527\n",
      "Iter 108 / 2000, Loss: 159111215.4375, CrossEntropy: 0.020822659134864807, Accuracy: 0.9926670396419437\n",
      "Iter 109 / 2000, Loss: 153309403.9375, CrossEntropy: 0.01849844865500927, Accuracy: 0.9938339194373402\n",
      "Iter 110 / 2000, Loss: 149517362.53125, CrossEntropy: 0.017004815861582756, Accuracy: 0.9943534207161125\n",
      "Iter 111 / 2000, Loss: 154076767.875, CrossEntropy: 0.018779633566737175, Accuracy: 0.993366368286445\n",
      "Iter 112 / 2000, Loss: 153397540.25, CrossEntropy: 0.018509162589907646, Accuracy: 0.9937539961636829\n",
      "Iter 113 / 2000, Loss: 145852585.875, CrossEntropy: 0.01550188846886158, Accuracy: 0.9947130754475704\n",
      "Iter 114 / 2000, Loss: 157003402.15625, CrossEntropy: 0.019927771762013435, Accuracy: 0.993486253196931\n",
      "Iter 115 / 2000, Loss: 149658797.65625, CrossEntropy: 0.016999509185552597, Accuracy: 0.9940537084398977\n",
      "Iter 116 / 2000, Loss: 150741124.875, CrossEntropy: 0.017405247315764427, Accuracy: 0.9937460038363172\n",
      "Iter 117 / 2000, Loss: 147483546.53125, CrossEntropy: 0.016207478940486908, Accuracy: 0.994341432225064\n",
      "Iter 118 / 2000, Loss: 154708115.8125, CrossEntropy: 0.018989894539117813, Accuracy: 0.9934343030690538\n",
      "Iter 119 / 2000, Loss: 147167708.21875, CrossEntropy: 0.015955226495862007, Accuracy: 0.9944653132992327\n",
      "Iter 120 / 2000, Loss: 147247517.59375, CrossEntropy: 0.016013948246836662, Accuracy: 0.9945412404092072\n",
      "Iter 121 / 2000, Loss: 145474461.03125, CrossEntropy: 0.015328054316341877, Accuracy: 0.9945212595907928\n",
      "Iter 122 / 2000, Loss: 146395581.75, CrossEntropy: 0.01566162519156933, Accuracy: 0.9948809143222507\n",
      "Iter 123 / 2000, Loss: 147949625.0, CrossEntropy: 0.016233021393418312, Accuracy: 0.9944653132992327\n",
      "Iter 124 / 2000, Loss: 145599289.53125, CrossEntropy: 0.015285981819033623, Accuracy: 0.9948249680306905\n",
      "Iter 125 / 2000, Loss: 146183896.6875, CrossEntropy: 0.015557708218693733, Accuracy: 0.9949328644501279\n",
      "Iter 126 / 2000, Loss: 148739943.59375, CrossEntropy: 0.016528304666280746, Accuracy: 0.9942255434782609\n",
      "Iter 127 / 2000, Loss: 152841014.46875, CrossEntropy: 0.018243614584207535, Accuracy: 0.9938898657289001\n",
      "Iter 128 / 2000, Loss: 149608281.8125, CrossEntropy: 0.016890184953808784, Accuracy: 0.9945732097186701\n",
      "Iter 129 / 2000, Loss: 143877636.84375, CrossEntropy: 0.014556952752172947, Accuracy: 0.9950647378516624\n",
      "Iter 130 / 2000, Loss: 147533068.1875, CrossEntropy: 0.016019532456994057, Accuracy: 0.9947250639386189\n",
      "Iter 131 / 2000, Loss: 144075897.9375, CrossEntropy: 0.014633812941610813, Accuracy: 0.9952046035805626\n",
      "Iter 132 / 2000, Loss: 148351011.46875, CrossEntropy: 0.016372699290513992, Accuracy: 0.9944413363171356\n",
      "Iter 133 / 2000, Loss: 143642328.6875, CrossEntropy: 0.014431379735469818, Accuracy: 0.9948649296675192\n",
      "Iter 134 / 2000, Loss: 140481572.6875, CrossEntropy: 0.013177091255784035, Accuracy: 0.9954124040920717\n",
      "Iter 135 / 2000, Loss: 143610008.8125, CrossEntropy: 0.0144016919657588, Accuracy: 0.9949848145780051\n",
      "Iter 136 / 2000, Loss: 145250623.5, CrossEntropy: 0.015056449919939041, Accuracy: 0.9949248721227621\n",
      "Iter 137 / 2000, Loss: 145515297.46875, CrossEntropy: 0.015149212442338467, Accuracy: 0.9945851982097187\n",
      "Iter 138 / 2000, Loss: 141334525.375, CrossEntropy: 0.013483921997249126, Accuracy: 0.9951526534526854\n",
      "Iter 139 / 2000, Loss: 146089546.0625, CrossEntropy: 0.015360679477453232, Accuracy: 0.9948649296675192\n",
      "Iter 140 / 2000, Loss: 139092694.96875, CrossEntropy: 0.01255567092448473, Accuracy: 0.9957041240409207\n",
      "Iter 141 / 2000, Loss: 139719072.40625, CrossEntropy: 0.012801817618310452, Accuracy: 0.9955642583120204\n",
      "Iter 142 / 2000, Loss: 147746877.90625, CrossEntropy: 0.01600438728928566, Accuracy: 0.9947450447570333\n",
      "Iter 143 / 2000, Loss: 144342957.78125, CrossEntropy: 0.014631330966949463, Accuracy: 0.9949648337595908\n",
      "Iter 144 / 2000, Loss: 143555318.46875, CrossEntropy: 0.014308251440525055, Accuracy: 0.9953244884910486\n",
      "Iter 145 / 2000, Loss: 146936490.46875, CrossEntropy: 0.015653390437364578, Accuracy: 0.994605179028133\n",
      "Iter 146 / 2000, Loss: 139840202.65625, CrossEntropy: 0.012836961075663567, Accuracy: 0.9956521739130435\n",
      "Iter 147 / 2000, Loss: 143966467.9375, CrossEntropy: 0.014448009431362152, Accuracy: 0.995104699488491\n",
      "Iter 148 / 2000, Loss: 138856612.4375, CrossEntropy: 0.012413926422595978, Accuracy: 0.9956521739130435\n",
      "Iter 149 / 2000, Loss: 139272193.84375, CrossEntropy: 0.01256842352449894, Accuracy: 0.9957320971867007\n",
      "Iter 150 / 2000, Loss: 145130167.5625, CrossEntropy: 0.014891012571752071, Accuracy: 0.9947050831202046\n",
      "Iter 151 / 2000, Loss: 142960253.4375, CrossEntropy: 0.01402549259364605, Accuracy: 0.9952725383631714\n",
      "Iter 152 / 2000, Loss: 140348178.6875, CrossEntropy: 0.01300724782049656, Accuracy: 0.9955123081841433\n",
      "Iter 153 / 2000, Loss: 137226566.21875, CrossEntropy: 0.011719128116965294, Accuracy: 0.9962116368286446\n",
      "Iter 154 / 2000, Loss: 145496924.875, CrossEntropy: 0.015052126720547676, Accuracy: 0.9951806265984655\n",
      "Iter 155 / 2000, Loss: 136494425.0, CrossEntropy: 0.011403977870941162, Accuracy: 0.9960437979539642\n",
      "Iter 156 / 2000, Loss: 137060415.96875, CrossEntropy: 0.011621015146374702, Accuracy: 0.9958639705882353\n",
      "Iter 157 / 2000, Loss: 140625861.15625, CrossEntropy: 0.0130338529124856, Accuracy: 0.9955242966751918\n",
      "Iter 158 / 2000, Loss: 138792811.125, CrossEntropy: 0.01229212898761034, Accuracy: 0.9956641624040921\n",
      "Iter 159 / 2000, Loss: 141450090.6875, CrossEntropy: 0.013403935357928276, Accuracy: 0.9954923273657289\n",
      "Iter 160 / 2000, Loss: 139423381.0, CrossEntropy: 0.0125492038205266, Accuracy: 0.9956321930946291\n",
      "Iter 161 / 2000, Loss: 140896405.15625, CrossEntropy: 0.013127611018717289, Accuracy: 0.9953524616368287\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 162 / 2000, Loss: 136427410.90625, CrossEntropy: 0.011319207958877087, Accuracy: 0.9962835677749361\n",
      "Iter 163 / 2000, Loss: 137939950.4375, CrossEntropy: 0.011977321468293667, Accuracy: 0.9959518861892583\n",
      "Iter 164 / 2000, Loss: 140655295.53125, CrossEntropy: 0.013050086796283722, Accuracy: 0.9955402813299233\n",
      "Iter 165 / 2000, Loss: 140412581.5, CrossEntropy: 0.012890801765024662, Accuracy: 0.9955442774936062\n",
      "Iter 166 / 2000, Loss: 134569155.625, CrossEntropy: 0.010541852563619614, Accuracy: 0.9965233375959079\n",
      "Iter 167 / 2000, Loss: 141762778.125, CrossEntropy: 0.013410837389528751, Accuracy: 0.9953244884910486\n",
      "Iter 168 / 2000, Loss: 137768569.90625, CrossEntropy: 0.011805448681116104, Accuracy: 0.9959638746803069\n",
      "Iter 169 / 2000, Loss: 140961400.53125, CrossEntropy: 0.013084802776575089, Accuracy: 0.9955242966751918\n",
      "Iter 170 / 2000, Loss: 137009133.375, CrossEntropy: 0.01148665975779295, Accuracy: 0.9960837595907929\n",
      "Iter 171 / 2000, Loss: 140005688.3125, CrossEntropy: 0.012676732614636421, Accuracy: 0.9958639705882353\n",
      "Iter 172 / 2000, Loss: 138703027.8125, CrossEntropy: 0.01215207390487194, Accuracy: 0.9957440856777494\n",
      "Iter 173 / 2000, Loss: 139263664.0625, CrossEntropy: 0.012461469508707523, Accuracy: 0.9959598785166242\n",
      "Iter 174 / 2000, Loss: 135133365.1875, CrossEntropy: 0.010710545815527439, Accuracy: 0.9963235294117647\n",
      "Iter 175 / 2000, Loss: 139561089.125, CrossEntropy: 0.012510831467807293, Accuracy: 0.996019820971867\n",
      "Iter 176 / 2000, Loss: 134447836.53125, CrossEntropy: 0.01041528582572937, Accuracy: 0.9965233375959079\n",
      "Iter 177 / 2000, Loss: 141509426.8125, CrossEntropy: 0.013230174779891968, Accuracy: 0.9951446611253197\n",
      "Iter 178 / 2000, Loss: 137642929.53125, CrossEntropy: 0.011709393933415413, Accuracy: 0.9963914641943734\n",
      "Iter 179 / 2000, Loss: 135566850.5625, CrossEntropy: 0.01084177941083908, Accuracy: 0.9962436061381074\n",
      "Iter 180 / 2000, Loss: 138510292.28125, CrossEntropy: 0.012018892914056778, Accuracy: 0.9961237212276215\n",
      "Iter 181 / 2000, Loss: 138558006.34375, CrossEntropy: 0.012022176757454872, Accuracy: 0.995784047314578\n",
      "Iter 182 / 2000, Loss: 135452904.90625, CrossEntropy: 0.01077079027891159, Accuracy: 0.9964434143222506\n",
      "Iter 183 / 2000, Loss: 138968669.0, CrossEntropy: 0.012168423272669315, Accuracy: 0.995724104859335\n",
      "Iter 184 / 2000, Loss: 136917760.40625, CrossEntropy: 0.011340431869029999, Accuracy: 0.9959838554987213\n",
      "Iter 185 / 2000, Loss: 132931798.71875, CrossEntropy: 0.009765677154064178, Accuracy: 0.996551310741688\n",
      "Iter 186 / 2000, Loss: 135962105.5, CrossEntropy: 0.010942833498120308, Accuracy: 0.9962436061381074\n",
      "Iter 187 / 2000, Loss: 136421733.4375, CrossEntropy: 0.011118809692561626, Accuracy: 0.9960837595907929\n",
      "Iter 188 / 2000, Loss: 136604842.96875, CrossEntropy: 0.011186009272933006, Accuracy: 0.996343510230179\n",
      "Iter 189 / 2000, Loss: 134917019.84375, CrossEntropy: 0.010580887086689472, Accuracy: 0.9964394181585678\n",
      "Iter 190 / 2000, Loss: 133529106.75, CrossEntropy: 0.009942086413502693, Accuracy: 0.996403452685422\n",
      "Iter 191 / 2000, Loss: 135677541.5625, CrossEntropy: 0.010806496255099773, Accuracy: 0.9962715792838875\n",
      "Iter 192 / 2000, Loss: 132655696.75, CrossEntropy: 0.009625907056033611, Accuracy: 0.9969709079283888\n",
      "Iter 193 / 2000, Loss: 135385290.90625, CrossEntropy: 0.010666667483747005, Accuracy: 0.9962116368286446\n",
      "Iter 194 / 2000, Loss: 136411496.34375, CrossEntropy: 0.011116821318864822, Accuracy: 0.9963914641943734\n",
      "Iter 195 / 2000, Loss: 136837031.9375, CrossEntropy: 0.011337701231241226, Accuracy: 0.996199648337596\n",
      "Iter 196 / 2000, Loss: 136787696.0625, CrossEntropy: 0.011195863597095013, Accuracy: 0.9962436061381074\n",
      "Iter 197 / 2000, Loss: 132099566.0625, CrossEntropy: 0.009330390952527523, Accuracy: 0.9965113491048594\n",
      "Iter 198 / 2000, Loss: 130390922.65625, CrossEntropy: 0.008624162524938583, Accuracy: 0.996843030690537\n",
      "Iter 199 / 2000, Loss: 134754307.875, CrossEntropy: 0.01036027166992426, Accuracy: 0.9965233375959079\n",
      "Iter 200 / 2000, Loss: 133888747.875, CrossEntropy: 0.010048318654298782, Accuracy: 0.9965712915601024\n",
      "Iter 201 / 2000, Loss: 134886065.9375, CrossEntropy: 0.010435320436954498, Accuracy: 0.9961117327365729\n",
      "Iter 202 / 2000, Loss: 133415878.78125, CrossEntropy: 0.009804406203329563, Accuracy: 0.9967031649616368\n",
      "Iter 203 / 2000, Loss: 133866242.1875, CrossEntropy: 0.009980795904994011, Accuracy: 0.9965233375959079\n",
      "Iter 204 / 2000, Loss: 135331654.125, CrossEntropy: 0.01056216936558485, Accuracy: 0.9967631074168798\n",
      "Iter 205 / 2000, Loss: 135208633.9375, CrossEntropy: 0.010500328615307808, Accuracy: 0.9962835677749361\n",
      "Iter 206 / 2000, Loss: 134192475.40625, CrossEntropy: 0.010087798349559307, Accuracy: 0.9964434143222506\n",
      "Iter 207 / 2000, Loss: 132983755.1875, CrossEntropy: 0.00960585568100214, Accuracy: 0.9966232416879796\n",
      "Iter 208 / 2000, Loss: 131786416.09375, CrossEntropy: 0.009108497761189938, Accuracy: 0.9968630115089514\n",
      "Iter 209 / 2000, Loss: 130791484.46875, CrossEntropy: 0.008701789192855358, Accuracy: 0.9969429347826086\n",
      "Iter 210 / 2000, Loss: 138638029.875, CrossEntropy: 0.011828210204839706, Accuracy: 0.9957640664961637\n",
      "Iter 211 / 2000, Loss: 135979437.875, CrossEntropy: 0.010764792561531067, Accuracy: 0.9961636828644501\n",
      "Iter 212 / 2000, Loss: 130226364.21875, CrossEntropy: 0.00845804437994957, Accuracy: 0.9973225703324808\n",
      "Iter 213 / 2000, Loss: 127653461.75, CrossEntropy: 0.007454404607415199, Accuracy: 0.9974704283887468\n",
      "Iter 214 / 2000, Loss: 137571923.75, CrossEntropy: 0.011386062949895859, Accuracy: 0.9963714833759592\n",
      "Iter 215 / 2000, Loss: 132722795.0625, CrossEntropy: 0.00944314245134592, Accuracy: 0.9965912723785166\n",
      "Iter 216 / 2000, Loss: 130926767.09375, CrossEntropy: 0.008710058405995369, Accuracy: 0.9971427429667519\n",
      "Iter 217 / 2000, Loss: 133625584.53125, CrossEntropy: 0.009802055545151234, Accuracy: 0.9963994565217392\n",
      "Iter 218 / 2000, Loss: 132459333.09375, CrossEntropy: 0.009332072921097279, Accuracy: 0.9966911764705882\n",
      "Iter 219 / 2000, Loss: 131151657.0, CrossEntropy: 0.008782202377915382, Accuracy: 0.9968110613810742\n",
      "Iter 220 / 2000, Loss: 131241904.34375, CrossEntropy: 0.008800450712442398, Accuracy: 0.9973225703324808\n",
      "Iter 221 / 2000, Loss: 133439356.0625, CrossEntropy: 0.00966943334788084, Accuracy: 0.9969429347826086\n",
      "Iter 222 / 2000, Loss: 131283663.46875, CrossEntropy: 0.00880212988704443, Accuracy: 0.996962915601023\n",
      "Iter 223 / 2000, Loss: 133087103.625, CrossEntropy: 0.009515495039522648, Accuracy: 0.9969429347826086\n",
      "Iter 224 / 2000, Loss: 131704965.3125, CrossEntropy: 0.0089688366279006, Accuracy: 0.9967111572890026\n",
      "Iter 225 / 2000, Loss: 131643607.90625, CrossEntropy: 0.008982562460005283, Accuracy: 0.9969789002557545\n",
      "Iter 226 / 2000, Loss: 135785942.65625, CrossEntropy: 0.01057136058807373, Accuracy: 0.9965233375959079\n",
      "Iter 227 / 2000, Loss: 131037260.75, CrossEntropy: 0.008665253408253193, Accuracy: 0.9968230498721228\n",
      "Iter 228 / 2000, Loss: 134016346.1875, CrossEntropy: 0.009859468787908554, Accuracy: 0.9965233375959079\n",
      "Iter 229 / 2000, Loss: 132730311.5, CrossEntropy: 0.009331636130809784, Accuracy: 0.9967031649616368\n",
      "Iter 230 / 2000, Loss: 132961035.28125, CrossEntropy: 0.009438656270503998, Accuracy: 0.9969509271099745\n",
      "Iter 231 / 2000, Loss: 131556937.71875, CrossEntropy: 0.008853199891746044, Accuracy: 0.997170716112532\n",
      "Iter 232 / 2000, Loss: 132894284.25, CrossEntropy: 0.009392790496349335, Accuracy: 0.9966512148337596\n",
      "Iter 233 / 2000, Loss: 131902958.4375, CrossEntropy: 0.008992294780910015, Accuracy: 0.9969309462915601\n",
      "Iter 234 / 2000, Loss: 133155542.0625, CrossEntropy: 0.009461071342229843, Accuracy: 0.9966831841432225\n",
      "Iter 235 / 2000, Loss: 131282165.84375, CrossEntropy: 0.008715412579476833, Accuracy: 0.9968230498721228\n",
      "Iter 236 / 2000, Loss: 131784809.65625, CrossEntropy: 0.008897255174815655, Accuracy: 0.9967830882352942\n",
      "Iter 237 / 2000, Loss: 130975096.625, CrossEntropy: 0.008566506206989288, Accuracy: 0.9971427429667519\n",
      "Iter 238 / 2000, Loss: 129525397.28125, CrossEntropy: 0.007982020266354084, Accuracy: 0.9973425511508951\n",
      "Iter 239 / 2000, Loss: 135865117.21875, CrossEntropy: 0.010505572892725468, Accuracy: 0.9966632033248082\n",
      "Iter 240 / 2000, Loss: 130224707.4375, CrossEntropy: 0.008307392708957195, Accuracy: 0.9974904092071611\n",
      "Iter 241 / 2000, Loss: 137383267.1875, CrossEntropy: 0.011145581491291523, Accuracy: 0.9963315217391304\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 242 / 2000, Loss: 128604048.59375, CrossEntropy: 0.007582635153084993, Accuracy: 0.9971627237851662\n",
      "Iter 243 / 2000, Loss: 132898398.3125, CrossEntropy: 0.009293334558606148, Accuracy: 0.9968030690537084\n",
      "Iter 244 / 2000, Loss: 129726371.34375, CrossEntropy: 0.008019079454243183, Accuracy: 0.9973025895140665\n",
      "Iter 245 / 2000, Loss: 130193844.53125, CrossEntropy: 0.008195937611162663, Accuracy: 0.9974224744245525\n",
      "Iter 246 / 2000, Loss: 131542584.5625, CrossEntropy: 0.008727283217012882, Accuracy: 0.9970428388746803\n",
      "Iter 247 / 2000, Loss: 127440875.0, CrossEntropy: 0.0070918952114880085, Accuracy: 0.9974104859335039\n",
      "Iter 248 / 2000, Loss: 130284508.0, CrossEntropy: 0.008215007372200489, Accuracy: 0.9971027813299232\n",
      "Iter 249 / 2000, Loss: 129975403.8125, CrossEntropy: 0.008079557679593563, Accuracy: 0.9970028772378516\n",
      "Iter 250 / 2000, Loss: 136310063.4375, CrossEntropy: 0.010603840462863445, Accuracy: 0.9965233375959079\n",
      "Iter 251 / 2000, Loss: 132251594.21875, CrossEntropy: 0.008975251577794552, Accuracy: 0.9970028772378516\n",
      "Iter 252 / 2000, Loss: 129813003.03125, CrossEntropy: 0.008031606674194336, Accuracy: 0.9976502557544757\n",
      "Iter 253 / 2000, Loss: 132310985.90625, CrossEntropy: 0.008983864448964596, Accuracy: 0.9971427429667519\n",
      "Iter 254 / 2000, Loss: 132272605.15625, CrossEntropy: 0.008966203778982162, Accuracy: 0.997022858056266\n",
      "Iter 255 / 2000, Loss: 127669088.5, CrossEntropy: 0.007114870008081198, Accuracy: 0.9975423593350383\n",
      "Iter 256 / 2000, Loss: 129721472.71875, CrossEntropy: 0.007956111803650856, Accuracy: 0.9971507352941177\n",
      "Iter 257 / 2000, Loss: 132670391.125, CrossEntropy: 0.009098924696445465, Accuracy: 0.99690297314578\n",
      "Iter 258 / 2000, Loss: 127933835.0, CrossEntropy: 0.007207789923995733, Accuracy: 0.9975423593350383\n",
      "Iter 259 / 2000, Loss: 133255538.96875, CrossEntropy: 0.009319235570728779, Accuracy: 0.9971227621483376\n",
      "Iter 260 / 2000, Loss: 131448438.46875, CrossEntropy: 0.008592171594500542, Accuracy: 0.9973025895140665\n",
      "Iter 261 / 2000, Loss: 126060786.15625, CrossEntropy: 0.006442260928452015, Accuracy: 0.9977101982097187\n",
      "Iter 262 / 2000, Loss: 131554655.5, CrossEntropy: 0.008630050346255302, Accuracy: 0.9970708120204604\n",
      "Iter 263 / 2000, Loss: 130426224.65625, CrossEntropy: 0.008159033954143524, Accuracy: 0.9974624360613811\n",
      "Iter 264 / 2000, Loss: 127876820.84375, CrossEntropy: 0.007133886683732271, Accuracy: 0.9975223785166241\n",
      "Iter 265 / 2000, Loss: 131339389.46875, CrossEntropy: 0.008533076383173466, Accuracy: 0.9972306585677749\n",
      "Iter 266 / 2000, Loss: 131775757.0, CrossEntropy: 0.008734269067645073, Accuracy: 0.9970907928388747\n",
      "Iter 267 / 2000, Loss: 131247844.3125, CrossEntropy: 0.008476008661091328, Accuracy: 0.9972706202046037\n",
      "Iter 268 / 2000, Loss: 131330189.03125, CrossEntropy: 0.008489742875099182, Accuracy: 0.9971827046035806\n",
      "Iter 269 / 2000, Loss: 130971675.6875, CrossEntropy: 0.008334112353622913, Accuracy: 0.9970428388746803\n",
      "Iter 270 / 2000, Loss: 131367280.4375, CrossEntropy: 0.008490498177707195, Accuracy: 0.9971027813299232\n",
      "Iter 271 / 2000, Loss: 128156624.96875, CrossEntropy: 0.007199466694146395, Accuracy: 0.9974824168797954\n",
      "Iter 272 / 2000, Loss: 132340753.5, CrossEntropy: 0.008859820663928986, Accuracy: 0.9973425511508951\n",
      "Iter 273 / 2000, Loss: 129378820.46875, CrossEntropy: 0.007738418877124786, Accuracy: 0.997346547314578\n",
      "Iter 274 / 2000, Loss: 130916508.1875, CrossEntropy: 0.00827798806130886, Accuracy: 0.9969429347826086\n",
      "Iter 275 / 2000, Loss: 129069279.25, CrossEntropy: 0.007530746050179005, Accuracy: 0.9973825127877238\n",
      "Iter 276 / 2000, Loss: 129654696.71875, CrossEntropy: 0.007758865598589182, Accuracy: 0.9973625319693095\n",
      "Iter 277 / 2000, Loss: 130881203.46875, CrossEntropy: 0.008243193849921227, Accuracy: 0.9974224744245525\n",
      "Iter 278 / 2000, Loss: 127130847.59375, CrossEntropy: 0.006744798738509417, Accuracy: 0.99767023657289\n",
      "Iter 279 / 2000, Loss: 130773597.9375, CrossEntropy: 0.008194190450012684, Accuracy: 0.9974304667519182\n",
      "Iter 280 / 2000, Loss: 129369392.84375, CrossEntropy: 0.007617687340825796, Accuracy: 0.997582320971867\n",
      "Iter 281 / 2000, Loss: 129493041.84375, CrossEntropy: 0.0076630376279354095, Accuracy: 0.9973225703324808\n",
      "Iter 282 / 2000, Loss: 125828671.84375, CrossEntropy: 0.00626468425616622, Accuracy: 0.9976982097186702\n",
      "Iter 283 / 2000, Loss: 129181446.875, CrossEntropy: 0.007523904088884592, Accuracy: 0.9974424552429667\n",
      "Iter 284 / 2000, Loss: 128241998.78125, CrossEntropy: 0.0071551683358848095, Accuracy: 0.9974904092071611\n",
      "Iter 285 / 2000, Loss: 128179572.25, CrossEntropy: 0.007115135435014963, Accuracy: 0.9974424552429667\n",
      "Iter 286 / 2000, Loss: 130370430.75, CrossEntropy: 0.007976065389811993, Accuracy: 0.9972626278772379\n",
      "Iter 287 / 2000, Loss: 127033877.21875, CrossEntropy: 0.006634848657995462, Accuracy: 0.9977621483375959\n",
      "Iter 288 / 2000, Loss: 133598774.65625, CrossEntropy: 0.009250480681657791, Accuracy: 0.9968030690537084\n",
      "Iter 289 / 2000, Loss: 129910377.1875, CrossEntropy: 0.00777764618396759, Accuracy: 0.9974304667519182\n",
      "Iter 290 / 2000, Loss: 127254485.5625, CrossEntropy: 0.0067012375220656395, Accuracy: 0.9976422634271099\n",
      "Iter 291 / 2000, Loss: 129127665.71875, CrossEntropy: 0.007441869005560875, Accuracy: 0.9972826086956522\n",
      "Iter 292 / 2000, Loss: 129077636.46875, CrossEntropy: 0.0074409376829862595, Accuracy: 0.9975103900255755\n",
      "Iter 293 / 2000, Loss: 128462955.28125, CrossEntropy: 0.007163264788687229, Accuracy: 0.9972826086956522\n",
      "Iter 294 / 2000, Loss: 128194080.8125, CrossEntropy: 0.0070818825624883175, Accuracy: 0.9974504475703325\n",
      "Iter 295 / 2000, Loss: 128738287.4375, CrossEntropy: 0.007260901387780905, Accuracy: 0.9975223785166241\n",
      "Iter 296 / 2000, Loss: 128452136.6875, CrossEntropy: 0.007137516047805548, Accuracy: 0.9976822250639387\n",
      "Iter 297 / 2000, Loss: 129710416.875, CrossEntropy: 0.00763341598212719, Accuracy: 0.9974824168797954\n",
      "Iter 298 / 2000, Loss: 130554796.625, CrossEntropy: 0.00796777568757534, Accuracy: 0.9973025895140665\n",
      "Iter 299 / 2000, Loss: 128969325.46875, CrossEntropy: 0.007323357742279768, Accuracy: 0.9973425511508951\n",
      "Iter 300 / 2000, Loss: 130724935.625, CrossEntropy: 0.008023886941373348, Accuracy: 0.9973425511508951\n",
      "Iter 301 / 2000, Loss: 129497493.90625, CrossEntropy: 0.007572066504508257, Accuracy: 0.9974704283887468\n",
      "Iter 302 / 2000, Loss: 126885115.34375, CrossEntropy: 0.006470297928899527, Accuracy: 0.9979419757033248\n",
      "Iter 303 / 2000, Loss: 131799356.21875, CrossEntropy: 0.008426998741924763, Accuracy: 0.9972426470588235\n",
      "Iter 304 / 2000, Loss: 128163706.71875, CrossEntropy: 0.006967597641050816, Accuracy: 0.9977621483375959\n",
      "Iter 305 / 2000, Loss: 129870853.125, CrossEntropy: 0.0076435524970293045, Accuracy: 0.9972426470588235\n",
      "Iter 306 / 2000, Loss: 129108268.34375, CrossEntropy: 0.007331651169806719, Accuracy: 0.9973625319693095\n",
      "Iter 307 / 2000, Loss: 128354209.1875, CrossEntropy: 0.007030772976577282, Accuracy: 0.9974424552429667\n",
      "Iter 308 / 2000, Loss: 125383433.53125, CrossEntropy: 0.005830653943121433, Accuracy: 0.9982217071611253\n",
      "Iter 309 / 2000, Loss: 133815760.0, CrossEntropy: 0.009191598743200302, Accuracy: 0.9966831841432225\n",
      "Iter 310 / 2000, Loss: 123388628.5, CrossEntropy: 0.00501934252679348, Accuracy: 0.998141783887468\n",
      "Iter 311 / 2000, Loss: 132529295.15625, CrossEntropy: 0.008664455264806747, Accuracy: 0.9970428388746803\n",
      "Iter 312 / 2000, Loss: 127752516.8125, CrossEntropy: 0.006769981700927019, Accuracy: 0.9976302749360614\n",
      "Iter 313 / 2000, Loss: 127086129.25, CrossEntropy: 0.006474795285612345, Accuracy: 0.9978620524296675\n",
      "Iter 314 / 2000, Loss: 129653719.5625, CrossEntropy: 0.007493560668081045, Accuracy: 0.9971627237851662\n",
      "Iter 315 / 2000, Loss: 126884878.15625, CrossEntropy: 0.006381618790328503, Accuracy: 0.9978620524296675\n",
      "Iter 316 / 2000, Loss: 129997779.8125, CrossEntropy: 0.007629916537553072, Accuracy: 0.9971507352941177\n",
      "Iter 317 / 2000, Loss: 129968735.59375, CrossEntropy: 0.007600716780871153, Accuracy: 0.9974424552429667\n",
      "Iter 318 / 2000, Loss: 127163569.3125, CrossEntropy: 0.006471011321991682, Accuracy: 0.9978021099744245\n",
      "Iter 319 / 2000, Loss: 126003277.5, CrossEntropy: 0.00601185392588377, Accuracy: 0.997910006393862\n",
      "Iter 320 / 2000, Loss: 131448725.5625, CrossEntropy: 0.008170738816261292, Accuracy: 0.9974024936061381\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 321 / 2000, Loss: 130859633.96875, CrossEntropy: 0.007930709980428219, Accuracy: 0.9973625319693095\n",
      "Iter 322 / 2000, Loss: 128231470.53125, CrossEntropy: 0.006896582897752523, Accuracy: 0.9979899296675192\n",
      "Iter 323 / 2000, Loss: 132242568.8125, CrossEntropy: 0.008471230044960976, Accuracy: 0.9970428388746803\n",
      "Iter 324 / 2000, Loss: 127625662.125, CrossEntropy: 0.006617938168346882, Accuracy: 0.9977821291560103\n",
      "Iter 325 / 2000, Loss: 126438353.25, CrossEntropy: 0.006162535399198532, Accuracy: 0.997850063938619\n",
      "Iter 326 / 2000, Loss: 129108961.65625, CrossEntropy: 0.007197102066129446, Accuracy: 0.9975623401534527\n",
      "Iter 327 / 2000, Loss: 129440657.96875, CrossEntropy: 0.0073557449504733086, Accuracy: 0.9974704283887468\n",
      "Iter 328 / 2000, Loss: 128284576.0625, CrossEntropy: 0.006851927377283573, Accuracy: 0.9976023017902813\n",
      "Iter 329 / 2000, Loss: 129962617.5, CrossEntropy: 0.007515594828873873, Accuracy: 0.9973625319693095\n",
      "Iter 330 / 2000, Loss: 127776209.625, CrossEntropy: 0.006680395919829607, Accuracy: 0.9976902173913044\n",
      "Iter 331 / 2000, Loss: 127163091.15625, CrossEntropy: 0.006383445113897324, Accuracy: 0.9980418797953964\n",
      "Iter 332 / 2000, Loss: 127493023.75, CrossEntropy: 0.006509591359645128, Accuracy: 0.9977621483375959\n",
      "Iter 333 / 2000, Loss: 127772164.25, CrossEntropy: 0.006620257161557674, Accuracy: 0.9977821291560103\n",
      "Iter 334 / 2000, Loss: 127661318.0, CrossEntropy: 0.006568091455847025, Accuracy: 0.9977221867007673\n",
      "Iter 335 / 2000, Loss: 127211512.75, CrossEntropy: 0.006376117002218962, Accuracy: 0.9977421675191815\n",
      "Iter 336 / 2000, Loss: 125013402.625, CrossEntropy: 0.005491812247782946, Accuracy: 0.9980418797953964\n",
      "Iter 337 / 2000, Loss: 129295525.875, CrossEntropy: 0.00719731580466032, Accuracy: 0.997582320971867\n",
      "Iter 338 / 2000, Loss: 126698713.4375, CrossEntropy: 0.006158681586384773, Accuracy: 0.9979419757033248\n",
      "Iter 339 / 2000, Loss: 125316861.53125, CrossEntropy: 0.005592662841081619, Accuracy: 0.9980618606138107\n",
      "Iter 340 / 2000, Loss: 130608655.625, CrossEntropy: 0.007700114976614714, Accuracy: 0.9974424552429667\n",
      "Iter 341 / 2000, Loss: 131133952.5, CrossEntropy: 0.007903864607214928, Accuracy: 0.9974424552429667\n",
      "Iter 342 / 2000, Loss: 129787993.34375, CrossEntropy: 0.007395951077342033, Accuracy: 0.9974904092071611\n",
      "Iter 343 / 2000, Loss: 125193387.46875, CrossEntropy: 0.005543277133256197, Accuracy: 0.9979699488491048\n",
      "Iter 344 / 2000, Loss: 128637652.0, CrossEntropy: 0.006886990275233984, Accuracy: 0.9977621483375959\n",
      "Iter 345 / 2000, Loss: 124987038.84375, CrossEntropy: 0.005428762175142765, Accuracy: 0.9979619565217391\n",
      "Iter 346 / 2000, Loss: 125177814.3125, CrossEntropy: 0.005491477902978659, Accuracy: 0.9979819373401535\n",
      "Iter 347 / 2000, Loss: 129812733.28125, CrossEntropy: 0.007337093353271484, Accuracy: 0.9978220907928389\n",
      "Iter 348 / 2000, Loss: 127372135.25, CrossEntropy: 0.006357555277645588, Accuracy: 0.9979819373401535\n",
      "Iter 349 / 2000, Loss: 127562830.3125, CrossEntropy: 0.006444063503295183, Accuracy: 0.9979699488491048\n",
      "Iter 350 / 2000, Loss: 123693773.03125, CrossEntropy: 0.0048728929832577705, Accuracy: 0.998261668797954\n",
      "Iter 351 / 2000, Loss: 128041334.96875, CrossEntropy: 0.0066537694074213505, Accuracy: 0.9980698529411764\n",
      "Iter 352 / 2000, Loss: 122968352.40625, CrossEntropy: 0.004573511891067028, Accuracy: 0.9984614769820972\n",
      "Iter 353 / 2000, Loss: 129970066.46875, CrossEntropy: 0.007361201569437981, Accuracy: 0.9977621483375959\n",
      "Iter 354 / 2000, Loss: 126444031.59375, CrossEntropy: 0.005947357043623924, Accuracy: 0.9981218030690537\n",
      "Iter 355 / 2000, Loss: 127727790.0625, CrossEntropy: 0.0064578489400446415, Accuracy: 0.9977621483375959\n",
      "Iter 356 / 2000, Loss: 126603523.59375, CrossEntropy: 0.005996836349368095, Accuracy: 0.9980218989769821\n",
      "Iter 357 / 2000, Loss: 128402994.90625, CrossEntropy: 0.006709742825478315, Accuracy: 0.9979020140664961\n",
      "Iter 358 / 2000, Loss: 125632306.65625, CrossEntropy: 0.005595613270998001, Accuracy: 0.9980019181585678\n",
      "Iter 359 / 2000, Loss: 127870997.15625, CrossEntropy: 0.00648548174649477, Accuracy: 0.9978420716112532\n",
      "Iter 360 / 2000, Loss: 132037601.96875, CrossEntropy: 0.008142208680510521, Accuracy: 0.9975623401534527\n",
      "Iter 361 / 2000, Loss: 125708020.90625, CrossEntropy: 0.005608820356428623, Accuracy: 0.9980218989769821\n",
      "Iter 362 / 2000, Loss: 127100687.375, CrossEntropy: 0.00616062618792057, Accuracy: 0.9976422634271099\n",
      "Iter 363 / 2000, Loss: 128849849.5, CrossEntropy: 0.0068589914590120316, Accuracy: 0.9973905051150895\n",
      "Iter 364 / 2000, Loss: 125527339.5625, CrossEntropy: 0.0055216639302670956, Accuracy: 0.998201726342711\n",
      "Iter 365 / 2000, Loss: 125933710.0, CrossEntropy: 0.0056719351559877396, Accuracy: 0.9979819373401535\n",
      "Iter 366 / 2000, Loss: 130911219.9375, CrossEntropy: 0.00765463151037693, Accuracy: 0.9973625319693095\n",
      "Iter 367 / 2000, Loss: 126137890.21875, CrossEntropy: 0.005738462787121534, Accuracy: 0.9980418797953964\n",
      "Iter 368 / 2000, Loss: 126418764.6875, CrossEntropy: 0.005846786312758923, Accuracy: 0.9979020140664961\n",
      "Iter 369 / 2000, Loss: 127850072.875, CrossEntropy: 0.00640924321487546, Accuracy: 0.9979419757033248\n",
      "Iter 370 / 2000, Loss: 127091215.46875, CrossEntropy: 0.006100666709244251, Accuracy: 0.9980618606138107\n",
      "Iter 371 / 2000, Loss: 124490975.75, CrossEntropy: 0.005098753143101931, Accuracy: 0.9982496803069054\n",
      "Iter 372 / 2000, Loss: 131361746.03125, CrossEntropy: 0.007793993689119816, Accuracy: 0.9972226662404092\n",
      "Iter 373 / 2000, Loss: 126333243.8125, CrossEntropy: 0.005779611878097057, Accuracy: 0.9980818414322251\n",
      "Iter 374 / 2000, Loss: 127467086.875, CrossEntropy: 0.006227046251296997, Accuracy: 0.9980019181585678\n",
      "Iter 375 / 2000, Loss: 125101332.125, CrossEntropy: 0.005273923743516207, Accuracy: 0.9983615728900256\n",
      "Iter 376 / 2000, Loss: 127140101.34375, CrossEntropy: 0.006085701286792755, Accuracy: 0.9979020140664961\n",
      "Iter 377 / 2000, Loss: 128273425.375, CrossEntropy: 0.006535853259265423, Accuracy: 0.9977821291560103\n",
      "Iter 378 / 2000, Loss: 128090853.75, CrossEntropy: 0.006456972565501928, Accuracy: 0.9978021099744245\n",
      "Iter 379 / 2000, Loss: 127445568.75, CrossEntropy: 0.006185638252645731, Accuracy: 0.9979619565217391\n",
      "Iter 380 / 2000, Loss: 126830741.65625, CrossEntropy: 0.005936021450906992, Accuracy: 0.9980418797953964\n",
      "Iter 381 / 2000, Loss: 128160715.125, CrossEntropy: 0.006467684172093868, Accuracy: 0.9978101023017903\n",
      "Iter 382 / 2000, Loss: 125463274.28125, CrossEntropy: 0.005375365726649761, Accuracy: 0.9980019181585678\n",
      "Iter 383 / 2000, Loss: 126137737.875, CrossEntropy: 0.0056482465006411076, Accuracy: 0.9983096227621484\n",
      "Iter 384 / 2000, Loss: 124714457.0625, CrossEntropy: 0.005062204319983721, Accuracy: 0.998201726342711\n",
      "Iter 385 / 2000, Loss: 129090018.03125, CrossEntropy: 0.0068040345795452595, Accuracy: 0.9979419757033248\n",
      "Iter 386 / 2000, Loss: 125503598.90625, CrossEntropy: 0.005366983357816935, Accuracy: 0.998261668797954\n",
      "Iter 387 / 2000, Loss: 123546665.46875, CrossEntropy: 0.004578000865876675, Accuracy: 0.9985414002557544\n",
      "Iter 388 / 2000, Loss: 127023905.28125, CrossEntropy: 0.005975763313472271, Accuracy: 0.9978101023017903\n",
      "Iter 389 / 2000, Loss: 127786056.0, CrossEntropy: 0.00625821016728878, Accuracy: 0.9978420716112532\n",
      "Iter 390 / 2000, Loss: 127955768.8125, CrossEntropy: 0.0063196211121976376, Accuracy: 0.9980818414322251\n",
      "Iter 391 / 2000, Loss: 125629148.90625, CrossEntropy: 0.0053840880282223225, Accuracy: 0.998261668797954\n",
      "Iter 392 / 2000, Loss: 125377012.09375, CrossEntropy: 0.005276893265545368, Accuracy: 0.998201726342711\n",
      "Iter 393 / 2000, Loss: 128646667.0, CrossEntropy: 0.006578095257282257, Accuracy: 0.9980019181585678\n",
      "Iter 394 / 2000, Loss: 127881898.21875, CrossEntropy: 0.006268588360399008, Accuracy: 0.9979219948849105\n",
      "Iter 395 / 2000, Loss: 125509991.625, CrossEntropy: 0.005311685614287853, Accuracy: 0.9982816496163683\n",
      "Iter 396 / 2000, Loss: 126074204.5625, CrossEntropy: 0.005558012053370476, Accuracy: 0.9981497762148338\n",
      "Iter 397 / 2000, Loss: 126615266.46875, CrossEntropy: 0.005743423011153936, Accuracy: 0.9981817455242967\n",
      "Iter 398 / 2000, Loss: 128411304.1875, CrossEntropy: 0.006460574921220541, Accuracy: 0.99767023657289\n",
      "Iter 399 / 2000, Loss: 124439231.4375, CrossEntropy: 0.004858546424657106, Accuracy: 0.9983615728900256\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 400 / 2000, Loss: 127781992.65625, CrossEntropy: 0.006194037850946188, Accuracy: 0.9979020140664961\n",
      "Iter 401 / 2000, Loss: 128105938.5625, CrossEntropy: 0.0063124909065663815, Accuracy: 0.9978620524296675\n",
      "Iter 402 / 2000, Loss: 130404213.59375, CrossEntropy: 0.007223429158329964, Accuracy: 0.9978021099744245\n",
      "Iter 403 / 2000, Loss: 126600271.15625, CrossEntropy: 0.00569757167249918, Accuracy: 0.9979020140664961\n",
      "Iter 404 / 2000, Loss: 125301046.0625, CrossEntropy: 0.005174572113901377, Accuracy: 0.9981817455242967\n",
      "Iter 405 / 2000, Loss: 129672860.3125, CrossEntropy: 0.006911832373589277, Accuracy: 0.9978420716112532\n",
      "Iter 406 / 2000, Loss: 127436571.78125, CrossEntropy: 0.006068980321288109, Accuracy: 0.997786125319693\n",
      "Iter 407 / 2000, Loss: 123634032.3125, CrossEntropy: 0.004486407618969679, Accuracy: 0.9984215153452686\n",
      "Iter 408 / 2000, Loss: 127992333.25, CrossEntropy: 0.0062419516034424305, Accuracy: 0.997790121483376\n",
      "Iter 409 / 2000, Loss: 128600312.25, CrossEntropy: 0.006480933632701635, Accuracy: 0.997790121483376\n",
      "Iter 410 / 2000, Loss: 126300045.34375, CrossEntropy: 0.005544792395085096, Accuracy: 0.9981897378516624\n",
      "Iter 411 / 2000, Loss: 123992479.53125, CrossEntropy: 0.004608322866261005, Accuracy: 0.9985214194373402\n",
      "Iter 412 / 2000, Loss: 128529728.6875, CrossEntropy: 0.006432336755096912, Accuracy: 0.9978300831202046\n",
      "Iter 413 / 2000, Loss: 129262513.9375, CrossEntropy: 0.006701385602355003, Accuracy: 0.9977621483375959\n",
      "Iter 414 / 2000, Loss: 126450835.65625, CrossEntropy: 0.005568411201238632, Accuracy: 0.9981018222506394\n",
      "Iter 415 / 2000, Loss: 128791622.90625, CrossEntropy: 0.0065077836625278, Accuracy: 0.9978620524296675\n",
      "Iter 416 / 2000, Loss: 127018230.84375, CrossEntropy: 0.005792400799691677, Accuracy: 0.9981297953964194\n",
      "Iter 417 / 2000, Loss: 125416145.25, CrossEntropy: 0.005142144858837128, Accuracy: 0.9982217071611253\n",
      "Iter 418 / 2000, Loss: 125446749.40625, CrossEntropy: 0.0051437062211334705, Accuracy: 0.9983415920716112\n",
      "Iter 419 / 2000, Loss: 126440553.96875, CrossEntropy: 0.0055352505296468735, Accuracy: 0.9980618606138107\n",
      "Iter 420 / 2000, Loss: 128723000.65625, CrossEntropy: 0.006440300494432449, Accuracy: 0.9976622442455243\n",
      "Iter 421 / 2000, Loss: 126406379.25, CrossEntropy: 0.005509185139089823, Accuracy: 0.9981018222506394\n",
      "Iter 422 / 2000, Loss: 125672851.53125, CrossEntropy: 0.005212554708123207, Accuracy: 0.998201726342711\n",
      "Iter 423 / 2000, Loss: 125850090.25, CrossEntropy: 0.005305581726133823, Accuracy: 0.9984694693094629\n",
      "Iter 424 / 2000, Loss: 126711266.375, CrossEntropy: 0.00561309140175581, Accuracy: 0.9982217071611253\n",
      "Iter 425 / 2000, Loss: 126276714.46875, CrossEntropy: 0.0054330588318407536, Accuracy: 0.9980618606138107\n",
      "Iter 426 / 2000, Loss: 129201633.1875, CrossEntropy: 0.006594946142286062, Accuracy: 0.9977022058823529\n",
      "Iter 427 / 2000, Loss: 126977295.4375, CrossEntropy: 0.0056997667998075485, Accuracy: 0.9981018222506394\n",
      "Iter 428 / 2000, Loss: 125309304.40625, CrossEntropy: 0.005027463659644127, Accuracy: 0.9982816496163683\n",
      "Iter 429 / 2000, Loss: 128733990.6875, CrossEntropy: 0.006437913980334997, Accuracy: 0.9976382672634272\n",
      "Iter 430 / 2000, Loss: 125655240.875, CrossEntropy: 0.005195724777877331, Accuracy: 0.998229699488491\n",
      "Iter 431 / 2000, Loss: 123413508.59375, CrossEntropy: 0.004273793660104275, Accuracy: 0.9983695652173913\n",
      "Iter 432 / 2000, Loss: 123240935.625, CrossEntropy: 0.004177364055067301, Accuracy: 0.9983615728900256\n",
      "Iter 433 / 2000, Loss: 127208456.71875, CrossEntropy: 0.005757107399404049, Accuracy: 0.9980019181585678\n",
      "Iter 434 / 2000, Loss: 131901378.25, CrossEntropy: 0.007625129073858261, Accuracy: 0.9973425511508951\n",
      "Iter 435 / 2000, Loss: 125923426.21875, CrossEntropy: 0.005230070557445288, Accuracy: 0.9982217071611253\n",
      "Iter 436 / 2000, Loss: 129611240.8125, CrossEntropy: 0.006700677797198296, Accuracy: 0.9976822250639387\n",
      "Iter 437 / 2000, Loss: 129140153.96875, CrossEntropy: 0.0065038506872951984, Accuracy: 0.9980618606138107\n",
      "Iter 438 / 2000, Loss: 127278464.28125, CrossEntropy: 0.005761229898780584, Accuracy: 0.998201726342711\n",
      "Iter 439 / 2000, Loss: 127047897.875, CrossEntropy: 0.005657108034938574, Accuracy: 0.9980418797953964\n",
      "Iter 440 / 2000, Loss: 128125121.40625, CrossEntropy: 0.006079507060348988, Accuracy: 0.9981018222506394\n",
      "Iter 441 / 2000, Loss: 126801350.71875, CrossEntropy: 0.0055445111356675625, Accuracy: 0.9980218989769821\n",
      "Iter 442 / 2000, Loss: 128745690.84375, CrossEntropy: 0.006315300706773996, Accuracy: 0.9980218989769821\n",
      "Iter 443 / 2000, Loss: 125553991.4375, CrossEntropy: 0.005034224595874548, Accuracy: 0.99838155370844\n",
      "Iter 444 / 2000, Loss: 127481836.75, CrossEntropy: 0.005798774771392345, Accuracy: 0.9980418797953964\n",
      "Iter 445 / 2000, Loss: 124430905.6875, CrossEntropy: 0.004574157763272524, Accuracy: 0.9982816496163683\n",
      "Iter 446 / 2000, Loss: 125298005.90625, CrossEntropy: 0.004967579618096352, Accuracy: 0.9983296035805627\n",
      "Iter 447 / 2000, Loss: 124067939.09375, CrossEntropy: 0.00441751629114151, Accuracy: 0.9984215153452686\n",
      "Iter 448 / 2000, Loss: 125768214.09375, CrossEntropy: 0.0050914091989398, Accuracy: 0.9983415920716112\n",
      "Iter 449 / 2000, Loss: 130547191.09375, CrossEntropy: 0.006997543387115002, Accuracy: 0.9976622442455243\n",
      "Iter 450 / 2000, Loss: 128846125.3125, CrossEntropy: 0.006323189940303564, Accuracy: 0.9980099104859336\n",
      "Iter 451 / 2000, Loss: 125813614.875, CrossEntropy: 0.005090523045510054, Accuracy: 0.9982416879795396\n",
      "Iter 452 / 2000, Loss: 126069229.09375, CrossEntropy: 0.005296754650771618, Accuracy: 0.998229699488491\n",
      "Iter 453 / 2000, Loss: 127960023.65625, CrossEntropy: 0.005935882218182087, Accuracy: 0.998141783887468\n",
      "Iter 454 / 2000, Loss: 126989653.625, CrossEntropy: 0.005544747691601515, Accuracy: 0.9980418797953964\n",
      "Iter 455 / 2000, Loss: 128843619.75, CrossEntropy: 0.006282296497374773, Accuracy: 0.9979219948849105\n",
      "Iter 456 / 2000, Loss: 126969389.5625, CrossEntropy: 0.005524771753698587, Accuracy: 0.9981617647058824\n",
      "Iter 457 / 2000, Loss: 125516020.28125, CrossEntropy: 0.004938625730574131, Accuracy: 0.9984015345268542\n",
      "Iter 458 / 2000, Loss: 126696804.46875, CrossEntropy: 0.005427811294794083, Accuracy: 0.9983375959079285\n",
      "Iter 459 / 2000, Loss: 126212193.96875, CrossEntropy: 0.00520332669839263, Accuracy: 0.9984215153452686\n",
      "Iter 460 / 2000, Loss: 126687659.21875, CrossEntropy: 0.00538856303319335, Accuracy: 0.998201726342711\n",
      "Iter 461 / 2000, Loss: 128764040.0, CrossEntropy: 0.0062315622344613075, Accuracy: 0.9979499680306906\n",
      "Iter 462 / 2000, Loss: 127230555.09375, CrossEntropy: 0.0055946651846170425, Accuracy: 0.9981218030690537\n",
      "Iter 463 / 2000, Loss: 126765575.09375, CrossEntropy: 0.0054007843136787415, Accuracy: 0.9985014386189258\n",
      "Iter 464 / 2000, Loss: 124990094.15625, CrossEntropy: 0.004706294741481543, Accuracy: 0.9982696611253197\n",
      "Iter 465 / 2000, Loss: 124025315.0625, CrossEntropy: 0.004297106061130762, Accuracy: 0.9986213235294118\n",
      "Iter 466 / 2000, Loss: 125457158.84375, CrossEntropy: 0.00486451992765069, Accuracy: 0.998321611253197\n",
      "Iter 467 / 2000, Loss: 126551412.0, CrossEntropy: 0.005314547568559647, Accuracy: 0.9983895460358057\n",
      "Iter 468 / 2000, Loss: 124571416.15625, CrossEntropy: 0.004498851951211691, Accuracy: 0.9984814578005116\n",
      "Iter 469 / 2000, Loss: 126856469.625, CrossEntropy: 0.005402932874858379, Accuracy: 0.9980618606138107\n",
      "Iter 470 / 2000, Loss: 128656988.03125, CrossEntropy: 0.006156452931463718, Accuracy: 0.9979499680306906\n",
      "Iter 471 / 2000, Loss: 123471206.46875, CrossEntropy: 0.004039142280817032, Accuracy: 0.9987212276214834\n",
      "Iter 472 / 2000, Loss: 128280534.53125, CrossEntropy: 0.005956306587904692, Accuracy: 0.9980019181585678\n",
      "Iter 473 / 2000, Loss: 124252079.1875, CrossEntropy: 0.004342294298112392, Accuracy: 0.9986812659846548\n",
      "Iter 474 / 2000, Loss: 125395738.3125, CrossEntropy: 0.004791663959622383, Accuracy: 0.9984414961636828\n",
      "Iter 475 / 2000, Loss: 127175047.46875, CrossEntropy: 0.005496473051607609, Accuracy: 0.9980418797953964\n",
      "Iter 476 / 2000, Loss: 126087623.65625, CrossEntropy: 0.005056016147136688, Accuracy: 0.9981617647058824\n",
      "Iter 477 / 2000, Loss: 125511371.03125, CrossEntropy: 0.004819713998585939, Accuracy: 0.998321611253197\n",
      "Iter 478 / 2000, Loss: 128221944.125, CrossEntropy: 0.005896976683288813, Accuracy: 0.9981018222506394\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 479 / 2000, Loss: 126307168.4375, CrossEntropy: 0.005126962438225746, Accuracy: 0.9982816496163683\n",
      "Iter 480 / 2000, Loss: 126073132.875, CrossEntropy: 0.005054300185292959, Accuracy: 0.9981697570332481\n",
      "Iter 481 / 2000, Loss: 127524509.875, CrossEntropy: 0.00560176745057106, Accuracy: 0.9983016304347826\n",
      "Iter 482 / 2000, Loss: 123048457.5625, CrossEntropy: 0.003819049336016178, Accuracy: 0.9984295076726343\n",
      "Iter 483 / 2000, Loss: 126786610.34375, CrossEntropy: 0.0052956449799239635, Accuracy: 0.9981018222506394\n",
      "Iter 484 / 2000, Loss: 127307703.4375, CrossEntropy: 0.005497869104146957, Accuracy: 0.998141783887468\n",
      "Iter 485 / 2000, Loss: 124382579.96875, CrossEntropy: 0.004324048757553101, Accuracy: 0.9986013427109974\n",
      "Iter 486 / 2000, Loss: 127318503.21875, CrossEntropy: 0.005493671167641878, Accuracy: 0.9980618606138107\n",
      "Iter 487 / 2000, Loss: 124602222.9375, CrossEntropy: 0.004400414414703846, Accuracy: 0.9982816496163683\n",
      "Iter 488 / 2000, Loss: 129268136.875, CrossEntropy: 0.006262519396841526, Accuracy: 0.9980818414322251\n",
      "Iter 489 / 2000, Loss: 125357657.46875, CrossEntropy: 0.004697308409959078, Accuracy: 0.9985014386189258\n",
      "Iter 490 / 2000, Loss: 126873520.3125, CrossEntropy: 0.005292088724672794, Accuracy: 0.9981817455242967\n",
      "Iter 491 / 2000, Loss: 124590574.34375, CrossEntropy: 0.004373028874397278, Accuracy: 0.9985613810741688\n",
      "Iter 492 / 2000, Loss: 126189338.25, CrossEntropy: 0.005005632061511278, Accuracy: 0.998261668797954\n",
      "Iter 493 / 2000, Loss: 125618530.625, CrossEntropy: 0.004784366115927696, Accuracy: 0.998349584398977\n",
      "Iter 494 / 2000, Loss: 126330811.84375, CrossEntropy: 0.005050089210271835, Accuracy: 0.9982816496163683\n",
      "Iter 495 / 2000, Loss: 124053687.0625, CrossEntropy: 0.004135235212743282, Accuracy: 0.998641304347826\n",
      "Iter 496 / 2000, Loss: 127480223.25, CrossEntropy: 0.005498634185642004, Accuracy: 0.9981617647058824\n",
      "Iter 497 / 2000, Loss: 128111827.5, CrossEntropy: 0.005745710805058479, Accuracy: 0.9979619565217391\n",
      "Iter 498 / 2000, Loss: 122626752.0625, CrossEntropy: 0.0035485131666064262, Accuracy: 0.9986013427109974\n",
      "Iter 499 / 2000, Loss: 126933363.84375, CrossEntropy: 0.005265115760266781, Accuracy: 0.9985613810741688\n",
      "Iter 500 / 2000, Loss: 125384300.71875, CrossEntropy: 0.004666561726480722, Accuracy: 0.9984295076726343\n",
      "Iter 501 / 2000, Loss: 128383695.90625, CrossEntropy: 0.005835938733071089, Accuracy: 0.9980218989769821\n",
      "Iter 502 / 2000, Loss: 122027522.0, CrossEntropy: 0.0032881535589694977, Accuracy: 0.9989010549872123\n",
      "Iter 503 / 2000, Loss: 128083542.53125, CrossEntropy: 0.005701575428247452, Accuracy: 0.9981617647058824\n",
      "Iter 504 / 2000, Loss: 125091912.8125, CrossEntropy: 0.004500953946262598, Accuracy: 0.9984215153452686\n",
      "Iter 505 / 2000, Loss: 128200058.40625, CrossEntropy: 0.005738408770412207, Accuracy: 0.9979020140664961\n",
      "Iter 506 / 2000, Loss: 124809476.28125, CrossEntropy: 0.004385299980640411, Accuracy: 0.9985014386189258\n",
      "Iter 507 / 2000, Loss: 129979405.78125, CrossEntropy: 0.006437588017433882, Accuracy: 0.9979819373401535\n",
      "Iter 508 / 2000, Loss: 123263815.0, CrossEntropy: 0.003748709335923195, Accuracy: 0.9987811700767263\n",
      "Iter 509 / 2000, Loss: 130023377.875, CrossEntropy: 0.006443183869123459, Accuracy: 0.998141783887468\n",
      "Iter 510 / 2000, Loss: 126543230.625, CrossEntropy: 0.005046405829489231, Accuracy: 0.9985014386189258\n",
      "Iter 511 / 2000, Loss: 129787779.53125, CrossEntropy: 0.006338386330753565, Accuracy: 0.9979819373401535\n",
      "Iter 512 / 2000, Loss: 126310892.375, CrossEntropy: 0.004945010878145695, Accuracy: 0.9983615728900256\n",
      "Iter 513 / 2000, Loss: 126467866.0, CrossEntropy: 0.005009095184504986, Accuracy: 0.9984694693094629\n",
      "Iter 514 / 2000, Loss: 127500723.65625, CrossEntropy: 0.005458479281514883, Accuracy: 0.9981897378516624\n",
      "Iter 515 / 2000, Loss: 124335657.875, CrossEntropy: 0.004136516246944666, Accuracy: 0.998701246803069\n",
      "Iter 516 / 2000, Loss: 126427909.625, CrossEntropy: 0.004967804998159409, Accuracy: 0.9984015345268542\n",
      "Iter 517 / 2000, Loss: 127233399.15625, CrossEntropy: 0.005283795762807131, Accuracy: 0.998201726342711\n",
      "Iter 518 / 2000, Loss: 126561046.25, CrossEntropy: 0.005011492874473333, Accuracy: 0.9984614769820972\n",
      "Iter 519 / 2000, Loss: 127128829.15625, CrossEntropy: 0.005232539027929306, Accuracy: 0.9982217071611253\n",
      "Iter 520 / 2000, Loss: 129570674.6875, CrossEntropy: 0.0062017543241381645, Accuracy: 0.9981018222506394\n",
      "Iter 521 / 2000, Loss: 124266830.6875, CrossEntropy: 0.00407812325283885, Accuracy: 0.9985414002557544\n",
      "Iter 522 / 2000, Loss: 125539065.0625, CrossEntropy: 0.004579988773912191, Accuracy: 0.9986812659846548\n",
      "Iter 523 / 2000, Loss: 124686003.03125, CrossEntropy: 0.0042342194356024265, Accuracy: 0.9985613810741688\n",
      "Iter 524 / 2000, Loss: 127788662.53125, CrossEntropy: 0.0054856594651937485, Accuracy: 0.9980099104859336\n",
      "Iter 525 / 2000, Loss: 125628579.15625, CrossEntropy: 0.004598652943968773, Accuracy: 0.9984814578005116\n",
      "Iter 526 / 2000, Loss: 124975508.84375, CrossEntropy: 0.004338380880653858, Accuracy: 0.9986812659846548\n",
      "Iter 527 / 2000, Loss: 129440961.59375, CrossEntropy: 0.00611129542812705, Accuracy: 0.9980218989769821\n",
      "Iter 528 / 2000, Loss: 126383659.65625, CrossEntropy: 0.004890224896371365, Accuracy: 0.9983415920716112\n",
      "Iter 529 / 2000, Loss: 123848035.09375, CrossEntropy: 0.0038650506176054478, Accuracy: 0.9986612851662404\n",
      "Iter 530 / 2000, Loss: 123818112.03125, CrossEntropy: 0.00384855386801064, Accuracy: 0.9986013427109974\n",
      "Iter 531 / 2000, Loss: 127513727.75, CrossEntropy: 0.005319246556609869, Accuracy: 0.998261668797954\n",
      "Iter 532 / 2000, Loss: 128007430.40625, CrossEntropy: 0.005512123927474022, Accuracy: 0.9979419757033248\n",
      "Iter 533 / 2000, Loss: 124930272.125, CrossEntropy: 0.004275300074368715, Accuracy: 0.9984015345268542\n",
      "Iter 534 / 2000, Loss: 127081844.5625, CrossEntropy: 0.005129612982273102, Accuracy: 0.9981218030690537\n",
      "Iter 535 / 2000, Loss: 125042844.28125, CrossEntropy: 0.004308921284973621, Accuracy: 0.9986013427109974\n",
      "Iter 536 / 2000, Loss: 126585231.46875, CrossEntropy: 0.004920481238514185, Accuracy: 0.9983615728900256\n",
      "Iter 537 / 2000, Loss: 125387144.21875, CrossEntropy: 0.00443722540512681, Accuracy: 0.9987412084398977\n",
      "Iter 538 / 2000, Loss: 126226302.34375, CrossEntropy: 0.004766364581882954, Accuracy: 0.9984614769820972\n",
      "Iter 539 / 2000, Loss: 126881277.78125, CrossEntropy: 0.005022738594561815, Accuracy: 0.9982416879795396\n",
      "Iter 540 / 2000, Loss: 126548199.0, CrossEntropy: 0.004884207621216774, Accuracy: 0.9984614769820972\n",
      "Iter 541 / 2000, Loss: 126194345.5, CrossEntropy: 0.004738352261483669, Accuracy: 0.998701246803069\n",
      "Iter 542 / 2000, Loss: 125410089.875, CrossEntropy: 0.004419348668307066, Accuracy: 0.998641304347826\n",
      "Iter 543 / 2000, Loss: 128110304.8125, CrossEntropy: 0.005492789670825005, Accuracy: 0.998261668797954\n",
      "Iter 544 / 2000, Loss: 126451382.5625, CrossEntropy: 0.00482397573068738, Accuracy: 0.9985613810741688\n",
      "Iter 545 / 2000, Loss: 126524616.09375, CrossEntropy: 0.004849538207054138, Accuracy: 0.998321611253197\n",
      "Iter 546 / 2000, Loss: 127587101.34375, CrossEntropy: 0.00526784872636199, Accuracy: 0.9981817455242967\n",
      "Iter 547 / 2000, Loss: 126686169.8125, CrossEntropy: 0.0049019125290215015, Accuracy: 0.9985214194373402\n",
      "Iter 548 / 2000, Loss: 126271234.5625, CrossEntropy: 0.00473072798922658, Accuracy: 0.9984215153452686\n",
      "Iter 549 / 2000, Loss: 123176783.75, CrossEntropy: 0.0034905129577964544, Accuracy: 0.9987811700767263\n",
      "Iter 550 / 2000, Loss: 126748458.1875, CrossEntropy: 0.004910553805530071, Accuracy: 0.9983615728900256\n",
      "Iter 551 / 2000, Loss: 125842769.65625, CrossEntropy: 0.004559810739010572, Accuracy: 0.9984694693094629\n",
      "Iter 552 / 2000, Loss: 125617765.78125, CrossEntropy: 0.004448324907571077, Accuracy: 0.9986213235294118\n",
      "Iter 553 / 2000, Loss: 124436269.6875, CrossEntropy: 0.003971691709011793, Accuracy: 0.998641304347826\n",
      "Iter 554 / 2000, Loss: 127342352.75, CrossEntropy: 0.005176811013370752, Accuracy: 0.998289641943734\n",
      "Iter 555 / 2000, Loss: 124815091.09375, CrossEntropy: 0.0041370210237801075, Accuracy: 0.9987691815856777\n",
      "Iter 556 / 2000, Loss: 127125679.46875, CrossEntropy: 0.005036534741520882, Accuracy: 0.9982416879795396\n",
      "Iter 557 / 2000, Loss: 123454201.28125, CrossEntropy: 0.0035573849454522133, Accuracy: 0.9986213235294118\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 558 / 2000, Loss: 124478072.40625, CrossEntropy: 0.003961860202252865, Accuracy: 0.9986812659846548\n",
      "Iter 559 / 2000, Loss: 131341758.75, CrossEntropy: 0.006699005141854286, Accuracy: 0.9981018222506394\n",
      "Iter 560 / 2000, Loss: 126890192.75, CrossEntropy: 0.004914476536214352, Accuracy: 0.9984015345268542\n",
      "Iter 561 / 2000, Loss: 127364188.53125, CrossEntropy: 0.005099470727145672, Accuracy: 0.9983615728900256\n",
      "Iter 562 / 2000, Loss: 127448984.1875, CrossEntropy: 0.00512884883210063, Accuracy: 0.9984814578005116\n",
      "Iter 563 / 2000, Loss: 127821093.9375, CrossEntropy: 0.005292982794344425, Accuracy: 0.9983096227621484\n",
      "Iter 564 / 2000, Loss: 126305248.625, CrossEntropy: 0.004689032211899757, Accuracy: 0.9983096227621484\n",
      "Iter 565 / 2000, Loss: 127276631.3125, CrossEntropy: 0.005043043754994869, Accuracy: 0.998261668797954\n",
      "Iter 566 / 2000, Loss: 125325468.34375, CrossEntropy: 0.004322207998484373, Accuracy: 0.998617327365729\n",
      "Iter 567 / 2000, Loss: 127482103.96875, CrossEntropy: 0.005114630796015263, Accuracy: 0.9985214194373402\n",
      "Iter 568 / 2000, Loss: 125949836.3125, CrossEntropy: 0.00449947826564312, Accuracy: 0.998641304347826\n",
      "Iter 569 / 2000, Loss: 124963694.8125, CrossEntropy: 0.004097841214388609, Accuracy: 0.9986213235294118\n",
      "Iter 570 / 2000, Loss: 130418116.96875, CrossEntropy: 0.006316803395748138, Accuracy: 0.9981297953964194\n",
      "Iter 571 / 2000, Loss: 124376676.40625, CrossEntropy: 0.0038529164157807827, Accuracy: 0.9986013427109974\n",
      "Iter 572 / 2000, Loss: 125494208.15625, CrossEntropy: 0.004294674843549728, Accuracy: 0.9986812659846548\n",
      "Iter 573 / 2000, Loss: 129673647.25, CrossEntropy: 0.00599024910479784, Accuracy: 0.9980698529411764\n",
      "Iter 574 / 2000, Loss: 124367434.875, CrossEntropy: 0.003841965924948454, Accuracy: 0.9989410166240409\n",
      "Iter 575 / 2000, Loss: 124263898.6875, CrossEntropy: 0.0038629991468042135, Accuracy: 0.9986373081841433\n",
      "Iter 576 / 2000, Loss: 128920924.75, CrossEntropy: 0.005642613861709833, Accuracy: 0.9982816496163683\n",
      "Iter 577 / 2000, Loss: 125704602.5625, CrossEntropy: 0.004353011958301067, Accuracy: 0.998821131713555\n",
      "Iter 578 / 2000, Loss: 124718378.96875, CrossEntropy: 0.003953456878662109, Accuracy: 0.9986213235294118\n",
      "Iter 579 / 2000, Loss: 126024161.75, CrossEntropy: 0.004496055189520121, Accuracy: 0.9984694693094629\n",
      "Iter 580 / 2000, Loss: 125163422.25, CrossEntropy: 0.004121182486414909, Accuracy: 0.9986013427109974\n",
      "Iter 581 / 2000, Loss: 127402834.03125, CrossEntropy: 0.0050776260904967785, Accuracy: 0.9983695652173913\n",
      "Iter 582 / 2000, Loss: 126495220.4375, CrossEntropy: 0.004644540138542652, Accuracy: 0.9984614769820972\n",
      "Iter 583 / 2000, Loss: 125199626.8125, CrossEntropy: 0.004121971316635609, Accuracy: 0.99838155370844\n",
      "Iter 584 / 2000, Loss: 127342068.71875, CrossEntropy: 0.004971739370375872, Accuracy: 0.9984015345268542\n",
      "Iter 585 / 2000, Loss: 124779967.375, CrossEntropy: 0.003943432588130236, Accuracy: 0.998641304347826\n",
      "Iter 586 / 2000, Loss: 126907715.09375, CrossEntropy: 0.004797478672116995, Accuracy: 0.99840952685422\n",
      "Iter 587 / 2000, Loss: 124684159.09375, CrossEntropy: 0.0038951041642576456, Accuracy: 0.9985414002557544\n",
      "Iter 588 / 2000, Loss: 124490642.46875, CrossEntropy: 0.003840389661490917, Accuracy: 0.9987492007672635\n",
      "Iter 589 / 2000, Loss: 126814319.40625, CrossEntropy: 0.004737361799925566, Accuracy: 0.9985414002557544\n",
      "Iter 590 / 2000, Loss: 128675681.21875, CrossEntropy: 0.005485003348439932, Accuracy: 0.9980818414322251\n",
      "Iter 591 / 2000, Loss: 126305033.53125, CrossEntropy: 0.004524285439401865, Accuracy: 0.9985214194373402\n",
      "Iter 592 / 2000, Loss: 126477118.375, CrossEntropy: 0.004586267285048962, Accuracy: 0.9984414961636828\n",
      "Iter 593 / 2000, Loss: 126938312.46875, CrossEntropy: 0.004765992518514395, Accuracy: 0.9985613810741688\n",
      "Iter 594 / 2000, Loss: 127232532.375, CrossEntropy: 0.004877092316746712, Accuracy: 0.9984614769820972\n",
      "Iter 595 / 2000, Loss: 125280184.09375, CrossEntropy: 0.004091794602572918, Accuracy: 0.998701246803069\n",
      "Iter 596 / 2000, Loss: 127005584.6875, CrossEntropy: 0.004779748152941465, Accuracy: 0.9984414961636828\n",
      "Iter 597 / 2000, Loss: 125679212.625, CrossEntropy: 0.004242207854986191, Accuracy: 0.9985214194373402\n",
      "Iter 598 / 2000, Loss: 127547293.9375, CrossEntropy: 0.004994410090148449, Accuracy: 0.9983895460358057\n",
      "Iter 599 / 2000, Loss: 124070694.59375, CrossEntropy: 0.0036185041535645723, Accuracy: 0.9988091432225065\n",
      "Iter 600 / 2000, Loss: 126459984.0625, CrossEntropy: 0.004544353112578392, Accuracy: 0.9985414002557544\n",
      "Iter 601 / 2000, Loss: 127796204.03125, CrossEntropy: 0.00506643345579505, Accuracy: 0.9984015345268542\n",
      "Iter 602 / 2000, Loss: 125006465.3125, CrossEntropy: 0.00394681328907609, Accuracy: 0.9987212276214834\n",
      "Iter 603 / 2000, Loss: 126368655.53125, CrossEntropy: 0.004486253950744867, Accuracy: 0.9985414002557544\n",
      "Iter 604 / 2000, Loss: 126223905.84375, CrossEntropy: 0.004464804194867611, Accuracy: 0.9985893542199489\n",
      "Iter 605 / 2000, Loss: 124684139.90625, CrossEntropy: 0.003803204046562314, Accuracy: 0.9987212276214834\n",
      "Iter 606 / 2000, Loss: 125770898.09375, CrossEntropy: 0.004233142826706171, Accuracy: 0.9987811700767263\n",
      "Iter 607 / 2000, Loss: 126733810.0, CrossEntropy: 0.00461215665563941, Accuracy: 0.9985214194373402\n",
      "Iter 608 / 2000, Loss: 124619186.40625, CrossEntropy: 0.00376279279589653, Accuracy: 0.9986213235294118\n",
      "Iter 609 / 2000, Loss: 124112343.21875, CrossEntropy: 0.0035562566481530666, Accuracy: 0.9987412084398977\n",
      "Iter 610 / 2000, Loss: 126901826.34375, CrossEntropy: 0.004666390363126993, Accuracy: 0.9984015345268542\n",
      "Iter 611 / 2000, Loss: 125668017.15625, CrossEntropy: 0.004167474806308746, Accuracy: 0.998701246803069\n",
      "Iter 612 / 2000, Loss: 129025058.375, CrossEntropy: 0.005504612345248461, Accuracy: 0.998261668797954\n",
      "Iter 613 / 2000, Loss: 124053588.84375, CrossEntropy: 0.0035132949706166983, Accuracy: 0.9988610933503836\n",
      "Iter 614 / 2000, Loss: 124474558.71875, CrossEntropy: 0.0036763797979801893, Accuracy: 0.9990009590792839\n",
      "Iter 615 / 2000, Loss: 130218251.65625, CrossEntropy: 0.0059683481231331825, Accuracy: 0.998141783887468\n",
      "Iter 616 / 2000, Loss: 125428669.09375, CrossEntropy: 0.00404773373156786, Accuracy: 0.9986812659846548\n",
      "Iter 617 / 2000, Loss: 125587439.875, CrossEntropy: 0.0041063702665269375, Accuracy: 0.9985813618925832\n",
      "Iter 618 / 2000, Loss: 125290193.5625, CrossEntropy: 0.003982879221439362, Accuracy: 0.9988610933503836\n",
      "Iter 619 / 2000, Loss: 126315188.875, CrossEntropy: 0.00438902759924531, Accuracy: 0.9984614769820972\n",
      "Iter 620 / 2000, Loss: 125347715.21875, CrossEntropy: 0.004014418926090002, Accuracy: 0.9986093350383632\n",
      "Iter 621 / 2000, Loss: 126011940.875, CrossEntropy: 0.004255978856235743, Accuracy: 0.9986013427109974\n",
      "Iter 622 / 2000, Loss: 126406361.21875, CrossEntropy: 0.004415118135511875, Accuracy: 0.9985613810741688\n",
      "Iter 623 / 2000, Loss: 125079584.78125, CrossEntropy: 0.003921779338270426, Accuracy: 0.9988091432225065\n",
      "Iter 624 / 2000, Loss: 125720969.78125, CrossEntropy: 0.004126184619963169, Accuracy: 0.998641304347826\n",
      "Iter 625 / 2000, Loss: 127025070.875, CrossEntropy: 0.004643185064196587, Accuracy: 0.9984215153452686\n",
      "Iter 626 / 2000, Loss: 126997373.96875, CrossEntropy: 0.0046269516460597515, Accuracy: 0.9984614769820972\n",
      "Iter 627 / 2000, Loss: 126713652.1875, CrossEntropy: 0.00451196264475584, Accuracy: 0.9981817455242967\n",
      "Iter 628 / 2000, Loss: 126525468.6875, CrossEntropy: 0.004429022781550884, Accuracy: 0.9987412084398977\n",
      "Iter 629 / 2000, Loss: 126215017.71875, CrossEntropy: 0.0042997440323233604, Accuracy: 0.9985214194373402\n",
      "Iter 630 / 2000, Loss: 126862083.0, CrossEntropy: 0.004553818143904209, Accuracy: 0.9985613810741688\n",
      "Iter 631 / 2000, Loss: 126022218.46875, CrossEntropy: 0.004214976914227009, Accuracy: 0.99838155370844\n",
      "Iter 632 / 2000, Loss: 127357138.4375, CrossEntropy: 0.004741596523672342, Accuracy: 0.9984015345268542\n",
      "Iter 633 / 2000, Loss: 125884938.125, CrossEntropy: 0.004229424521327019, Accuracy: 0.998617327365729\n",
      "Iter 634 / 2000, Loss: 126020409.96875, CrossEntropy: 0.004196367226541042, Accuracy: 0.998761189258312\n",
      "Iter 635 / 2000, Loss: 127246478.4375, CrossEntropy: 0.0046821641735732555, Accuracy: 0.9984015345268542\n",
      "Iter 636 / 2000, Loss: 124695239.6875, CrossEntropy: 0.0036574651021510363, Accuracy: 0.9986213235294118\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 637 / 2000, Loss: 125256249.4375, CrossEntropy: 0.0038826174568384886, Accuracy: 0.9988011508951407\n",
      "Iter 638 / 2000, Loss: 126519320.3125, CrossEntropy: 0.004376781638711691, Accuracy: 0.9986213235294118\n",
      "Iter 639 / 2000, Loss: 129263471.0, CrossEntropy: 0.005467970389872789, Accuracy: 0.9983615728900256\n",
      "Iter 640 / 2000, Loss: 126938967.40625, CrossEntropy: 0.004534845240414143, Accuracy: 0.998701246803069\n",
      "Iter 641 / 2000, Loss: 125539605.3125, CrossEntropy: 0.003971392288804054, Accuracy: 0.9987212276214834\n",
      "Iter 642 / 2000, Loss: 129362764.46875, CrossEntropy: 0.0055023180320858955, Accuracy: 0.998349584398977\n",
      "Iter 643 / 2000, Loss: 125807372.65625, CrossEntropy: 0.004068497568368912, Accuracy: 0.998761189258312\n",
      "Iter 644 / 2000, Loss: 127444076.125, CrossEntropy: 0.004716977011412382, Accuracy: 0.9985414002557544\n",
      "Iter 645 / 2000, Loss: 126276055.84375, CrossEntropy: 0.004259483888745308, Accuracy: 0.9986492966751919\n",
      "Iter 646 / 2000, Loss: 128022453.34375, CrossEntropy: 0.004938721191138029, Accuracy: 0.9985414002557544\n",
      "Iter 647 / 2000, Loss: 129304729.75, CrossEntropy: 0.00544627895578742, Accuracy: 0.9983415920716112\n",
      "Iter 648 / 2000, Loss: 125002553.375, CrossEntropy: 0.0037220853846520185, Accuracy: 0.9987412084398977\n",
      "Iter 649 / 2000, Loss: 123833282.40625, CrossEntropy: 0.003256449243053794, Accuracy: 0.9988610933503836\n",
      "Iter 650 / 2000, Loss: 125207526.0, CrossEntropy: 0.003796095959842205, Accuracy: 0.9986213235294118\n",
      "Iter 651 / 2000, Loss: 125860547.46875, CrossEntropy: 0.004050564952194691, Accuracy: 0.9985014386189258\n",
      "Iter 652 / 2000, Loss: 125831641.625, CrossEntropy: 0.0040352558717131615, Accuracy: 0.9987412084398977\n",
      "Iter 653 / 2000, Loss: 126715700.78125, CrossEntropy: 0.0043833437375724316, Accuracy: 0.9986612851662404\n",
      "Iter 654 / 2000, Loss: 124903391.25, CrossEntropy: 0.0036552082747220993, Accuracy: 0.9988411125319693\n",
      "Iter 655 / 2000, Loss: 126558956.25, CrossEntropy: 0.004391956143081188, Accuracy: 0.9985094309462916\n",
      "Iter 656 / 2000, Loss: 124920173.375, CrossEntropy: 0.0036519095301628113, Accuracy: 0.9988810741687979\n",
      "Iter 657 / 2000, Loss: 125417159.09375, CrossEntropy: 0.0038461301010102034, Accuracy: 0.9987212276214834\n",
      "Iter 658 / 2000, Loss: 124588053.8125, CrossEntropy: 0.0035127245355397463, Accuracy: 0.9987412084398977\n",
      "Iter 659 / 2000, Loss: 127002221.53125, CrossEntropy: 0.004469929728657007, Accuracy: 0.9983415920716112\n",
      "Iter 660 / 2000, Loss: 123974582.75, CrossEntropy: 0.0032561318948864937, Accuracy: 0.9986612851662404\n",
      "Iter 661 / 2000, Loss: 125072854.09375, CrossEntropy: 0.0036911331117153168, Accuracy: 0.998761189258312\n",
      "Iter 662 / 2000, Loss: 126501361.40625, CrossEntropy: 0.004258288536220789, Accuracy: 0.9988411125319693\n",
      "Iter 663 / 2000, Loss: 127489789.28125, CrossEntropy: 0.004647227469831705, Accuracy: 0.9983615728900256\n",
      "Iter 664 / 2000, Loss: 126897877.90625, CrossEntropy: 0.004407181870192289, Accuracy: 0.9987212276214834\n",
      "Iter 665 / 2000, Loss: 128058665.78125, CrossEntropy: 0.004865548107773066, Accuracy: 0.9984414961636828\n",
      "Iter 666 / 2000, Loss: 126928497.3125, CrossEntropy: 0.004408661276102066, Accuracy: 0.9985214194373402\n",
      "Iter 667 / 2000, Loss: 125301773.125, CrossEntropy: 0.0037553037982434034, Accuracy: 0.998641304347826\n",
      "Iter 668 / 2000, Loss: 124362406.3125, CrossEntropy: 0.003375112544745207, Accuracy: 0.9988610933503836\n",
      "Iter 669 / 2000, Loss: 128693243.6875, CrossEntropy: 0.005099436268210411, Accuracy: 0.9983016304347826\n",
      "Iter 670 / 2000, Loss: 125972687.0625, CrossEntropy: 0.004009679891169071, Accuracy: 0.9985613810741688\n",
      "Iter 671 / 2000, Loss: 126066693.46875, CrossEntropy: 0.004147381987422705, Accuracy: 0.9986492966751919\n",
      "Iter 672 / 2000, Loss: 126849771.0, CrossEntropy: 0.004349302034825087, Accuracy: 0.9985414002557544\n",
      "Iter 673 / 2000, Loss: 126196454.53125, CrossEntropy: 0.004083344247192144, Accuracy: 0.9986213235294118\n",
      "Iter 674 / 2000, Loss: 126675536.53125, CrossEntropy: 0.004330122377723455, Accuracy: 0.9984494884910486\n",
      "Iter 675 / 2000, Loss: 130064450.09375, CrossEntropy: 0.005642987322062254, Accuracy: 0.998349584398977\n",
      "Iter 676 / 2000, Loss: 124890644.4375, CrossEntropy: 0.003631831146776676, Accuracy: 0.9987372122762149\n",
      "Iter 677 / 2000, Loss: 127228961.40625, CrossEntropy: 0.0044769844971597195, Accuracy: 0.9986213235294118\n",
      "Iter 678 / 2000, Loss: 127345009.5625, CrossEntropy: 0.004597228951752186, Accuracy: 0.9985374040920717\n",
      "Iter 679 / 2000, Loss: 124903027.125, CrossEntropy: 0.003538735443726182, Accuracy: 0.9987811700767263\n",
      "Iter 680 / 2000, Loss: 125697811.5, CrossEntropy: 0.003857898758724332, Accuracy: 0.9987412084398977\n",
      "Iter 681 / 2000, Loss: 128222556.28125, CrossEntropy: 0.0048597087152302265, Accuracy: 0.9984414961636828\n",
      "Iter 682 / 2000, Loss: 125363114.59375, CrossEntropy: 0.0037219442892819643, Accuracy: 0.9987292199488491\n",
      "Iter 683 / 2000, Loss: 125796379.9375, CrossEntropy: 0.00387581717222929, Accuracy: 0.998701246803069\n",
      "Iter 684 / 2000, Loss: 124235192.34375, CrossEntropy: 0.0032477464992552996, Accuracy: 0.9990009590792839\n",
      "Iter 685 / 2000, Loss: 127859605.15625, CrossEntropy: 0.004713403061032295, Accuracy: 0.9985693734015345\n",
      "Iter 686 / 2000, Loss: 126167956.5, CrossEntropy: 0.00401089433580637, Accuracy: 0.9985813618925832\n",
      "Iter 687 / 2000, Loss: 125879044.71875, CrossEntropy: 0.003890544641762972, Accuracy: 0.9987212276214834\n",
      "Iter 688 / 2000, Loss: 126547107.5, CrossEntropy: 0.0041540698148310184, Accuracy: 0.9987811700767263\n",
      "Iter 689 / 2000, Loss: 126671258.46875, CrossEntropy: 0.004198197741061449, Accuracy: 0.9986612851662404\n",
      "Iter 690 / 2000, Loss: 125979157.5625, CrossEntropy: 0.003921471070498228, Accuracy: 0.9987212276214834\n",
      "Iter 691 / 2000, Loss: 126096145.15625, CrossEntropy: 0.003959137015044689, Accuracy: 0.9987212276214834\n",
      "Iter 692 / 2000, Loss: 125579097.125, CrossEntropy: 0.003747955895960331, Accuracy: 0.998821131713555\n",
      "Iter 693 / 2000, Loss: 127329986.875, CrossEntropy: 0.0044447616674005985, Accuracy: 0.9984614769820972\n",
      "Iter 694 / 2000, Loss: 128747290.71875, CrossEntropy: 0.005004246719181538, Accuracy: 0.9984814578005116\n",
      "Iter 695 / 2000, Loss: 129388189.15625, CrossEntropy: 0.005254887975752354, Accuracy: 0.9984015345268542\n",
      "Iter 696 / 2000, Loss: 125137716.28125, CrossEntropy: 0.0036123183090239763, Accuracy: 0.9989090473145781\n",
      "Iter 697 / 2000, Loss: 126190651.28125, CrossEntropy: 0.004002497531473637, Accuracy: 0.9985693734015345\n",
      "Iter 698 / 2000, Loss: 125359181.1875, CrossEntropy: 0.0036318846978247166, Accuracy: 0.998761189258312\n",
      "Iter 699 / 2000, Loss: 127379957.125, CrossEntropy: 0.0044344221241772175, Accuracy: 0.9985813618925832\n",
      "Iter 700 / 2000, Loss: 127655654.5625, CrossEntropy: 0.004559764638543129, Accuracy: 0.9986892583120205\n",
      "Iter 701 / 2000, Loss: 124909092.0, CrossEntropy: 0.00344644021242857, Accuracy: 0.9987691815856777\n",
      "Iter 702 / 2000, Loss: 127365406.3125, CrossEntropy: 0.004415537230670452, Accuracy: 0.9987811700767263\n",
      "Iter 703 / 2000, Loss: 127452161.8125, CrossEntropy: 0.004445320926606655, Accuracy: 0.9985813618925832\n",
      "Iter 704 / 2000, Loss: 124995169.0, CrossEntropy: 0.0034684999845921993, Accuracy: 0.9987492007672635\n",
      "Iter 705 / 2000, Loss: 126819903.5625, CrossEntropy: 0.004183440003544092, Accuracy: 0.9987412084398977\n",
      "Iter 706 / 2000, Loss: 127103249.6875, CrossEntropy: 0.004292544908821583, Accuracy: 0.9986213235294118\n",
      "Iter 707 / 2000, Loss: 127352766.78125, CrossEntropy: 0.004387154709547758, Accuracy: 0.9984814578005116\n",
      "Iter 708 / 2000, Loss: 127565372.90625, CrossEntropy: 0.004466872662305832, Accuracy: 0.9984414961636828\n",
      "Iter 709 / 2000, Loss: 125192903.84375, CrossEntropy: 0.003514290787279606, Accuracy: 0.9988411125319693\n",
      "Iter 710 / 2000, Loss: 129673618.25, CrossEntropy: 0.005304121412336826, Accuracy: 0.9984015345268542\n",
      "Iter 711 / 2000, Loss: 126247076.15625, CrossEntropy: 0.003931899555027485, Accuracy: 0.998821131713555\n",
      "Iter 712 / 2000, Loss: 125748067.34375, CrossEntropy: 0.003722907043993473, Accuracy: 0.9986612851662404\n",
      "Iter 713 / 2000, Loss: 126000910.6875, CrossEntropy: 0.00381892709992826, Accuracy: 0.9988011508951407\n",
      "Iter 714 / 2000, Loss: 125788504.71875, CrossEntropy: 0.0037300027906894684, Accuracy: 0.998761189258312\n",
      "Iter 715 / 2000, Loss: 124293270.75, CrossEntropy: 0.0031685330905020237, Accuracy: 0.9988570971867008\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 716 / 2000, Loss: 129354973.40625, CrossEntropy: 0.005146046634763479, Accuracy: 0.9984015345268542\n",
      "Iter 717 / 2000, Loss: 126013854.3125, CrossEntropy: 0.003812221810221672, Accuracy: 0.9987811700767263\n",
      "Iter 718 / 2000, Loss: 126463488.40625, CrossEntropy: 0.004027649760246277, Accuracy: 0.9987092391304349\n",
      "Iter 719 / 2000, Loss: 123575772.59375, CrossEntropy: 0.0028234284836798906, Accuracy: 0.9989809782608695\n",
      "Iter 720 / 2000, Loss: 126442343.03125, CrossEntropy: 0.003964555915445089, Accuracy: 0.9987412084398977\n",
      "Iter 721 / 2000, Loss: 126725190.3125, CrossEntropy: 0.004072797019034624, Accuracy: 0.998641304347826\n",
      "Iter 722 / 2000, Loss: 127868164.53125, CrossEntropy: 0.004524844698607922, Accuracy: 0.9984414961636828\n",
      "Iter 723 / 2000, Loss: 125540041.96875, CrossEntropy: 0.0035905404947698116, Accuracy: 0.9988411125319693\n",
      "Iter 724 / 2000, Loss: 126248714.5625, CrossEntropy: 0.003869046922773123, Accuracy: 0.998821131713555\n",
      "Iter 725 / 2000, Loss: 127067058.1875, CrossEntropy: 0.004196823574602604, Accuracy: 0.9985014386189258\n",
      "Iter 726 / 2000, Loss: 126024363.09375, CrossEntropy: 0.0037904721684753895, Accuracy: 0.9986892583120205\n",
      "Iter 727 / 2000, Loss: 126287038.96875, CrossEntropy: 0.003874434158205986, Accuracy: 0.9987811700767263\n",
      "Iter 728 / 2000, Loss: 127780562.875, CrossEntropy: 0.004464594181627035, Accuracy: 0.9985613810741688\n",
      "Iter 729 / 2000, Loss: 127085471.15625, CrossEntropy: 0.004182614851742983, Accuracy: 0.9987212276214834\n",
      "Iter 730 / 2000, Loss: 124630493.65625, CrossEntropy: 0.0031970604322850704, Accuracy: 0.9988610933503836\n",
      "Iter 731 / 2000, Loss: 126343211.53125, CrossEntropy: 0.003877441631630063, Accuracy: 0.9985214194373402\n",
      "Iter 732 / 2000, Loss: 125263808.375, CrossEntropy: 0.00344269210472703, Accuracy: 0.9987412084398977\n",
      "Iter 733 / 2000, Loss: 127185334.4375, CrossEntropy: 0.00424582976847887, Accuracy: 0.9985294117647059\n",
      "Iter 734 / 2000, Loss: 123540669.53125, CrossEntropy: 0.002747816266492009, Accuracy: 0.9991008631713555\n",
      "Iter 735 / 2000, Loss: 126105684.84375, CrossEntropy: 0.0037655376363545656, Accuracy: 0.998701246803069\n",
      "Iter 736 / 2000, Loss: 126202571.65625, CrossEntropy: 0.00380675564520061, Accuracy: 0.9987412084398977\n",
      "Iter 737 / 2000, Loss: 126668046.28125, CrossEntropy: 0.003982032183557749, Accuracy: 0.9986612851662404\n",
      "Iter 738 / 2000, Loss: 121833600.90625, CrossEntropy: 0.0020459701772779226, Accuracy: 0.9994005754475703\n",
      "Iter 739 / 2000, Loss: 130596296.1875, CrossEntropy: 0.005543248262256384, Accuracy: 0.9982416879795396\n",
      "Iter 740 / 2000, Loss: 122895063.90625, CrossEntropy: 0.0024631160777062178, Accuracy: 0.9992007672634271\n",
      "Iter 741 / 2000, Loss: 126421916.5625, CrossEntropy: 0.003906558733433485, Accuracy: 0.9987891624040921\n",
      "Iter 742 / 2000, Loss: 124628384.40625, CrossEntropy: 0.0031483087223023176, Accuracy: 0.998821131713555\n",
      "Iter 743 / 2000, Loss: 127807154.75, CrossEntropy: 0.004413787741214037, Accuracy: 0.9987412084398977\n",
      "Iter 744 / 2000, Loss: 126987035.78125, CrossEntropy: 0.004082222934812307, Accuracy: 0.9986812659846548\n",
      "Iter 745 / 2000, Loss: 125253870.40625, CrossEntropy: 0.0033851447515189648, Accuracy: 0.9988610933503836\n",
      "Iter 746 / 2000, Loss: 127970244.65625, CrossEntropy: 0.004479005467146635, Accuracy: 0.9986492966751919\n",
      "Iter 747 / 2000, Loss: 127333675.3125, CrossEntropy: 0.004236765671521425, Accuracy: 0.9985294117647059\n",
      "Iter 748 / 2000, Loss: 128473367.90625, CrossEntropy: 0.004657509736716747, Accuracy: 0.9985414002557544\n",
      "Iter 749 / 2000, Loss: 123756301.78125, CrossEntropy: 0.002769531449303031, Accuracy: 0.9989809782608695\n",
      "Iter 750 / 2000, Loss: 128055263.875, CrossEntropy: 0.00448265578597784, Accuracy: 0.9984814578005116\n",
      "Iter 751 / 2000, Loss: 124762178.25, CrossEntropy: 0.003179699182510376, Accuracy: 0.9988411125319693\n",
      "Iter 752 / 2000, Loss: 127922388.15625, CrossEntropy: 0.004422362893819809, Accuracy: 0.9986812659846548\n",
      "Iter 753 / 2000, Loss: 126475258.9375, CrossEntropy: 0.003840724704787135, Accuracy: 0.9987811700767263\n",
      "Iter 754 / 2000, Loss: 125659975.59375, CrossEntropy: 0.003509286092594266, Accuracy: 0.9988810741687979\n",
      "Iter 755 / 2000, Loss: 125672339.3125, CrossEntropy: 0.003509902162477374, Accuracy: 0.9989410166240409\n",
      "Iter 756 / 2000, Loss: 126727102.65625, CrossEntropy: 0.003927568439394236, Accuracy: 0.9987811700767263\n",
      "Iter 757 / 2000, Loss: 128219911.09375, CrossEntropy: 0.004519154783338308, Accuracy: 0.9985414002557544\n",
      "Iter 758 / 2000, Loss: 126053614.125, CrossEntropy: 0.003675434971228242, Accuracy: 0.9988890664961637\n",
      "Iter 759 / 2000, Loss: 125872430.40625, CrossEntropy: 0.003572432789951563, Accuracy: 0.998761189258312\n",
      "Iter 760 / 2000, Loss: 125024986.65625, CrossEntropy: 0.0032423678785562515, Accuracy: 0.9989490089514067\n",
      "Iter 761 / 2000, Loss: 127438650.96875, CrossEntropy: 0.004191477783024311, Accuracy: 0.998641304347826\n",
      "Iter 762 / 2000, Loss: 126601682.3125, CrossEntropy: 0.0038526987191289663, Accuracy: 0.9986612851662404\n",
      "Iter 763 / 2000, Loss: 129127803.90625, CrossEntropy: 0.004858051892369986, Accuracy: 0.9984015345268542\n",
      "Iter 764 / 2000, Loss: 126170909.0, CrossEntropy: 0.0036752570886164904, Accuracy: 0.9987412084398977\n",
      "Iter 765 / 2000, Loss: 125609244.71875, CrossEntropy: 0.003451381577178836, Accuracy: 0.9988610933503836\n",
      "Iter 766 / 2000, Loss: 124777706.28125, CrossEntropy: 0.0031470907852053642, Accuracy: 0.9990688938618926\n",
      "Iter 767 / 2000, Loss: 129040230.53125, CrossEntropy: 0.0048066009767353535, Accuracy: 0.9986612851662404\n",
      "Iter 768 / 2000, Loss: 127620068.1875, CrossEntropy: 0.0042350031435489655, Accuracy: 0.998761189258312\n",
      "Iter 769 / 2000, Loss: 125348867.59375, CrossEntropy: 0.003323544980958104, Accuracy: 0.9989010549872123\n",
      "Iter 770 / 2000, Loss: 125658503.1875, CrossEntropy: 0.0034468716476112604, Accuracy: 0.9988610933503836\n",
      "Iter 771 / 2000, Loss: 129640722.03125, CrossEntropy: 0.005029517691582441, Accuracy: 0.9983415920716112\n",
      "Iter 772 / 2000, Loss: 127974097.78125, CrossEntropy: 0.0043608457781374454, Accuracy: 0.9987412084398977\n",
      "Iter 773 / 2000, Loss: 125602789.78125, CrossEntropy: 0.003407224314287305, Accuracy: 0.9989410166240409\n",
      "Iter 774 / 2000, Loss: 127264406.25, CrossEntropy: 0.004067215137183666, Accuracy: 0.998641304347826\n",
      "Iter 775 / 2000, Loss: 127890822.375, CrossEntropy: 0.0043200356885790825, Accuracy: 0.9985613810741688\n",
      "Iter 776 / 2000, Loss: 124460582.09375, CrossEntropy: 0.002938805613666773, Accuracy: 0.9990609015345269\n",
      "Iter 777 / 2000, Loss: 127131059.28125, CrossEntropy: 0.004005757160484791, Accuracy: 0.9987212276214834\n",
      "Iter 778 / 2000, Loss: 126513404.46875, CrossEntropy: 0.0037535587325692177, Accuracy: 0.9987412084398977\n",
      "Iter 779 / 2000, Loss: 127147966.21875, CrossEntropy: 0.003998998552560806, Accuracy: 0.9985813618925832\n",
      "Iter 780 / 2000, Loss: 126674755.375, CrossEntropy: 0.0038257131818681955, Accuracy: 0.9987891624040921\n",
      "Iter 781 / 2000, Loss: 127192927.375, CrossEntropy: 0.004008932039141655, Accuracy: 0.998761189258312\n",
      "Iter 782 / 2000, Loss: 127025376.625, CrossEntropy: 0.003955424763262272, Accuracy: 0.9988291240409207\n",
      "Iter 783 / 2000, Loss: 128065295.75, CrossEntropy: 0.004348620306700468, Accuracy: 0.9986812659846548\n",
      "Iter 784 / 2000, Loss: 129885078.625, CrossEntropy: 0.005078965798020363, Accuracy: 0.9985014386189258\n",
      "Iter 785 / 2000, Loss: 125273982.21875, CrossEntropy: 0.0032246734481304884, Accuracy: 0.9989010549872123\n",
      "Iter 786 / 2000, Loss: 126524099.4375, CrossEntropy: 0.003719643922522664, Accuracy: 0.9987811700767263\n",
      "Iter 787 / 2000, Loss: 126973616.96875, CrossEntropy: 0.003930980339646339, Accuracy: 0.9986692774936061\n",
      "Iter 788 / 2000, Loss: 128439052.09375, CrossEntropy: 0.0045282780192792416, Accuracy: 0.9984574808184143\n",
      "Iter 789 / 2000, Loss: 124497484.9375, CrossEntropy: 0.002898377599194646, Accuracy: 0.9991008631713555\n",
      "Iter 790 / 2000, Loss: 128881237.3125, CrossEntropy: 0.004678049124777317, Accuracy: 0.9984494884910486\n",
      "Iter 791 / 2000, Loss: 126044961.9375, CrossEntropy: 0.0035083198454231024, Accuracy: 0.9989809782608695\n",
      "Iter 792 / 2000, Loss: 127806567.1875, CrossEntropy: 0.004210357088595629, Accuracy: 0.9986612851662404\n",
      "Iter 793 / 2000, Loss: 128929212.03125, CrossEntropy: 0.004659191705286503, Accuracy: 0.9986013427109974\n",
      "Iter 794 / 2000, Loss: 125196949.15625, CrossEntropy: 0.003165885806083679, Accuracy: 0.9989809782608695\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 795 / 2000, Loss: 128382846.5625, CrossEntropy: 0.004429686348885298, Accuracy: 0.9987212276214834\n",
      "Iter 796 / 2000, Loss: 128664494.78125, CrossEntropy: 0.004534875508397818, Accuracy: 0.9986612851662404\n",
      "Iter 797 / 2000, Loss: 126465434.96875, CrossEntropy: 0.0036519698332995176, Accuracy: 0.9987412084398977\n",
      "Iter 798 / 2000, Loss: 124035277.125, CrossEntropy: 0.0026772315613925457, Accuracy: 0.9990209398976982\n",
      "Iter 799 / 2000, Loss: 127284900.375, CrossEntropy: 0.003971862141042948, Accuracy: 0.9986013427109974\n",
      "Iter 800 / 2000, Loss: 125546098.46875, CrossEntropy: 0.0032731485553085804, Accuracy: 0.9990009590792839\n",
      "Iter 801 / 2000, Loss: 126280906.125, CrossEntropy: 0.003563451115041971, Accuracy: 0.9989210358056266\n",
      "Iter 802 / 2000, Loss: 126217172.15625, CrossEntropy: 0.003533458337187767, Accuracy: 0.9987811700767263\n",
      "Iter 803 / 2000, Loss: 125861540.1875, CrossEntropy: 0.0033875396475195885, Accuracy: 0.9989410166240409\n",
      "Iter 804 / 2000, Loss: 126956757.84375, CrossEntropy: 0.003821279387921095, Accuracy: 0.998761189258312\n",
      "Iter 805 / 2000, Loss: 126351343.25, CrossEntropy: 0.003576570190489292, Accuracy: 0.9987412084398977\n",
      "Iter 806 / 2000, Loss: 128714986.0, CrossEntropy: 0.0045166355557739735, Accuracy: 0.9986013427109974\n",
      "Iter 807 / 2000, Loss: 127489750.25, CrossEntropy: 0.004022585693746805, Accuracy: 0.998761189258312\n",
      "Iter 808 / 2000, Loss: 127047705.78125, CrossEntropy: 0.003841932164505124, Accuracy: 0.998821131713555\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-9-4d021ee3df5a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     12\u001b[0m \u001b[0;31m#         images = images.view(images.shape[0], -1)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m         loss, y_pred,_ = sgd_model.training_step(\n\u001b[0m\u001b[1;32m     15\u001b[0m             \u001b[0mbatch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimages\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     16\u001b[0m             \u001b[0mN\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mN\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Langevin_Variational_Inference/Deep_Nets/CNNs/ResNet/src/components.py\u001b[0m in \u001b[0;36mtraining_step\u001b[0;34m(self, batch, N, vi_batch_size, deterministic_weights)\u001b[0m\n\u001b[1;32m    380\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    381\u001b[0m         \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 382\u001b[0;31m         Y_hat, sample_dict = self.sample_pred(\n\u001b[0m\u001b[1;32m    383\u001b[0m             \u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    384\u001b[0m             \u001b[0mdeterministic\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdeterministic_weights\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Langevin_Variational_Inference/Deep_Nets/CNNs/ResNet/src/components.py\u001b[0m in \u001b[0;36msample_pred\u001b[0;34m(self, X, deterministic, vi_batch_size, for_training)\u001b[0m\n\u001b[1;32m    359\u001b[0m         )\n\u001b[1;32m    360\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 361\u001b[0;31m         Y_hat = self.forward(\n\u001b[0m\u001b[1;32m    362\u001b[0m             \u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    363\u001b[0m             \u001b[0msampled_weights\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msample_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/Langevin_Variational_Inference/Deep_Nets/CNNs/ResNet/src/components.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, X, sampled_weights)\u001b[0m\n\u001b[1;32m    818\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    819\u001b[0m                 \u001b[0mtemp_h\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconv_b\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp_h\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconv_b_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 820\u001b[0;31m                 \u001b[0mtemp_h\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mact_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbatch_norms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mconv_b_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtemp_h\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mtemp_h\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mtemp_h\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    821\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    822\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0ms\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0min_planes\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mplanes\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "num_epochs = 2000\n",
    "criterion = torch.nn.CrossEntropyLoss()  # loss function\n",
    "\n",
    "for i in range(num_epochs):\n",
    "    losses = []\n",
    "    cross_losses = []\n",
    "    accuracy = []\n",
    "    \n",
    "    for images, labels in trainloader:\n",
    "\n",
    "        # Flatten MNIST images into a 784 long vector\n",
    "#         images = images.view(images.shape[0], -1)\n",
    "\n",
    "        loss, y_pred,_ = sgd_model.training_step(\n",
    "            batch=(images, labels),\n",
    "            N=N,\n",
    "            deterministic_weights=True,\n",
    "            vi_batch_size=None,\n",
    "        )\n",
    "        losses.append(loss)\n",
    "        \n",
    "        cross_loss = criterion(y_pred.squeeze(0), labels)\n",
    "        cross_losses.append(cross_loss)\n",
    "        accuracy.append((torch.max(y_pred.squeeze(0),-1).indices == labels).sum().item() / labels.size(0))\n",
    "        \n",
    "\n",
    "#     if (i+1) % 10**math.floor(math.log10(i+1)) == 0:  # True when i+1 \\in {1, 2, ..., 10, 20, ..., 100, 200, ..., 1000, 2000, ...}\n",
    "    print(\"Iter {} / {}, Loss: {}, CrossEntropy: {}, Accuracy: {}\".format(i+1, num_epochs, sum(losses), sum(cross_losses)/len(cross_losses), sum(accuracy)/len(accuracy)))\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EVALUATION with last weights -> Loss: 12420517.0, CrossEntropy: 1.9860225915908813, Accuracy: 0.8195213607594937\n"
     ]
    }
   ],
   "source": [
    "#testing the model with its last weights \n",
    "\n",
    "losses = []\n",
    "cross_losses = []\n",
    "accuracy = []\n",
    "\n",
    "for images, labels in testloader:\n",
    "    inner_cross_losses = []\n",
    "    inner_accuracy = []\n",
    "\n",
    "    # Flatten MNIST images into a 784 long vector\n",
    "#     images = images.view(images.shape[0], -1)\n",
    "\n",
    "    loss, y_pred = sgd_model.evaluate(batch=(images, labels),\n",
    "                N=N,\n",
    "                num_samples=None,\n",
    "                deterministic_weights=True)\n",
    "\n",
    "    losses.append(loss)\n",
    "    cross_loss = criterion(y_pred.squeeze(0), labels)\n",
    "    inner_cross_losses.append(cross_loss)\n",
    "    inner_accuracy.append((torch.max(y_pred.squeeze(0),-1).indices == labels).sum().item() / labels.size(0))\n",
    "\n",
    "    accuracy.append(sum(inner_accuracy)/len(inner_accuracy))\n",
    "    cross_losses.append(sum(inner_cross_losses)/len(inner_cross_losses))\n",
    "\n",
    "print(\"EVALUATION with last weights -> Loss: {}, CrossEntropy: {}, Accuracy: {}\".format(sum(losses)/len(losses), sum(cross_losses)/len(cross_losses), sum(accuracy)/len(accuracy)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# torch.save(sgd_model.state_dict(), \"./sgd_resnet20_svhn_map.pt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LVI or non-LVI models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "291376\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "\n",
    "sgd_model = BayesianResNet20(**pickle.load(open(\"./resnet20_sgd_model_params.pickle\", \"rb\")))\n",
    "sgd_model.load_state_dict(torch.load(\"./sgd_resnet20_svhn_map.pt\", map_location=dev))\n",
    "# sgd_model.load_state_dict(torch.load(\"./cnn_svhn_non_lvi.pt\", map_location=dev))\n",
    "\n",
    "\n",
    "num_stoch_params = 0\n",
    "for param in sgd_model.get_stochastic_params():\n",
    "    param_size = 1\n",
    "    for dim in param.shape:\n",
    "        param_size *= dim\n",
    "    num_stoch_params += param_size\n",
    "print(num_stoch_params)\n",
    "\n",
    "lvi_model_params = pickle.load(open(\"./resnet20_sgd_model_params.pickle\", \"rb\"))\n",
    "lvi_model_params[\"group_by_layers\"] = False\n",
    "lvi_model_params[\"use_random_groups\"] = False\n",
    "lvi_model_params[\"use_permuted_groups\"] = True\n",
    "lvi_model_params[\"max_groups\"] = all\n",
    "lvi_model_params[\"dropout_prob\"] = 0.1\n",
    "lvi_model_params[\"chain_length\"] = 5000\n",
    "lvi_model_params[\"prior_std\"] = 0.3\n",
    "# lvi_model_params[\"output_distribution\"] = \"categorical\"\n",
    "# lvi_model_params[\"output_dist_const_params\"] = dict(scale=1.0)\n",
    "\n",
    "lvi_model_params[\"init_values\"] = {k:v.theta_actual.data for k,v in sgd_model.tensor_dict.items()}\n",
    "del sgd_model\n",
    "\n",
    "lvi_model = BayesianResNet20(**lvi_model_params)\n",
    "\n",
    "lvi_model.initialize_optimizer(\n",
    "    update_determ=False, \n",
    "    update_stoch=True, \n",
    "    lr=1e-2,\n",
    "#     lr=1e-3, \n",
    "    rmsprop=False,\n",
    "    sgd=False, \n",
    "    sgld=False, \n",
    "    psgld=False,\n",
    "    sghmc = True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dev = torch.device('cpu')\n",
    "# dev = torch.device('cuda:1')\n",
    "\n",
    "lvi_model = lvi_model.to(dev)\n",
    "for n, t in lvi_model.tensor_dict.items():\n",
    "    if isinstance(t, StochasticTensor):\n",
    "        t.prior_dist.loc = t.prior_dist.loc.to(dev)\n",
    "        t.prior_dist.scale = t.prior_dist.scale.to(dev)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Before initialization: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
      "        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:6')\n",
      "After initialization: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
      "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
      "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
      "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
      "        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
      "        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:6')\n"
     ]
    }
   ],
   "source": [
    "# print(\"Before initialization: {}\".format(sgld_model.num_samples_per_group))\n",
    "# sgld_model.init_chains()\n",
    "# print(\"After initialization: {}\".format(sgld_model.num_samples_per_group))\n",
    "print(\"Before initialization: {}\".format(lvi_model.num_samples_per_group))\n",
    "lvi_model.init_chains()\n",
    "print(\"After initialization: {}\".format(lvi_model.num_samples_per_group))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for lvi\n",
    "\n",
    "def evaluation(lvi_model, testloader):\n",
    "    losses = []\n",
    "    cross_losses = []\n",
    "    accuracy = []\n",
    "\n",
    "    for images, labels in testloader:\n",
    "        inner_cross_losses = []\n",
    "        inner_accuracy = []\n",
    "\n",
    "        loss, y_pred = lvi_model.evaluate(batch=(images, labels),\n",
    "                    N=N,\n",
    "                    num_samples=100,\n",
    "                    deterministic_weights=False)\n",
    "\n",
    "        losses.append(loss)\n",
    "        for j in range(y_pred.shape[0]):\n",
    "            cross_loss = criterion(y_pred.squeeze(0)[j], labels)\n",
    "            inner_cross_losses.append(cross_loss)\n",
    "            inner_accuracy.append((torch.max(y_pred.squeeze(0)[j],-1).indices == labels).sum().item() / labels.size(0))\n",
    "\n",
    "            accuracy.append(sum(inner_accuracy)/len(inner_accuracy))\n",
    "            cross_losses.append(sum(inner_cross_losses)/len(inner_cross_losses))\n",
    "\n",
    "    print(\"EVALUATION with 100 samples -> Loss: {}, CrossEntropy: {}, Accuracy: {}\".format(sum(losses)/len(losses), sum(cross_losses)/len(cross_losses), sum(accuracy)/len(accuracy)))\n",
    "    return sum(accuracy)/len(accuracy)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 1 / 2000, Loss: 2715210.5089514065, CrossEntropy: 0.34937024116516113, Accuracy: 0.87894820971867\n",
      "Elapsed time for the training: 15.139737606048584\n",
      "EVALUATION with 100 samples -> Loss: 6490845.5, CrossEntropy: 1.0225529670715332, Accuracy: 0.7031376084652786\n",
      "Iter 2 / 2000, Loss: 36006551.84654731, CrossEntropy: 0.5586907863616943, Accuracy: 0.8099144820971866\n",
      "Elapsed time for the training: 57.09121513366699\n",
      "EVALUATION with 100 samples -> Loss: 5634281.0, CrossEntropy: 0.8897112607955933, Accuracy: 0.7325287703975695\n",
      "Iter 3 / 2000, Loss: 34219296.1432225, CrossEntropy: 0.5308890342712402, Accuracy: 0.8181056186061382\n",
      "Elapsed time for the training: 57.54124903678894\n",
      "EVALUATION with 100 samples -> Loss: 5490728.0, CrossEntropy: 0.8720085620880127, Accuracy: 0.7366975759270077\n",
      "Iter 4 / 2000, Loss: 32570892.29411765, CrossEntropy: 0.5050051808357239, Accuracy: 0.8257997322570333\n",
      "Elapsed time for the training: 58.128554582595825\n",
      "EVALUATION with 100 samples -> Loss: 5335754.5, CrossEntropy: 0.8413407802581787, Accuracy: 0.7428319596414269\n",
      "Iter 5 / 2000, Loss: 31068478.29667519, CrossEntropy: 0.48169320821762085, Accuracy: 0.8325437579923274\n",
      "Elapsed time for the training: 58.023194551467896\n",
      "EVALUATION with 100 samples -> Loss: 5203860.5, CrossEntropy: 0.8179665207862854, Accuracy: 0.7485111908591329\n",
      "Iter 6 / 2000, Loss: 30061359.2685422, CrossEntropy: 0.4657258987426758, Accuracy: 0.8376658407928389\n",
      "Elapsed time for the training: 58.38040232658386\n",
      "EVALUATION with 100 samples -> Loss: 5095143.0, CrossEntropy: 0.8000845909118652, Accuracy: 0.7525793483193883\n",
      "Iter 7 / 2000, Loss: 28737963.421994884, CrossEntropy: 0.4451114535331726, Accuracy: 0.8445617207480819\n",
      "Elapsed time for the training: 58.26767683029175\n",
      "EVALUATION with 100 samples -> Loss: 5013134.5, CrossEntropy: 0.790156900882721, Accuracy: 0.7557025295896844\n",
      "Iter 8 / 2000, Loss: 27445089.67774936, CrossEntropy: 0.4247331917285919, Accuracy: 0.8512537963554988\n",
      "Elapsed time for the training: 58.30250787734985\n",
      "EVALUATION with 100 samples -> Loss: 4916383.0, CrossEntropy: 0.7778034210205078, Accuracy: 0.7592713571765247\n",
      "Iter 9 / 2000, Loss: 26464456.841432225, CrossEntropy: 0.40954411029815674, Accuracy: 0.8558698649296674\n",
      "Elapsed time for the training: 58.24937438964844\n",
      "EVALUATION with 100 samples -> Loss: 4824370.0, CrossEntropy: 0.7577549815177917, Accuracy: 0.7641617243730557\n",
      "Iter 10 / 2000, Loss: 25315535.017902814, CrossEntropy: 0.3915291428565979, Accuracy: 0.8620584239130435\n",
      "Elapsed time for the training: 58.29200530052185\n",
      "EVALUATION with 100 samples -> Loss: 4754489.0, CrossEntropy: 0.756524920463562, Accuracy: 0.7669264854508956\n",
      "Iter 11 / 2000, Loss: 24386826.50383632, CrossEntropy: 0.3769836127758026, Accuracy: 0.8665321291560102\n",
      "Elapsed time for the training: 58.252137184143066\n",
      "EVALUATION with 100 samples -> Loss: 4662873.0, CrossEntropy: 0.7385167479515076, Accuracy: 0.7698022189140216\n",
      "Iter 12 / 2000, Loss: 23400282.301790282, CrossEntropy: 0.36157792806625366, Accuracy: 0.8729439737851662\n",
      "Elapsed time for the training: 58.192901611328125\n",
      "EVALUATION with 100 samples -> Loss: 4600047.5, CrossEntropy: 0.7275495529174805, Accuracy: 0.7715276886536184\n",
      "Iter 13 / 2000, Loss: 22514201.350383632, CrossEntropy: 0.3478020429611206, Accuracy: 0.8776099944053708\n",
      "Elapsed time for the training: 58.25336766242981\n",
      "EVALUATION with 100 samples -> Loss: 4535557.0, CrossEntropy: 0.713505744934082, Accuracy: 0.7751382179834779\n",
      "Iter 14 / 2000, Loss: 21590397.43478261, CrossEntropy: 0.33331599831581116, Accuracy: 0.8822974944053709\n",
      "Elapsed time for the training: 58.24562692642212\n",
      "EVALUATION with 100 samples -> Loss: 4475005.5, CrossEntropy: 0.7066295742988586, Accuracy: 0.7773761275888398\n",
      "Iter 15 / 2000, Loss: 20758315.815856777, CrossEntropy: 0.32031360268592834, Accuracy: 0.8874595388427109\n",
      "Elapsed time for the training: 58.14265704154968\n",
      "EVALUATION with 100 samples -> Loss: 4449866.0, CrossEntropy: 0.7045375108718872, Accuracy: 0.7794122900090137\n",
      "Iter 16 / 2000, Loss: 19944812.884910487, CrossEntropy: 0.30757784843444824, Accuracy: 0.8926390664961638\n",
      "Elapsed time for the training: 58.3108229637146\n",
      "EVALUATION with 100 samples -> Loss: 4381132.0, CrossEntropy: 0.6994114518165588, Accuracy: 0.7801813452489521\n",
      "Iter 17 / 2000, Loss: 19074721.230179027, CrossEntropy: 0.29391711950302124, Accuracy: 0.8975763267263427\n",
      "Elapsed time for the training: 58.27527737617493\n",
      "EVALUATION with 100 samples -> Loss: 4350551.0, CrossEntropy: 0.6859989762306213, Accuracy: 0.784547478944558\n",
      "Iter 18 / 2000, Loss: 18446432.416879795, CrossEntropy: 0.28403961658477783, Accuracy: 0.9016494165601023\n",
      "Elapsed time for the training: 58.333086013793945\n",
      "EVALUATION with 100 samples -> Loss: 4295310.5, CrossEntropy: 0.678143322467804, Accuracy: 0.7862900623962698\n",
      "Iter 19 / 2000, Loss: 17698100.76214834, CrossEntropy: 0.2723377048969269, Accuracy: 0.9056995284526853\n",
      "Elapsed time for the training: 58.216978549957275\n",
      "EVALUATION with 100 samples -> Loss: 4266839.0, CrossEntropy: 0.6687498688697815, Accuracy: 0.7888184006030716\n",
      "Iter 20 / 2000, Loss: 17063194.805626597, CrossEntropy: 0.2624349594116211, Accuracy: 0.9090822810102301\n",
      "Elapsed time for the training: 58.146708488464355\n",
      "EVALUATION with 100 samples -> Loss: 4220381.5, CrossEntropy: 0.6616508364677429, Accuracy: 0.7905065918187608\n",
      "Iter 21 / 2000, Loss: 16487118.997442456, CrossEntropy: 0.2534387707710266, Accuracy: 0.9137193294437339\n",
      "Elapsed time for the training: 58.225372076034546\n",
      "EVALUATION with 100 samples -> Loss: 4202930.5, CrossEntropy: 0.6616676449775696, Accuracy: 0.7900729142359404\n",
      "Iter 22 / 2000, Loss: 15817090.864450129, CrossEntropy: 0.24300271272659302, Accuracy: 0.9178633511828644\n",
      "Elapsed time for the training: 58.268900871276855\n",
      "EVALUATION with 100 samples -> Loss: 4171649.25, CrossEntropy: 0.6633673310279846, Accuracy: 0.7906537819045801\n",
      "Iter 23 / 2000, Loss: 15346437.098465472, CrossEntropy: 0.23578953742980957, Accuracy: 0.9207470828005115\n",
      "Elapsed time for the training: 58.300007343292236\n",
      "EVALUATION with 100 samples -> Loss: 4167087.5, CrossEntropy: 0.6569070816040039, Accuracy: 0.7917399825807723\n",
      "Iter 24 / 2000, Loss: 14873653.01662404, CrossEntropy: 0.22832949459552765, Accuracy: 0.923854599584399\n",
      "Elapsed time for the training: 58.41043496131897\n",
      "EVALUATION with 100 samples -> Loss: 4161580.75, CrossEntropy: 0.656834602355957, Accuracy: 0.7919076577345401\n",
      "Iter 25 / 2000, Loss: 14366339.234015346, CrossEntropy: 0.22042059898376465, Accuracy: 0.9267053628516624\n",
      "Elapsed time for the training: 58.240822553634644\n",
      "EVALUATION with 100 samples -> Loss: 4126158.75, CrossEntropy: 0.6506656408309937, Accuracy: 0.7927317622513432\n",
      "Iter 26 / 2000, Loss: 13791410.996163683, CrossEntropy: 0.21128027141094208, Accuracy: 0.9308848505434784\n",
      "Elapsed time for the training: 58.210659980773926\n",
      "EVALUATION with 100 samples -> Loss: 4113280.0, CrossEntropy: 0.6523059010505676, Accuracy: 0.793620858814208\n",
      "Iter 27 / 2000, Loss: 13370046.60230179, CrossEntropy: 0.20481133460998535, Accuracy: 0.9335687739769821\n",
      "Elapsed time for the training: 58.242884397506714\n",
      "EVALUATION with 100 samples -> Loss: 4112111.0, CrossEntropy: 0.6482946276664734, Accuracy: 0.793634125619934\n",
      "Iter 28 / 2000, Loss: 12948222.643222507, CrossEntropy: 0.198109969496727, Accuracy: 0.9359689697890026\n",
      "Elapsed time for the training: 58.26414155960083\n",
      "EVALUATION with 100 samples -> Loss: 4104707.0, CrossEntropy: 0.647655189037323, Accuracy: 0.7945977485618578\n",
      "Iter 29 / 2000, Loss: 12511343.836317135, CrossEntropy: 0.19124479591846466, Accuracy: 0.9385879555626598\n",
      "Elapsed time for the training: 58.275410413742065\n",
      "EVALUATION with 100 samples -> Loss: 4128134.5, CrossEntropy: 0.6533522605895996, Accuracy: 0.79472158598423\n",
      "Iter 30 / 2000, Loss: 12063135.58056266, CrossEntropy: 0.18426720798015594, Accuracy: 0.9418658088235294\n",
      "Elapsed time for the training: 58.22355890274048\n",
      "EVALUATION with 100 samples -> Loss: 4096961.25, CrossEntropy: 0.6447520852088928, Accuracy: 0.7958969758605823\n",
      "Iter 31 / 2000, Loss: 11749798.39769821, CrossEntropy: 0.17936266958713531, Accuracy: 0.9437045436381074\n",
      "Elapsed time for the training: 58.34224820137024\n",
      "EVALUATION with 100 samples -> Loss: 4091440.25, CrossEntropy: 0.645074188709259, Accuracy: 0.796928065600784\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 32 / 2000, Loss: 11227373.754475703, CrossEntropy: 0.17120607197284698, Accuracy: 0.9472881034207161\n",
      "Elapsed time for the training: 58.203850746154785\n",
      "EVALUATION with 100 samples -> Loss: 4091329.75, CrossEntropy: 0.6422914862632751, Accuracy: 0.7983548971798862\n",
      "Iter 33 / 2000, Loss: 10853733.891304348, CrossEntropy: 0.16542907059192657, Accuracy: 0.9493406329923273\n",
      "Elapsed time for the training: 58.30840277671814\n",
      "EVALUATION with 100 samples -> Loss: 4093507.75, CrossEntropy: 0.6443257927894592, Accuracy: 0.7993177519496338\n",
      "Iter 34 / 2000, Loss: 10464495.960358057, CrossEntropy: 0.15930786728858948, Accuracy: 0.9520375439578005\n",
      "Elapsed time for the training: 58.25183987617493\n",
      "EVALUATION with 100 samples -> Loss: 4108738.25, CrossEntropy: 0.647723376750946, Accuracy: 0.7983505111638076\n",
      "Iter 35 / 2000, Loss: 10153301.851662405, CrossEntropy: 0.15450531244277954, Accuracy: 0.953960697730179\n",
      "Elapsed time for the training: 58.22048616409302\n",
      "EVALUATION with 100 samples -> Loss: 4105592.5, CrossEntropy: 0.643559992313385, Accuracy: 0.8015274421670078\n",
      "Iter 36 / 2000, Loss: 9816364.411764706, CrossEntropy: 0.1491750180721283, Accuracy: 0.9561396059782609\n",
      "Elapsed time for the training: 58.24428367614746\n",
      "EVALUATION with 100 samples -> Loss: 4103226.0, CrossEntropy: 0.646404504776001, Accuracy: 0.8003176076518028\n",
      "Iter 37 / 2000, Loss: 9450478.42455243, CrossEntropy: 0.1434851735830307, Accuracy: 0.9588075447570332\n",
      "Elapsed time for the training: 58.16686820983887\n",
      "EVALUATION with 100 samples -> Loss: 4127484.5, CrossEntropy: 0.6532695293426514, Accuracy: 0.7990922353921156\n",
      "Iter 38 / 2000, Loss: 9149835.12404092, CrossEntropy: 0.13877905905246735, Accuracy: 0.9604934263107417\n",
      "Elapsed time for the training: 58.13064956665039\n",
      "EVALUATION with 100 samples -> Loss: 4141978.0, CrossEntropy: 0.6568807363510132, Accuracy: 0.7986965393531672\n",
      "Iter 39 / 2000, Loss: 8860421.460358057, CrossEntropy: 0.13425065577030182, Accuracy: 0.9623331601662404\n",
      "Elapsed time for the training: 58.2777533531189\n",
      "EVALUATION with 100 samples -> Loss: 4164575.0, CrossEntropy: 0.6638577580451965, Accuracy: 0.7994619985022291\n",
      "Iter 40 / 2000, Loss: 8594284.586956521, CrossEntropy: 0.13010022044181824, Accuracy: 0.9635080322890026\n",
      "Elapsed time for the training: 58.227158069610596\n",
      "EVALUATION with 100 samples -> Loss: 4158115.25, CrossEntropy: 0.6583829522132874, Accuracy: 0.7999868920172655\n",
      "Iter 41 / 2000, Loss: 8278818.521739131, CrossEntropy: 0.12517328560352325, Accuracy: 0.9657189098465474\n",
      "Elapsed time for the training: 58.23072123527527\n",
      "EVALUATION with 100 samples -> Loss: 4181502.5, CrossEntropy: 0.6557952165603638, Accuracy: 0.8024038698510472\n",
      "Iter 42 / 2000, Loss: 7999723.35230179, CrossEntropy: 0.12080828100442886, Accuracy: 0.9676970108695653\n",
      "Elapsed time for the training: 58.30859303474426\n",
      "EVALUATION with 100 samples -> Loss: 4193482.75, CrossEntropy: 0.6574153304100037, Accuracy: 0.8033174280993876\n",
      "Iter 43 / 2000, Loss: 7716963.082480818, CrossEntropy: 0.1164073720574379, Accuracy: 0.9694778013107418\n",
      "Elapsed time for the training: 58.33590865135193\n",
      "EVALUATION with 100 samples -> Loss: 4203298.0, CrossEntropy: 0.6678599715232849, Accuracy: 0.8011655575734401\n",
      "Iter 44 / 2000, Loss: 7455064.34398977, CrossEntropy: 0.11228858679533005, Accuracy: 0.9711956521739131\n",
      "Elapsed time for the training: 58.47918963432312\n",
      "EVALUATION with 100 samples -> Loss: 4254207.0, CrossEntropy: 0.6715497374534607, Accuracy: 0.801403591368872\n",
      "Iter 45 / 2000, Loss: 7190415.192455243, CrossEntropy: 0.1081685870885849, Accuracy: 0.9727002078005115\n",
      "Elapsed time for the training: 58.31327772140503\n",
      "EVALUATION with 100 samples -> Loss: 4254080.0, CrossEntropy: 0.6791713833808899, Accuracy: 0.8012435462386112\n",
      "Iter 46 / 2000, Loss: 6930206.431585678, CrossEntropy: 0.1041182279586792, Accuracy: 0.9742442255434782\n",
      "Elapsed time for the training: 58.28481674194336\n",
      "EVALUATION with 100 samples -> Loss: 4275100.5, CrossEntropy: 0.6743646860122681, Accuracy: 0.802110021215529\n",
      "Iter 47 / 2000, Loss: 6703953.883631714, CrossEntropy: 0.1006372720003128, Accuracy: 0.9755065137468031\n",
      "Elapsed time for the training: 58.3128297328949\n",
      "EVALUATION with 100 samples -> Loss: 4293329.5, CrossEntropy: 0.6755470633506775, Accuracy: 0.801872499183443\n",
      "Iter 48 / 2000, Loss: 6455543.913682864, CrossEntropy: 0.09675006568431854, Accuracy: 0.9775485533887468\n",
      "Elapsed time for the training: 58.349971771240234\n",
      "EVALUATION with 100 samples -> Loss: 4322287.5, CrossEntropy: 0.6786208748817444, Accuracy: 0.8015695033559618\n",
      "Iter 49 / 2000, Loss: 6236530.302429668, CrossEntropy: 0.09331529587507248, Accuracy: 0.9784986413043478\n",
      "Elapsed time for the training: 58.26516675949097\n",
      "EVALUATION with 100 samples -> Loss: 4304287.5, CrossEntropy: 0.6826485395431519, Accuracy: 0.8008475059379871\n",
      "Iter 50 / 2000, Loss: 6003210.863171356, CrossEntropy: 0.08959576487541199, Accuracy: 0.9800521499360614\n",
      "Elapsed time for the training: 58.213847160339355\n",
      "EVALUATION with 100 samples -> Loss: 4331434.5, CrossEntropy: 0.6834745407104492, Accuracy: 0.8011569606214455\n",
      "Iter 51 / 2000, Loss: 5790052.6745524295, CrossEntropy: 0.08636517822742462, Accuracy: 0.9812395100703324\n",
      "Elapsed time for the training: 58.24240684509277\n",
      "EVALUATION with 100 samples -> Loss: 4348336.5, CrossEntropy: 0.6874404549598694, Accuracy: 0.800709097613957\n",
      "Iter 52 / 2000, Loss: 5600443.956521739, CrossEntropy: 0.08336268365383148, Accuracy: 0.9821171675191815\n",
      "Elapsed time for the training: 58.206743478775024\n",
      "EVALUATION with 100 samples -> Loss: 4372783.5, CrossEntropy: 0.6831952929496765, Accuracy: 0.8033868138557285\n",
      "Iter 53 / 2000, Loss: 5402235.454603581, CrossEntropy: 0.08035971224308014, Accuracy: 0.9835922514386188\n",
      "Elapsed time for the training: 58.25810623168945\n",
      "EVALUATION with 100 samples -> Loss: 4414266.5, CrossEntropy: 0.6986412405967712, Accuracy: 0.7994828266285087\n",
      "Iter 54 / 2000, Loss: 5229511.129795397, CrossEntropy: 0.07755948603153229, Accuracy: 0.9846002837276215\n",
      "Elapsed time for the training: 58.23746347427368\n",
      "EVALUATION with 100 samples -> Loss: 4408239.0, CrossEntropy: 0.7025784850120544, Accuracy: 0.7997213727518411\n",
      "Iter 55 / 2000, Loss: 5016604.355498721, CrossEntropy: 0.0742368996143341, Accuracy: 0.9857321970907928\n",
      "Elapsed time for the training: 58.218098878860474\n",
      "EVALUATION with 100 samples -> Loss: 4418250.5, CrossEntropy: 0.6933525800704956, Accuracy: 0.801337325848001\n",
      "Iter 56 / 2000, Loss: 4816891.0095907925, CrossEntropy: 0.07110615819692612, Accuracy: 0.9867532169117648\n",
      "Elapsed time for the training: 58.117037534713745\n",
      "EVALUATION with 100 samples -> Loss: 4441827.0, CrossEntropy: 0.6960241198539734, Accuracy: 0.80221860500092\n",
      "Iter 57 / 2000, Loss: 4653614.064578005, CrossEntropy: 0.06864708662033081, Accuracy: 0.987534966432225\n",
      "Elapsed time for the training: 58.17896318435669\n",
      "EVALUATION with 100 samples -> Loss: 4475525.5, CrossEntropy: 0.7031693458557129, Accuracy: 0.8012096150803905\n",
      "Iter 58 / 2000, Loss: 4516779.84398977, CrossEntropy: 0.06647295504808426, Accuracy: 0.9883916440217392\n",
      "Elapsed time for the training: 58.186877489089966\n",
      "EVALUATION with 100 samples -> Loss: 4483438.5, CrossEntropy: 0.7023869752883911, Accuracy: 0.8021773805631912\n",
      "Iter 59 / 2000, Loss: 4374148.43797954, CrossEntropy: 0.06426024436950684, Accuracy: 0.9889825767263427\n",
      "Elapsed time for the training: 58.233476638793945\n",
      "EVALUATION with 100 samples -> Loss: 4510657.5, CrossEntropy: 0.7096408605575562, Accuracy: 0.8007075861965416\n",
      "Iter 60 / 2000, Loss: 4226795.982097187, CrossEntropy: 0.06192311644554138, Accuracy: 0.9900165840792838\n",
      "Elapsed time for the training: 58.30101990699768\n",
      "EVALUATION with 100 samples -> Loss: 4517834.0, CrossEntropy: 0.7081893086433411, Accuracy: 0.8022048405848088\n",
      "Iter 61 / 2000, Loss: 4066564.635549872, CrossEntropy: 0.059396892786026, Accuracy: 0.9907303988171355\n",
      "Elapsed time for the training: 58.24307680130005\n",
      "EVALUATION with 100 samples -> Loss: 4559413.5, CrossEntropy: 0.7174937129020691, Accuracy: 0.8009037904054587\n",
      "Iter 62 / 2000, Loss: 3939964.6537723783, CrossEntropy: 0.05746256932616234, Accuracy: 0.9912853660485933\n",
      "Elapsed time for the training: 58.299384117126465\n",
      "EVALUATION with 100 samples -> Loss: 4569835.5, CrossEntropy: 0.7210066914558411, Accuracy: 0.8013768561818081\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 63 / 2000, Loss: 3810382.0530690537, CrossEntropy: 0.05543358623981476, Accuracy: 0.991837336157289\n",
      "Elapsed time for the training: 58.31538271903992\n",
      "EVALUATION with 100 samples -> Loss: 4588125.5, CrossEntropy: 0.7355416417121887, Accuracy: 0.7994975992133789\n",
      "Iter 64 / 2000, Loss: 3650785.6310741687, CrossEntropy: 0.053015369921922684, Accuracy: 0.9927040041560102\n",
      "Elapsed time for the training: 58.2716121673584\n",
      "EVALUATION with 100 samples -> Loss: 4618348.5, CrossEntropy: 0.728718101978302, Accuracy: 0.8016551143471826\n",
      "Iter 65 / 2000, Loss: 3527882.682225064, CrossEntropy: 0.05100064352154732, Accuracy: 0.9930626598465474\n",
      "Elapsed time for the training: 58.47449088096619\n",
      "EVALUATION with 100 samples -> Loss: 4637619.5, CrossEntropy: 0.7311728000640869, Accuracy: 0.8027164162181606\n",
      "Iter 66 / 2000, Loss: 3409866.854539642, CrossEntropy: 0.04917813837528229, Accuracy: 0.9936545915920717\n",
      "Elapsed time for the training: 58.30227446556091\n",
      "EVALUATION with 100 samples -> Loss: 4656242.0, CrossEntropy: 0.7287371754646301, Accuracy: 0.8035862533648287\n",
      "Iter 67 / 2000, Loss: 3332107.783248082, CrossEntropy: 0.047999076545238495, Accuracy: 0.9937205282928389\n",
      "Elapsed time for the training: 58.23735523223877\n",
      "EVALUATION with 100 samples -> Loss: 4684396.0, CrossEntropy: 0.7416803240776062, Accuracy: 0.8027839486451971\n",
      "Iter 68 / 2000, Loss: 3179875.9171994883, CrossEntropy: 0.045581310987472534, Accuracy: 0.9945802030051151\n",
      "Elapsed time for the training: 58.19329333305359\n",
      "EVALUATION with 100 samples -> Loss: 4712326.0, CrossEntropy: 0.7505578994750977, Accuracy: 0.8007445574819573\n",
      "Iter 69 / 2000, Loss: 3104564.310421995, CrossEntropy: 0.044442757964134216, Accuracy: 0.9947205682544757\n",
      "Elapsed time for the training: 58.203654289245605\n",
      "EVALUATION with 100 samples -> Loss: 4712303.5, CrossEntropy: 0.7457011938095093, Accuracy: 0.8030592421638698\n",
      "Iter 70 / 2000, Loss: 3011868.360933504, CrossEntropy: 0.04300656542181969, Accuracy: 0.9952755354859336\n",
      "Elapsed time for the training: 58.32969617843628\n",
      "EVALUATION with 100 samples -> Loss: 4773372.5, CrossEntropy: 0.7541062235832214, Accuracy: 0.8026885548290084\n",
      "Iter 71 / 2000, Loss: 2879186.036445013, CrossEntropy: 0.04091423377394676, Accuracy: 0.9958145180626599\n",
      "Elapsed time for the training: 58.39107894897461\n",
      "EVALUATION with 100 samples -> Loss: 4779980.5, CrossEntropy: 0.7513427138328552, Accuracy: 0.8038321705258681\n",
      "Iter 72 / 2000, Loss: 2793410.3698849105, CrossEntropy: 0.039574384689331055, Accuracy: 0.996046295556266\n",
      "Elapsed time for the training: 58.24119758605957\n",
      "EVALUATION with 100 samples -> Loss: 4781717.0, CrossEntropy: 0.7533137202262878, Accuracy: 0.8031289790435056\n",
      "Iter 73 / 2000, Loss: 2732568.23657289, CrossEntropy: 0.03862486779689789, Accuracy: 0.9962411085358056\n",
      "Elapsed time for the training: 58.3854124546051\n",
      "EVALUATION with 100 samples -> Loss: 4818387.5, CrossEntropy: 0.7610064148902893, Accuracy: 0.8020568689441689\n",
      "Iter 74 / 2000, Loss: 2655348.9996803068, CrossEntropy: 0.037458378821611404, Accuracy: 0.9963924632352941\n",
      "Elapsed time for the training: 58.314574003219604\n",
      "EVALUATION with 100 samples -> Loss: 4835274.0, CrossEntropy: 0.7700203061103821, Accuracy: 0.8025899918132646\n",
      "Iter 75 / 2000, Loss: 2560744.2084398977, CrossEntropy: 0.035984914749860764, Accuracy: 0.996806565696931\n",
      "Elapsed time for the training: 58.274885416030884\n",
      "EVALUATION with 100 samples -> Loss: 4867415.0, CrossEntropy: 0.7635794878005981, Accuracy: 0.8030008880853126\n",
      "Iter 76 / 2000, Loss: 2487516.179347826, CrossEntropy: 0.03484726697206497, Accuracy: 0.9969449328644501\n",
      "Elapsed time for the training: 58.224263191223145\n",
      "EVALUATION with 100 samples -> Loss: 4920846.5, CrossEntropy: 0.783163845539093, Accuracy: 0.8023910070473579\n",
      "Iter 77 / 2000, Loss: 2399495.5617007674, CrossEntropy: 0.03346317261457443, Accuracy: 0.9972486413043479\n",
      "Elapsed time for the training: 58.21888065338135\n",
      "EVALUATION with 100 samples -> Loss: 4898337.5, CrossEntropy: 0.7804448008537292, Accuracy: 0.8024287324328382\n",
      "Iter 78 / 2000, Loss: 2333932.588235294, CrossEntropy: 0.032460249960422516, Accuracy: 0.9973885070332481\n",
      "Elapsed time for the training: 58.269996643066406\n",
      "EVALUATION with 100 samples -> Loss: 4926685.5, CrossEntropy: 0.7736069560050964, Accuracy: 0.803106326033666\n",
      "Iter 79 / 2000, Loss: 2244640.980818414, CrossEntropy: 0.03103495202958584, Accuracy: 0.9978370764066496\n",
      "Elapsed time for the training: 58.3096706867218\n",
      "EVALUATION with 100 samples -> Loss: 4942023.5, CrossEntropy: 0.7783955335617065, Accuracy: 0.8022642869705237\n",
      "Iter 80 / 2000, Loss: 2177342.514386189, CrossEntropy: 0.030030110850930214, Accuracy: 0.9978570572250639\n",
      "Elapsed time for the training: 58.1921603679657\n",
      "EVALUATION with 100 samples -> Loss: 4979078.5, CrossEntropy: 0.7840690612792969, Accuracy: 0.802556771095455\n",
      "Iter 81 / 2000, Loss: 2156097.8420716114, CrossEntropy: 0.02970331534743309, Accuracy: 0.9978096027813299\n",
      "Elapsed time for the training: 58.419703006744385\n",
      "EVALUATION with 100 samples -> Loss: 5041664.5, CrossEntropy: 0.7998002171516418, Accuracy: 0.8008425608869545\n",
      "Iter 82 / 2000, Loss: 2079319.8935421994, CrossEntropy: 0.028476107865571976, Accuracy: 0.9981567695012787\n",
      "Elapsed time for the training: 58.473438024520874\n",
      "EVALUATION with 100 samples -> Loss: 5056867.0, CrossEntropy: 0.7945358753204346, Accuracy: 0.8007628568746494\n",
      "Iter 83 / 2000, Loss: 2006784.4632352942, CrossEntropy: 0.027354178950190544, Accuracy: 0.9983835517902813\n",
      "Elapsed time for the training: 58.23772168159485\n",
      "EVALUATION with 100 samples -> Loss: 5057809.5, CrossEntropy: 0.7977258563041687, Accuracy: 0.8008656679731047\n",
      "Iter 84 / 2000, Loss: 2000216.1099744246, CrossEntropy: 0.027300158515572548, Accuracy: 0.9982187100383632\n",
      "Elapsed time for the training: 58.321335554122925\n",
      "EVALUATION with 100 samples -> Loss: 5111756.5, CrossEntropy: 0.8093382716178894, Accuracy: 0.8000818390711335\n",
      "Iter 85 / 2000, Loss: 1944466.7592710997, CrossEntropy: 0.026405934244394302, Accuracy: 0.9985039362212276\n",
      "Elapsed time for the training: 58.18046474456787\n",
      "EVALUATION with 100 samples -> Loss: 5081621.0, CrossEntropy: 0.800325334072113, Accuracy: 0.8001386449106591\n",
      "Iter 86 / 2000, Loss: 1889514.0615409208, CrossEntropy: 0.025564588606357574, Accuracy: 0.9985409007352941\n",
      "Elapsed time for the training: 58.33541989326477\n",
      "EVALUATION with 100 samples -> Loss: 5148384.5, CrossEntropy: 0.8080232739448547, Accuracy: 0.8000413530774195\n",
      "Iter 87 / 2000, Loss: 1840814.225703325, CrossEntropy: 0.024811869487166405, Accuracy: 0.9986178268861892\n",
      "Elapsed time for the training: 58.34962272644043\n",
      "EVALUATION with 100 samples -> Loss: 5145395.0, CrossEntropy: 0.811299204826355, Accuracy: 0.8003062620002401\n",
      "Iter 88 / 2000, Loss: 1811595.572090793, CrossEntropy: 0.024349190294742584, Accuracy: 0.9986622842071612\n",
      "Elapsed time for the training: 58.176780700683594\n",
      "EVALUATION with 100 samples -> Loss: 5203266.0, CrossEntropy: 0.8213995695114136, Accuracy: 0.7989282150788757\n",
      "Iter 89 / 2000, Loss: 1777148.4037723786, CrossEntropy: 0.02383875474333763, Accuracy: 0.9985608815537084\n",
      "Elapsed time for the training: 58.18376326560974\n",
      "EVALUATION with 100 samples -> Loss: 5193279.5, CrossEntropy: 0.8212692141532898, Accuracy: 0.7989031089376492\n",
      "Iter 90 / 2000, Loss: 1752370.9977621483, CrossEntropy: 0.02342107705771923, Accuracy: 0.9985289122442456\n",
      "Elapsed time for the training: 58.1248037815094\n",
      "EVALUATION with 100 samples -> Loss: 5217992.5, CrossEntropy: 0.8257759809494019, Accuracy: 0.8000632971430421\n",
      "Iter 91 / 2000, Loss: 1689036.2145140665, CrossEntropy: 0.022477740421891212, Accuracy: 0.9987796715153453\n",
      "Elapsed time for the training: 58.27134990692139\n",
      "EVALUATION with 100 samples -> Loss: 5242223.5, CrossEntropy: 0.8218739628791809, Accuracy: 0.8013627028984746\n",
      "Iter 92 / 2000, Loss: 1657639.9413363172, CrossEntropy: 0.021996475756168365, Accuracy: 0.9987836676790282\n",
      "Elapsed time for the training: 58.1583137512207\n",
      "EVALUATION with 100 samples -> Loss: 5278364.0, CrossEntropy: 0.8382292985916138, Accuracy: 0.799513285626122\n",
      "Iter 93 / 2000, Loss: 1607710.7605498722, CrossEntropy: 0.02124156802892685, Accuracy: 0.9989435142263428\n",
      "Elapsed time for the training: 58.09903812408447\n",
      "EVALUATION with 100 samples -> Loss: 5285412.0, CrossEntropy: 0.8342238068580627, Accuracy: 0.8016032548544516\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 94 / 2000, Loss: 1587101.6104539642, CrossEntropy: 0.02088003046810627, Accuracy: 0.9989315257352942\n",
      "Elapsed time for the training: 58.231104373931885\n",
      "EVALUATION with 100 samples -> Loss: 5278421.0, CrossEntropy: 0.8425154089927673, Accuracy: 0.8008892868628594\n",
      "Iter 95 / 2000, Loss: 1532725.079124041, CrossEntropy: 0.020041856914758682, Accuracy: 0.9991682984335039\n",
      "Elapsed time for the training: 58.279860973358154\n",
      "EVALUATION with 100 samples -> Loss: 5319488.0, CrossEntropy: 0.852121889591217, Accuracy: 0.8011401493739003\n",
      "Iter 96 / 2000, Loss: 1520305.2507992326, CrossEntropy: 0.01986638642847538, Accuracy: 0.9991333320012787\n",
      "Elapsed time for the training: 58.23946523666382\n",
      "EVALUATION with 100 samples -> Loss: 5308474.5, CrossEntropy: 0.8499442338943481, Accuracy: 0.8025748775436612\n",
      "Iter 97 / 2000, Loss: 1496997.0132672635, CrossEntropy: 0.019516589120030403, Accuracy: 0.999130834398977\n",
      "Elapsed time for the training: 58.18217706680298\n",
      "EVALUATION with 100 samples -> Loss: 5328012.5, CrossEntropy: 0.8436151146888733, Accuracy: 0.8041033260077923\n",
      "Iter 98 / 2000, Loss: 1435457.903932225, CrossEntropy: 0.018567875027656555, Accuracy: 0.999225743286445\n",
      "Elapsed time for the training: 58.26764988899231\n",
      "EVALUATION with 100 samples -> Loss: 5341057.5, CrossEntropy: 0.8496484756469727, Accuracy: 0.8047242931717354\n",
      "Iter 99 / 2000, Loss: 1437756.6235613811, CrossEntropy: 0.018613066524267197, Accuracy: 0.9992991727941177\n",
      "Elapsed time for the training: 58.29657793045044\n",
      "EVALUATION with 100 samples -> Loss: 5370801.5, CrossEntropy: 0.8523765206336975, Accuracy: 0.80304471415588\n",
      "At iteration 100 we change the dropout rate from 0.1 to 0.2. \n",
      "Iter 100 / 2000, Loss: 1461160.6070971866, CrossEntropy: 0.017187979072332382, Accuracy: 0.9993446291560102\n",
      "Elapsed time for the training: 58.21733617782593\n",
      "EVALUATION with 100 samples -> Loss: 5415849.0, CrossEntropy: 0.856731116771698, Accuracy: 0.8029106790897752\n",
      "Iter 101 / 2000, Loss: 1418558.322730179, CrossEntropy: 0.016527634114027023, Accuracy: 0.9994215553069055\n",
      "Elapsed time for the training: 58.27258110046387\n",
      "EVALUATION with 100 samples -> Loss: 5419063.0, CrossEntropy: 0.8513650298118591, Accuracy: 0.8058846974577848\n",
      "Iter 102 / 2000, Loss: 1336382.9694693096, CrossEntropy: 0.015278236009180546, Accuracy: 0.9995749080882352\n",
      "Elapsed time for the training: 58.21898913383484\n",
      "EVALUATION with 100 samples -> Loss: 5468682.5, CrossEntropy: 0.8603703379631042, Accuracy: 0.8052323166591091\n",
      "Iter 103 / 2000, Loss: 1320416.4864130435, CrossEntropy: 0.015020978637039661, Accuracy: 0.9995704124040921\n",
      "Elapsed time for the training: 58.118234395980835\n",
      "EVALUATION with 100 samples -> Loss: 5459358.5, CrossEntropy: 0.853823721408844, Accuracy: 0.8069509182821334\n",
      "Iter 104 / 2000, Loss: 1253153.1628836317, CrossEntropy: 0.014001544564962387, Accuracy: 0.9996293558184143\n",
      "Elapsed time for the training: 58.26292634010315\n",
      "EVALUATION with 100 samples -> Loss: 5500383.0, CrossEntropy: 0.867034912109375, Accuracy: 0.8047036316650155\n",
      "Iter 105 / 2000, Loss: 1203680.8578964195, CrossEntropy: 0.013237114064395428, Accuracy: 0.999772718190537\n",
      "Elapsed time for the training: 58.09527015686035\n",
      "EVALUATION with 100 samples -> Loss: 5490353.5, CrossEntropy: 0.8628397583961487, Accuracy: 0.804963069975403\n",
      "Iter 106 / 2000, Loss: 1175676.6336317135, CrossEntropy: 0.012830392457544804, Accuracy: 0.9997452445652174\n",
      "Elapsed time for the training: 58.16455960273743\n",
      "EVALUATION with 100 samples -> Loss: 5542053.0, CrossEntropy: 0.8791576623916626, Accuracy: 0.8013275842789068\n",
      "Iter 107 / 2000, Loss: 1133640.5578644502, CrossEntropy: 0.01215438637882471, Accuracy: 0.999785206202046\n",
      "Elapsed time for the training: 58.23659682273865\n",
      "EVALUATION with 100 samples -> Loss: 5565834.5, CrossEntropy: 0.8712854385375977, Accuracy: 0.8055498394080298\n",
      "Iter 108 / 2000, Loss: 1101206.6341112533, CrossEntropy: 0.011699973605573177, Accuracy: 0.9998376558503836\n",
      "Elapsed time for the training: 58.17267608642578\n",
      "EVALUATION with 100 samples -> Loss: 5577997.0, CrossEntropy: 0.8777003288269043, Accuracy: 0.8036428727381644\n",
      "Iter 109 / 2000, Loss: 1074993.8030690537, CrossEntropy: 0.01127791777253151, Accuracy: 0.9998151774296675\n",
      "Elapsed time for the training: 58.17545032501221\n",
      "EVALUATION with 100 samples -> Loss: 5637452.5, CrossEntropy: 0.8823959827423096, Accuracy: 0.8040031640261935\n",
      "Iter 110 / 2000, Loss: 1039908.6040601023, CrossEntropy: 0.010774068534374237, Accuracy: 0.9998551390664961\n",
      "Elapsed time for the training: 58.28384447097778\n",
      "EVALUATION with 100 samples -> Loss: 5665202.0, CrossEntropy: 0.8885799646377563, Accuracy: 0.8032889724531397\n",
      "Iter 111 / 2000, Loss: 1015409.9522058824, CrossEntropy: 0.010396255180239677, Accuracy: 0.9998051870204604\n",
      "Elapsed time for the training: 58.153138875961304\n",
      "EVALUATION with 100 samples -> Loss: 5693410.0, CrossEntropy: 0.9028223752975464, Accuracy: 0.8003955710663179\n",
      "Iter 112 / 2000, Loss: 994827.496483376, CrossEntropy: 0.010099785402417183, Accuracy: 0.9998476462595908\n",
      "Elapsed time for the training: 58.200692892074585\n",
      "EVALUATION with 100 samples -> Loss: 5728731.5, CrossEntropy: 0.9129568934440613, Accuracy: 0.8003889133690929\n",
      "Iter 113 / 2000, Loss: 977081.8863491048, CrossEntropy: 0.009850726462900639, Accuracy: 0.9998826126918159\n",
      "Elapsed time for the training: 58.30773329734802\n",
      "EVALUATION with 100 samples -> Loss: 5745057.0, CrossEntropy: 0.9050006866455078, Accuracy: 0.8003356999328272\n",
      "Iter 114 / 2000, Loss: 939951.2154731458, CrossEntropy: 0.00928160548210144, Accuracy: 0.9999075887148338\n",
      "Elapsed time for the training: 58.3105731010437\n",
      "EVALUATION with 100 samples -> Loss: 5787141.5, CrossEntropy: 0.91242915391922, Accuracy: 0.800922631821782\n",
      "Iter 115 / 2000, Loss: 943630.2225063939, CrossEntropy: 0.009330838918685913, Accuracy: 0.999799192774936\n",
      "Elapsed time for the training: 58.24239373207092\n",
      "EVALUATION with 100 samples -> Loss: 5760921.5, CrossEntropy: 0.9095049500465393, Accuracy: 0.8005839354843701\n",
      "Iter 116 / 2000, Loss: 889240.3104219949, CrossEntropy: 0.008502116426825523, Accuracy: 0.9999725263746803\n",
      "Elapsed time for the training: 58.28271508216858\n",
      "EVALUATION with 100 samples -> Loss: 5789563.5, CrossEntropy: 0.9169051647186279, Accuracy: 0.7999019759990199\n",
      "Iter 117 / 2000, Loss: 884292.1226023018, CrossEntropy: 0.008443369530141354, Accuracy: 0.9999200767263428\n",
      "Elapsed time for the training: 58.26526618003845\n",
      "EVALUATION with 100 samples -> Loss: 5833406.5, CrossEntropy: 0.9131406545639038, Accuracy: 0.8009844662910448\n",
      "Iter 118 / 2000, Loss: 864664.405370844, CrossEntropy: 0.008141028694808483, Accuracy: 0.9999375599424553\n",
      "Elapsed time for the training: 58.16347122192383\n",
      "EVALUATION with 100 samples -> Loss: 5855818.0, CrossEntropy: 0.918221652507782, Accuracy: 0.8022292023634402\n",
      "Iter 119 / 2000, Loss: 848690.0023976982, CrossEntropy: 0.007945921272039413, Accuracy: 0.9998931026214835\n",
      "Elapsed time for the training: 58.3685827255249\n",
      "EVALUATION with 100 samples -> Loss: 5845783.0, CrossEntropy: 0.9332326650619507, Accuracy: 0.8011988715316453\n",
      "Iter 120 / 2000, Loss: 801855.6986892583, CrossEntropy: 0.0072125704027712345, Accuracy: 0.9999650335677749\n",
      "Elapsed time for the training: 58.29919242858887\n",
      "EVALUATION with 100 samples -> Loss: 5828397.0, CrossEntropy: 0.9148891568183899, Accuracy: 0.803500087654574\n",
      "Iter 121 / 2000, Loss: 805295.7576726343, CrossEntropy: 0.007283323910087347, Accuracy: 0.9999150815217391\n",
      "Elapsed time for the training: 58.35199213027954\n",
      "EVALUATION with 100 samples -> Loss: 5852983.0, CrossEntropy: 0.9346002340316772, Accuracy: 0.8028731888910987\n",
      "Iter 122 / 2000, Loss: 788093.9490089514, CrossEntropy: 0.007060373667627573, Accuracy: 0.9999725263746803\n",
      "Elapsed time for the training: 58.378124713897705\n",
      "EVALUATION with 100 samples -> Loss: 5893557.0, CrossEntropy: 0.9313960075378418, Accuracy: 0.8036003480619135\n",
      "Iter 123 / 2000, Loss: 769675.7997122762, CrossEntropy: 0.0067810183390975, Accuracy: 0.9999625359654731\n",
      "Elapsed time for the training: 58.28868889808655\n",
      "EVALUATION with 100 samples -> Loss: 5868639.0, CrossEntropy: 0.9437527656555176, Accuracy: 0.8029810787416198\n",
      "Iter 124 / 2000, Loss: 743120.9154411765, CrossEntropy: 0.006373031996190548, Accuracy: 0.9999450527493606\n",
      "Elapsed time for the training: 58.303338050842285\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EVALUATION with 100 samples -> Loss: 5870103.5, CrossEntropy: 0.9242763519287109, Accuracy: 0.8047210610830137\n",
      "Iter 125 / 2000, Loss: 732704.5353260869, CrossEntropy: 0.006202626507729292, Accuracy: 0.9999675311700768\n",
      "Elapsed time for the training: 58.09262776374817\n",
      "EVALUATION with 100 samples -> Loss: 5901753.5, CrossEntropy: 0.9346365332603455, Accuracy: 0.8052960418378661\n",
      "Iter 126 / 2000, Loss: 724711.9737851663, CrossEntropy: 0.006098444107919931, Accuracy: 0.9999475503516624\n",
      "Elapsed time for the training: 58.20091247558594\n",
      "EVALUATION with 100 samples -> Loss: 5923078.0, CrossEntropy: 0.9368693232536316, Accuracy: 0.8052846905187397\n",
      "Iter 127 / 2000, Loss: 710587.988011509, CrossEntropy: 0.00589488772675395, Accuracy: 0.9999150815217391\n",
      "Elapsed time for the training: 58.179391860961914\n",
      "EVALUATION with 100 samples -> Loss: 5957695.5, CrossEntropy: 0.9367071390151978, Accuracy: 0.8049677015134677\n",
      "Iter 128 / 2000, Loss: 702798.5820012788, CrossEntropy: 0.005784789565950632, Accuracy: 0.9999750239769821\n",
      "Elapsed time for the training: 58.2786169052124\n",
      "EVALUATION with 100 samples -> Loss: 5940599.0, CrossEntropy: 0.9370959401130676, Accuracy: 0.8040995675940863\n",
      "Iter 129 / 2000, Loss: 686529.1878996163, CrossEntropy: 0.0055221267975866795, Accuracy: 0.9999900095907929\n",
      "Elapsed time for the training: 58.37019872665405\n",
      "EVALUATION with 100 samples -> Loss: 5978218.5, CrossEntropy: 0.9405765533447266, Accuracy: 0.8049159818659453\n",
      "Iter 130 / 2000, Loss: 676561.4876118926, CrossEntropy: 0.005417695268988609, Accuracy: 0.9999800191815856\n",
      "Elapsed time for the training: 58.26417851448059\n",
      "EVALUATION with 100 samples -> Loss: 6019146.5, CrossEntropy: 0.9600706100463867, Accuracy: 0.8013466465555301\n",
      "Iter 131 / 2000, Loss: 663519.7690217391, CrossEntropy: 0.005242020357400179, Accuracy: 0.9999950047953964\n",
      "Elapsed time for the training: 58.31755328178406\n",
      "EVALUATION with 100 samples -> Loss: 6028795.5, CrossEntropy: 0.9530194997787476, Accuracy: 0.8055012652661191\n",
      "Iter 132 / 2000, Loss: 654688.9920076727, CrossEntropy: 0.005113269668072462, Accuracy: 0.9999900095907929\n",
      "Elapsed time for the training: 58.38783574104309\n",
      "EVALUATION with 100 samples -> Loss: 6032136.0, CrossEntropy: 0.958609402179718, Accuracy: 0.8036656630249329\n",
      "Iter 133 / 2000, Loss: 650953.6158887468, CrossEntropy: 0.005046550650149584, Accuracy: 0.9999600383631714\n",
      "Elapsed time for the training: 58.145244121551514\n",
      "EVALUATION with 100 samples -> Loss: 6050798.5, CrossEntropy: 0.9679810404777527, Accuracy: 0.804269628392743\n",
      "Iter 134 / 2000, Loss: 643837.8164961637, CrossEntropy: 0.004973701201379299, Accuracy: 0.9999775215792839\n",
      "Elapsed time for the training: 58.22118663787842\n",
      "EVALUATION with 100 samples -> Loss: 6082693.5, CrossEntropy: 0.9536794424057007, Accuracy: 0.8053432382221593\n",
      "Iter 135 / 2000, Loss: 629125.5509111254, CrossEntropy: 0.004757818300276995, Accuracy: 0.9999725263746803\n",
      "Elapsed time for the training: 58.24265956878662\n",
      "EVALUATION with 100 samples -> Loss: 6083133.5, CrossEntropy: 0.9818621277809143, Accuracy: 0.8037720639789784\n",
      "Iter 136 / 2000, Loss: 619057.345508312, CrossEntropy: 0.004616371355950832, Accuracy: 0.9999675311700768\n",
      "Elapsed time for the training: 58.18114495277405\n",
      "EVALUATION with 100 samples -> Loss: 6138068.5, CrossEntropy: 0.9681404232978821, Accuracy: 0.8052222056157505\n",
      "Iter 137 / 2000, Loss: 608246.4706681586, CrossEntropy: 0.004460023250430822, Accuracy: 0.9999900095907929\n",
      "Elapsed time for the training: 58.30461049079895\n",
      "EVALUATION with 100 samples -> Loss: 6172906.5, CrossEntropy: 0.9807880520820618, Accuracy: 0.801335488480311\n",
      "Iter 138 / 2000, Loss: 604387.1771099744, CrossEntropy: 0.004423532634973526, Accuracy: 0.9999775215792839\n",
      "Elapsed time for the training: 58.387452602386475\n",
      "EVALUATION with 100 samples -> Loss: 6109452.0, CrossEntropy: 0.959118127822876, Accuracy: 0.8049395822591031\n",
      "Iter 139 / 2000, Loss: 589303.7898017903, CrossEntropy: 0.004205250181257725, Accuracy: 0.999987511988491\n",
      "Elapsed time for the training: 58.288264989852905\n",
      "EVALUATION with 100 samples -> Loss: 6185073.5, CrossEntropy: 0.9839416146278381, Accuracy: 0.8031494510818783\n",
      "Iter 140 / 2000, Loss: 587931.5721707161, CrossEntropy: 0.004206047393381596, Accuracy: 0.9999950047953964\n",
      "Elapsed time for the training: 58.40473246574402\n",
      "EVALUATION with 100 samples -> Loss: 6209720.5, CrossEntropy: 0.9821699857711792, Accuracy: 0.8043190321494168\n",
      "Iter 141 / 2000, Loss: 578473.3343190537, CrossEntropy: 0.004050405230373144, Accuracy: 0.999987511988491\n",
      "Elapsed time for the training: 58.25361394882202\n",
      "EVALUATION with 100 samples -> Loss: 6177501.5, CrossEntropy: 0.9749683737754822, Accuracy: 0.8036531346869533\n",
      "Iter 142 / 2000, Loss: 570166.9296675192, CrossEntropy: 0.003942063543945551, Accuracy: 0.9999950047953964\n",
      "Elapsed time for the training: 58.410324573516846\n",
      "EVALUATION with 100 samples -> Loss: 6157826.0, CrossEntropy: 0.9717227816581726, Accuracy: 0.8039562670955003\n",
      "Iter 143 / 2000, Loss: 562036.8521419438, CrossEntropy: 0.0038381381891667843, Accuracy: 0.9999925071930946\n",
      "Elapsed time for the training: 58.37204074859619\n",
      "EVALUATION with 100 samples -> Loss: 6179764.5, CrossEntropy: 0.9715122580528259, Accuracy: 0.8049888199894981\n",
      "Iter 144 / 2000, Loss: 559241.9597186701, CrossEntropy: 0.0038319737650454044, Accuracy: 0.9999825167838875\n",
      "Elapsed time for the training: 58.266900300979614\n",
      "EVALUATION with 100 samples -> Loss: 6203209.0, CrossEntropy: 0.9747737050056458, Accuracy: 0.8045121862399907\n",
      "Iter 145 / 2000, Loss: 557933.0710517903, CrossEntropy: 0.0038065221160650253, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.41984438896179\n",
      "EVALUATION with 100 samples -> Loss: 6210144.0, CrossEntropy: 0.9918214082717896, Accuracy: 0.8023356174912329\n",
      "Iter 146 / 2000, Loss: 549520.2991528133, CrossEntropy: 0.0036932933144271374, Accuracy: 0.9999975023976982\n",
      "Elapsed time for the training: 58.156649589538574\n",
      "EVALUATION with 100 samples -> Loss: 6187520.0, CrossEntropy: 0.9767943620681763, Accuracy: 0.804984109223146\n",
      "Iter 147 / 2000, Loss: 538409.1294757033, CrossEntropy: 0.003535233670845628, Accuracy: 0.9999600383631714\n",
      "Elapsed time for the training: 58.291749238967896\n",
      "EVALUATION with 100 samples -> Loss: 6190076.5, CrossEntropy: 0.9780751466751099, Accuracy: 0.8054258843871532\n",
      "Iter 148 / 2000, Loss: 536769.1886988491, CrossEntropy: 0.0035200677812099457, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.240564584732056\n",
      "EVALUATION with 100 samples -> Loss: 6238475.5, CrossEntropy: 0.9815401434898376, Accuracy: 0.8039428948779432\n",
      "Iter 149 / 2000, Loss: 531197.7886828644, CrossEntropy: 0.003439122810959816, Accuracy: 0.9999925071930946\n",
      "Elapsed time for the training: 58.334022521972656\n",
      "EVALUATION with 100 samples -> Loss: 6213067.5, CrossEntropy: 0.974777102470398, Accuracy: 0.804223434996219\n",
      "Iter 150 / 2000, Loss: 526972.8128996163, CrossEntropy: 0.0033987839706242085, Accuracy: 0.9999825167838875\n",
      "Elapsed time for the training: 58.13764953613281\n",
      "EVALUATION with 100 samples -> Loss: 6213638.0, CrossEntropy: 0.9947981238365173, Accuracy: 0.8047723732206904\n",
      "Iter 151 / 2000, Loss: 517703.1035006394, CrossEntropy: 0.0032564587891101837, Accuracy: 0.9999975023976982\n",
      "Elapsed time for the training: 58.19700217247009\n",
      "EVALUATION with 100 samples -> Loss: 6249171.5, CrossEntropy: 0.9953340291976929, Accuracy: 0.803534658918602\n",
      "Iter 152 / 2000, Loss: 512260.03620524297, CrossEntropy: 0.0031930238474160433, Accuracy: 0.999987511988491\n",
      "Elapsed time for the training: 58.20126962661743\n",
      "EVALUATION with 100 samples -> Loss: 6265473.5, CrossEntropy: 1.0042749643325806, Accuracy: 0.8034003307610974\n",
      "Iter 153 / 2000, Loss: 511966.68765984656, CrossEntropy: 0.003204544773325324, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.153828144073486\n",
      "EVALUATION with 100 samples -> Loss: 6261407.0, CrossEntropy: 0.9812919497489929, Accuracy: 0.8058771921066392\n",
      "Iter 154 / 2000, Loss: 503728.4098465473, CrossEntropy: 0.003118797205388546, Accuracy: 0.9999950047953964\n",
      "Elapsed time for the training: 58.272371768951416\n",
      "EVALUATION with 100 samples -> Loss: 6288968.0, CrossEntropy: 1.0014008283615112, Accuracy: 0.8048535590774121\n",
      "Iter 155 / 2000, Loss: 498036.82832480816, CrossEntropy: 0.003010964021086693, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.23477745056152\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EVALUATION with 100 samples -> Loss: 6301143.0, CrossEntropy: 1.0134824514389038, Accuracy: 0.8033978224651909\n",
      "Iter 156 / 2000, Loss: 492929.3479859335, CrossEntropy: 0.0029555431101471186, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.25015950202942\n",
      "EVALUATION with 100 samples -> Loss: 6324526.0, CrossEntropy: 1.0056052207946777, Accuracy: 0.8025873056739855\n",
      "Iter 157 / 2000, Loss: 489559.68534207164, CrossEntropy: 0.002929803216829896, Accuracy: 0.999987511988491\n",
      "Elapsed time for the training: 58.15271496772766\n",
      "EVALUATION with 100 samples -> Loss: 6325698.5, CrossEntropy: 1.0075567960739136, Accuracy: 0.8031111467038202\n",
      "Iter 158 / 2000, Loss: 491151.4404571611, CrossEntropy: 0.0029574341606348753, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.102479696273804\n",
      "EVALUATION with 100 samples -> Loss: 6352975.0, CrossEntropy: 0.9956355094909668, Accuracy: 0.8066288161574765\n",
      "Iter 159 / 2000, Loss: 480465.93829923274, CrossEntropy: 0.0028012802358716726, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.04476761817932\n",
      "EVALUATION with 100 samples -> Loss: 6373744.5, CrossEntropy: 1.01130211353302, Accuracy: 0.8054776234698116\n",
      "Iter 160 / 2000, Loss: 478795.8121003836, CrossEntropy: 0.0027873634826391935, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.06833553314209\n",
      "EVALUATION with 100 samples -> Loss: 6330252.5, CrossEntropy: 0.9915899634361267, Accuracy: 0.8060027424650671\n",
      "Iter 161 / 2000, Loss: 466469.60086317133, CrossEntropy: 0.002611776115372777, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.02913784980774\n",
      "EVALUATION with 100 samples -> Loss: 6389482.0, CrossEntropy: 1.0044498443603516, Accuracy: 0.8040369080645905\n",
      "Iter 162 / 2000, Loss: 469074.6847026854, CrossEntropy: 0.0026893664617091417, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.04024934768677\n",
      "EVALUATION with 100 samples -> Loss: 6354483.5, CrossEntropy: 1.0056593418121338, Accuracy: 0.8052287693531871\n",
      "Iter 163 / 2000, Loss: 468668.2190696931, CrossEntropy: 0.0026764434296637774, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.99895191192627\n",
      "EVALUATION with 100 samples -> Loss: 6422823.5, CrossEntropy: 1.0153765678405762, Accuracy: 0.8043084241259002\n",
      "Iter 164 / 2000, Loss: 464589.39242327365, CrossEntropy: 0.0026459386572241783, Accuracy: 0.9999650335677749\n",
      "Elapsed time for the training: 58.04504108428955\n",
      "EVALUATION with 100 samples -> Loss: 6412827.5, CrossEntropy: 1.0175485610961914, Accuracy: 0.8049362710823728\n",
      "Iter 165 / 2000, Loss: 457954.1768702046, CrossEntropy: 0.00254624430090189, Accuracy: 0.9999975023976982\n",
      "Elapsed time for the training: 57.91082453727722\n",
      "EVALUATION with 100 samples -> Loss: 6383153.0, CrossEntropy: 1.0054415464401245, Accuracy: 0.8032809857272087\n",
      "Iter 166 / 2000, Loss: 454564.94285485934, CrossEntropy: 0.0025148799177259207, Accuracy: 0.9999825167838875\n",
      "Elapsed time for the training: 57.98443555831909\n",
      "EVALUATION with 100 samples -> Loss: 6420802.5, CrossEntropy: 1.020889163017273, Accuracy: 0.8031786751769963\n",
      "Iter 167 / 2000, Loss: 452526.5483535806, CrossEntropy: 0.002496324712410569, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.985978841781616\n",
      "EVALUATION with 100 samples -> Loss: 6449657.0, CrossEntropy: 1.014756679534912, Accuracy: 0.8041968497649127\n",
      "Iter 168 / 2000, Loss: 450096.29571611254, CrossEntropy: 0.0024836785160005093, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.96230864524841\n",
      "EVALUATION with 100 samples -> Loss: 6448066.0, CrossEntropy: 1.0165319442749023, Accuracy: 0.8038741068746946\n",
      "Iter 169 / 2000, Loss: 445203.57376918156, CrossEntropy: 0.0024071845691651106, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.92396330833435\n",
      "EVALUATION with 100 samples -> Loss: 6388286.5, CrossEntropy: 1.0201002359390259, Accuracy: 0.8045442531972561\n",
      "Iter 170 / 2000, Loss: 442402.26039002556, CrossEntropy: 0.0023673917166888714, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.104376554489136\n",
      "EVALUATION with 100 samples -> Loss: 6454503.5, CrossEntropy: 1.011574387550354, Accuracy: 0.8062060722019493\n",
      "Iter 171 / 2000, Loss: 436306.95044757036, CrossEntropy: 0.002306110691279173, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.04249382019043\n",
      "EVALUATION with 100 samples -> Loss: 6453638.0, CrossEntropy: 1.0198830366134644, Accuracy: 0.8048529849800949\n",
      "Iter 172 / 2000, Loss: 433854.7789322251, CrossEntropy: 0.0022814483381807804, Accuracy: 0.9999650335677749\n",
      "Elapsed time for the training: 57.98292875289917\n",
      "EVALUATION with 100 samples -> Loss: 6469287.0, CrossEntropy: 1.0250517129898071, Accuracy: 0.8051817457786777\n",
      "Iter 173 / 2000, Loss: 430605.33479859337, CrossEntropy: 0.0022257196251302958, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.94457125663757\n",
      "EVALUATION with 100 samples -> Loss: 6446065.0, CrossEntropy: 1.0112590789794922, Accuracy: 0.8052933622619893\n",
      "Iter 174 / 2000, Loss: 428495.7953964194, CrossEntropy: 0.002202208386734128, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.98167181015015\n",
      "EVALUATION with 100 samples -> Loss: 6471238.0, CrossEntropy: 1.0175377130508423, Accuracy: 0.8041826107330843\n",
      "Iter 175 / 2000, Loss: 429172.3357576726, CrossEntropy: 0.0022258732933551073, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.000916481018066\n",
      "EVALUATION with 100 samples -> Loss: 6460601.0, CrossEntropy: 1.0248196125030518, Accuracy: 0.8057228700044748\n",
      "Iter 176 / 2000, Loss: 420380.367407289, CrossEntropy: 0.002116455929353833, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.00347852706909\n",
      "EVALUATION with 100 samples -> Loss: 6484474.0, CrossEntropy: 1.0185672044754028, Accuracy: 0.8059656503468133\n",
      "Iter 177 / 2000, Loss: 420960.3469469309, CrossEntropy: 0.0021337110083550215, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.97884011268616\n",
      "EVALUATION with 100 samples -> Loss: 6506395.5, CrossEntropy: 1.019035816192627, Accuracy: 0.805342862457637\n",
      "Iter 178 / 2000, Loss: 421658.15369245526, CrossEntropy: 0.002182208001613617, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.24066400527954\n",
      "EVALUATION with 100 samples -> Loss: 6491900.5, CrossEntropy: 1.0148626565933228, Accuracy: 0.8066387650780571\n",
      "Iter 179 / 2000, Loss: 411224.06257992325, CrossEntropy: 0.0020159354899078608, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.10150504112244\n",
      "EVALUATION with 100 samples -> Loss: 6483844.0, CrossEntropy: 1.0206594467163086, Accuracy: 0.8046640831244248\n",
      "Iter 180 / 2000, Loss: 414537.5837595908, CrossEntropy: 0.0020747585222125053, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.0084068775177\n",
      "EVALUATION with 100 samples -> Loss: 6531915.0, CrossEntropy: 1.035711646080017, Accuracy: 0.8044242607046734\n",
      "Iter 181 / 2000, Loss: 411941.28588554985, CrossEntropy: 0.0020669796504080296, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.045279026031494\n",
      "EVALUATION with 100 samples -> Loss: 6525578.0, CrossEntropy: 1.0216913223266602, Accuracy: 0.8050472797595138\n",
      "Iter 182 / 2000, Loss: 414081.2661445013, CrossEntropy: 0.0021076234988868237, Accuracy: 0.9999800191815856\n",
      "Elapsed time for the training: 58.00981092453003\n",
      "EVALUATION with 100 samples -> Loss: 6556127.5, CrossEntropy: 1.036428689956665, Accuracy: 0.8034724997866232\n",
      "Iter 183 / 2000, Loss: 404010.65241368284, CrossEntropy: 0.0019632054027169943, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.03527212142944\n",
      "EVALUATION with 100 samples -> Loss: 6561818.5, CrossEntropy: 1.0310450792312622, Accuracy: 0.8044735071506591\n",
      "Iter 184 / 2000, Loss: 403662.73617327365, CrossEntropy: 0.0019705782178789377, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.04474925994873\n",
      "EVALUATION with 100 samples -> Loss: 6600349.0, CrossEntropy: 1.0515798330307007, Accuracy: 0.8033193749957941\n",
      "Iter 185 / 2000, Loss: 403494.9671515345, CrossEntropy: 0.0019587199203670025, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.06397366523743\n",
      "EVALUATION with 100 samples -> Loss: 6587193.5, CrossEntropy: 1.0361799001693726, Accuracy: 0.8059399567388154\n",
      "Iter 186 / 2000, Loss: 396136.998241688, CrossEntropy: 0.0018719055224210024, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.99316382408142\n",
      "EVALUATION with 100 samples -> Loss: 6582908.0, CrossEntropy: 1.0316015481948853, Accuracy: 0.8051784520048881\n",
      "Iter 187 / 2000, Loss: 394383.9596387468, CrossEntropy: 0.0018739711958914995, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.007930278778076\n",
      "EVALUATION with 100 samples -> Loss: 6596407.5, CrossEntropy: 1.0662988424301147, Accuracy: 0.8016043586191302\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iter 188 / 2000, Loss: 402239.1055786445, CrossEntropy: 0.001993176294490695, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.98478364944458\n",
      "EVALUATION with 100 samples -> Loss: 6587803.0, CrossEntropy: 1.0508157014846802, Accuracy: 0.8031692066226108\n",
      "Iter 189 / 2000, Loss: 388760.43622122763, CrossEntropy: 0.001806566258892417, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.030871868133545\n",
      "EVALUATION with 100 samples -> Loss: 6609374.0, CrossEntropy: 1.0350042581558228, Accuracy: 0.8047769757822598\n",
      "Iter 190 / 2000, Loss: 388055.95340473147, CrossEntropy: 0.0018121167086064816, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.966866970062256\n",
      "EVALUATION with 100 samples -> Loss: 6597947.0, CrossEntropy: 1.0444400310516357, Accuracy: 0.8024556831646764\n",
      "Iter 191 / 2000, Loss: 387026.15976662404, CrossEntropy: 0.0018113324185833335, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.00096011161804\n",
      "EVALUATION with 100 samples -> Loss: 6626007.0, CrossEntropy: 1.0432097911834717, Accuracy: 0.8042196138277833\n",
      "Iter 192 / 2000, Loss: 384575.0909526854, CrossEntropy: 0.0017658283468335867, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.08261704444885\n",
      "EVALUATION with 100 samples -> Loss: 6632146.5, CrossEntropy: 1.0441311597824097, Accuracy: 0.8051024862094922\n",
      "Iter 193 / 2000, Loss: 380605.64546035806, CrossEntropy: 0.001733199111185968, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.896090030670166\n",
      "EVALUATION with 100 samples -> Loss: 6617268.5, CrossEntropy: 1.0493038892745972, Accuracy: 0.8043453439659012\n",
      "Iter 194 / 2000, Loss: 379022.5387627877, CrossEntropy: 0.0017131794011220336, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.877755641937256\n",
      "EVALUATION with 100 samples -> Loss: 6605934.0, CrossEntropy: 1.0354818105697632, Accuracy: 0.8045954756841851\n",
      "Iter 195 / 2000, Loss: 379527.1815057545, CrossEntropy: 0.0017554672667756677, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.0584602355957\n",
      "EVALUATION with 100 samples -> Loss: 6628103.0, CrossEntropy: 1.0505626201629639, Accuracy: 0.8039488634375718\n",
      "Iter 196 / 2000, Loss: 377182.6811061381, CrossEntropy: 0.0017144223675131798, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.005430459976196\n",
      "EVALUATION with 100 samples -> Loss: 6623571.0, CrossEntropy: 1.0429896116256714, Accuracy: 0.8046822186884335\n",
      "Iter 197 / 2000, Loss: 378700.54347826086, CrossEntropy: 0.00172576738987118, Accuracy: 0.9999975023976982\n",
      "Elapsed time for the training: 57.89257049560547\n",
      "EVALUATION with 100 samples -> Loss: 6644861.5, CrossEntropy: 1.050068974494934, Accuracy: 0.8051113606045235\n",
      "Iter 198 / 2000, Loss: 371444.12116368284, CrossEntropy: 0.0016656770603731275, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.088236570358276\n",
      "EVALUATION with 100 samples -> Loss: 6605068.0, CrossEntropy: 1.0472468137741089, Accuracy: 0.8050057017720228\n",
      "Iter 199 / 2000, Loss: 371226.8854699489, CrossEntropy: 0.001660812646150589, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.85360836982727\n",
      "EVALUATION with 100 samples -> Loss: 6600519.0, CrossEntropy: 1.0565778017044067, Accuracy: 0.8061321269129347\n",
      "At iteration 200 we change the dropout rate from 0.1 to 0.2. \n",
      "Iter 200 / 2000, Loss: 373157.985693734, CrossEntropy: 0.0017104264115914702, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.02727961540222\n",
      "EVALUATION with 100 samples -> Loss: 6645382.0, CrossEntropy: 1.0444597005844116, Accuracy: 0.8035340360942176\n",
      "Iter 201 / 2000, Loss: 365686.32201086957, CrossEntropy: 0.0015970912063494325, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.14426398277283\n",
      "EVALUATION with 100 samples -> Loss: 6589081.5, CrossEntropy: 1.0447100400924683, Accuracy: 0.8034204887420576\n",
      "Iter 202 / 2000, Loss: 361789.58359974425, CrossEntropy: 0.0015390677144750953, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.04989266395569\n",
      "EVALUATION with 100 samples -> Loss: 6684265.0, CrossEntropy: 1.0529792308807373, Accuracy: 0.8028305752582786\n",
      "Iter 203 / 2000, Loss: 361852.155370844, CrossEntropy: 0.0015608756802976131, Accuracy: 0.9999975023976982\n",
      "Elapsed time for the training: 57.96222281455994\n",
      "EVALUATION with 100 samples -> Loss: 6675907.5, CrossEntropy: 1.0551255941390991, Accuracy: 0.8046365326422346\n",
      "Iter 204 / 2000, Loss: 359325.91847826086, CrossEntropy: 0.001533815753646195, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.09160327911377\n",
      "EVALUATION with 100 samples -> Loss: 6655866.5, CrossEntropy: 1.0403814315795898, Accuracy: 0.8068816408313964\n",
      "Iter 205 / 2000, Loss: 359205.6653612532, CrossEntropy: 0.001558370771817863, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.14162516593933\n",
      "EVALUATION with 100 samples -> Loss: 6680234.5, CrossEntropy: 1.049268364906311, Accuracy: 0.8071900293231324\n",
      "Iter 206 / 2000, Loss: 354399.9326246803, CrossEntropy: 0.0014940393157303333, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.151028633117676\n",
      "EVALUATION with 100 samples -> Loss: 6686562.5, CrossEntropy: 1.0671403408050537, Accuracy: 0.802644326805213\n",
      "Iter 207 / 2000, Loss: 357869.5668158568, CrossEntropy: 0.001559381140395999, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.116997957229614\n",
      "EVALUATION with 100 samples -> Loss: 6729217.0, CrossEntropy: 1.0610084533691406, Accuracy: 0.8063935512393339\n",
      "Iter 208 / 2000, Loss: 351601.23417519184, CrossEntropy: 0.0014589541824534535, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.97011637687683\n",
      "EVALUATION with 100 samples -> Loss: 6716422.5, CrossEntropy: 1.0521454811096191, Accuracy: 0.8058276501829543\n",
      "Iter 209 / 2000, Loss: 352792.14585997444, CrossEntropy: 0.0014977921964600682, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.157670736312866\n",
      "EVALUATION with 100 samples -> Loss: 6713623.0, CrossEntropy: 1.055600881576538, Accuracy: 0.8041819314637738\n",
      "Iter 210 / 2000, Loss: 354211.5057544757, CrossEntropy: 0.0015235802857205272, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.02504801750183\n",
      "EVALUATION with 100 samples -> Loss: 6749701.0, CrossEntropy: 1.0694032907485962, Accuracy: 0.8045074529948055\n",
      "Iter 211 / 2000, Loss: 349350.9563618926, CrossEntropy: 0.0014728459063917398, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.12238311767578\n",
      "EVALUATION with 100 samples -> Loss: 6746986.0, CrossEntropy: 1.0647271871566772, Accuracy: 0.8058549603144763\n",
      "Iter 212 / 2000, Loss: 348114.6544117647, CrossEntropy: 0.0014614694518968463, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.126718521118164\n",
      "EVALUATION with 100 samples -> Loss: 6752121.5, CrossEntropy: 1.0700455904006958, Accuracy: 0.8051017668047736\n",
      "Iter 213 / 2000, Loss: 345096.8523817136, CrossEntropy: 0.0014243746409192681, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.09035277366638\n",
      "EVALUATION with 100 samples -> Loss: 6756033.0, CrossEntropy: 1.0727159976959229, Accuracy: 0.8036399792888181\n",
      "Iter 214 / 2000, Loss: 340109.0948689258, CrossEntropy: 0.0013792681274935603, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.08957242965698\n",
      "EVALUATION with 100 samples -> Loss: 6799264.0, CrossEntropy: 1.0753417015075684, Accuracy: 0.8026716317526587\n",
      "Iter 215 / 2000, Loss: 342514.0148657289, CrossEntropy: 0.0014289052924141288, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.222405672073364\n",
      "EVALUATION with 100 samples -> Loss: 6742735.5, CrossEntropy: 1.068447470664978, Accuracy: 0.8041778134270646\n",
      "Iter 216 / 2000, Loss: 342462.4517263427, CrossEntropy: 0.0013997284695506096, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.14935064315796\n",
      "EVALUATION with 100 samples -> Loss: 6776139.5, CrossEntropy: 1.0934884548187256, Accuracy: 0.8035449363556545\n",
      "Iter 217 / 2000, Loss: 335860.8128196931, CrossEntropy: 0.00132756307721138, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.21147584915161\n",
      "EVALUATION with 100 samples -> Loss: 6798356.5, CrossEntropy: 1.063369631767273, Accuracy: 0.8059460526337981\n",
      "Iter 218 / 2000, Loss: 337691.1590473146, CrossEntropy: 0.001379571738652885, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.03275799751282\n",
      "EVALUATION with 100 samples -> Loss: 6787561.5, CrossEntropy: 1.0646177530288696, Accuracy: 0.8057138172423404\n",
      "Iter 219 / 2000, Loss: 336353.9083280051, CrossEntropy: 0.001359975547529757, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.083996057510376\n",
      "EVALUATION with 100 samples -> Loss: 6772682.5, CrossEntropy: 1.069064974784851, Accuracy: 0.8045536610198171\n",
      "Iter 220 / 2000, Loss: 334327.5821611253, CrossEntropy: 0.0013500534696504474, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.049503803253174\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EVALUATION with 100 samples -> Loss: 6773505.0, CrossEntropy: 1.0659531354904175, Accuracy: 0.8040125937706416\n",
      "Iter 221 / 2000, Loss: 334319.46771099744, CrossEntropy: 0.0013464423827826977, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.180800437927246\n",
      "EVALUATION with 100 samples -> Loss: 6761625.0, CrossEntropy: 1.0585005283355713, Accuracy: 0.8062204545825444\n",
      "Iter 222 / 2000, Loss: 330098.49536445015, CrossEntropy: 0.0012978679733350873, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.16869354248047\n",
      "EVALUATION with 100 samples -> Loss: 6791553.0, CrossEntropy: 1.0693504810333252, Accuracy: 0.8037447624780706\n",
      "Iter 223 / 2000, Loss: 332006.38602941175, CrossEntropy: 0.0013414165005087852, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.22975540161133\n",
      "EVALUATION with 100 samples -> Loss: 6767293.0, CrossEntropy: 1.0643994808197021, Accuracy: 0.8049451311955422\n",
      "Iter 224 / 2000, Loss: 327836.9397378517, CrossEntropy: 0.001297847949899733, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.191662311553955\n",
      "EVALUATION with 100 samples -> Loss: 6809792.0, CrossEntropy: 1.07853364944458, Accuracy: 0.8043240485410831\n",
      "Iter 225 / 2000, Loss: 327700.68462276214, CrossEntropy: 0.0012928226497024298, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.141953468322754\n",
      "EVALUATION with 100 samples -> Loss: 6804619.5, CrossEntropy: 1.0722395181655884, Accuracy: 0.8052243704407523\n",
      "Iter 226 / 2000, Loss: 324172.7273817136, CrossEntropy: 0.0012630937853828073, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.22023844718933\n",
      "EVALUATION with 100 samples -> Loss: 6810478.5, CrossEntropy: 1.078668475151062, Accuracy: 0.8043481441883414\n",
      "Iter 227 / 2000, Loss: 324623.72138746805, CrossEntropy: 0.0012634806334972382, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.15612006187439\n",
      "EVALUATION with 100 samples -> Loss: 6820806.5, CrossEntropy: 1.0775963068008423, Accuracy: 0.8050687223171958\n",
      "Iter 228 / 2000, Loss: 321816.5394820972, CrossEntropy: 0.0012356103397905827, Accuracy: 0.9999975023976982\n",
      "Elapsed time for the training: 58.21077275276184\n",
      "EVALUATION with 100 samples -> Loss: 6821073.0, CrossEntropy: 1.0738097429275513, Accuracy: 0.8036762411223165\n",
      "Iter 229 / 2000, Loss: 320398.9965233376, CrossEntropy: 0.0012337234802544117, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.103798389434814\n",
      "EVALUATION with 100 samples -> Loss: 6819409.0, CrossEntropy: 1.074716567993164, Accuracy: 0.8048097131687961\n",
      "Iter 230 / 2000, Loss: 318526.7597506394, CrossEntropy: 0.0012152782874181867, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.18750023841858\n",
      "EVALUATION with 100 samples -> Loss: 6830585.0, CrossEntropy: 1.0721970796585083, Accuracy: 0.8045155096785915\n",
      "Iter 231 / 2000, Loss: 318340.9140025575, CrossEntropy: 0.0012187882093712687, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.10061311721802\n",
      "EVALUATION with 100 samples -> Loss: 6839185.0, CrossEntropy: 1.0944952964782715, Accuracy: 0.8039924040019203\n",
      "Iter 232 / 2000, Loss: 315986.6260789642, CrossEntropy: 0.0011848508147522807, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.20981049537659\n",
      "EVALUATION with 100 samples -> Loss: 6835481.5, CrossEntropy: 1.0790374279022217, Accuracy: 0.8063039183721834\n",
      "Iter 233 / 2000, Loss: 314940.4112452046, CrossEntropy: 0.0011933878995478153, Accuracy: 1.0\n",
      "Elapsed time for the training: 58.08153176307678\n",
      "EVALUATION with 100 samples -> Loss: 6884776.0, CrossEntropy: 1.0823426246643066, Accuracy: 0.8063032281064829\n",
      "Iter 234 / 2000, Loss: 313406.73749200767, CrossEntropy: 0.0011738610919564962, Accuracy: 1.0\n",
      "Elapsed time for the training: 57.99770545959473\n",
      "EVALUATION with 100 samples -> Loss: 6829668.5, CrossEntropy: 1.077117681503296, Accuracy: 0.8048122876916963\n",
      "Iter 235 / 2000, Loss: 315991.306226023, CrossEntropy: 0.0012274114415049553, Accuracy: 0.9999700287723785\n",
      "Elapsed time for the training: 58.033692836761475\n",
      "EVALUATION with 100 samples -> Loss: 6868456.5, CrossEntropy: 1.087192416191101, Accuracy: 0.8043141636874525\n",
      "Iter 236 / 2000, Loss: 315692.27357736573, CrossEntropy: 0.001213815063238144, Accuracy: 0.9999950047953964\n",
      "Elapsed time for the training: 58.15720748901367\n",
      "EVALUATION with 100 samples -> Loss: 6886995.0, CrossEntropy: 1.0848220586776733, Accuracy: 0.8058823699920591\n"
     ]
    }
   ],
   "source": [
    "#for lvi\n",
    "\n",
    "from datetime import datetime\n",
    "import time\n",
    "\n",
    "num_epochs = 2000\n",
    "criterion = torch.nn.CrossEntropyLoss()  # loss function\n",
    "total_acc = []\n",
    "# start = time.time()\n",
    "for i in range(num_epochs):\n",
    "    losses = []\n",
    "    cross_losses = []\n",
    "    accuracy = []\n",
    "    \n",
    "    if (i+1) % 100 == 0:\n",
    "        print(\"At iteration %d we change the dropout rate from %.1f to %.1f. \" %(i+1, lvi_model.dropout_prob, lvi_model.dropout_prob+0.1))\n",
    "        lvi_model.change_dropout_rate(lvi_model.dropout_prob+0.1)\n",
    "    \n",
    "    start = time.time()\n",
    "    \n",
    "    for images, labels in trainloader:\n",
    "        inner_cross_losses = []\n",
    "        inner_accuracy = []\n",
    "        \n",
    "        # Flatten MNIST images into a 784 long vector\n",
    "#         images = images.view(images.shape[0], -1)\n",
    "        \n",
    "        if i < 1:\n",
    "            loss, y_pred,_ = lvi_model.training_step(\n",
    "                batch=(images, labels),\n",
    "                N=N,\n",
    "                deterministic_weights=True,\n",
    "                vi_batch_size=1,\n",
    "            )\n",
    "            losses.append(loss)\n",
    "        \n",
    "            cross_loss = criterion(y_pred.squeeze(0), labels)\n",
    "            cross_losses.append(cross_loss)\n",
    "            accuracy.append((torch.max(y_pred.squeeze(0),-1).indices == labels).sum().item() / labels.size(0))\n",
    "            \n",
    "        else:\n",
    "            loss, y_pred,_ = lvi_model.training_step(\n",
    "                batch=(images, labels),\n",
    "                N=N,\n",
    "                deterministic_weights=False,\n",
    "                vi_batch_size=8,\n",
    "            ) \n",
    "\n",
    "            losses.append(loss)\n",
    "\n",
    "            for j in range(y_pred.shape[0]):\n",
    "                cross_loss = criterion(y_pred.squeeze(0)[j], labels)\n",
    "                inner_cross_losses.append(cross_loss)\n",
    "                inner_accuracy.append((torch.max(y_pred.squeeze(0)[j],-1).indices == labels).sum().item() / labels.size(0))\n",
    "\n",
    "            accuracy.append(sum(inner_accuracy)/len(inner_accuracy))\n",
    "            cross_losses.append(sum(inner_cross_losses)/len(inner_cross_losses))\n",
    "\n",
    "#     if (i+1) % 10**math.floor(math.log10(i+1)) == 0:  # True when i+1 \\in {1, 2, ..., 10, 20, ..., 100, 200, ..., 1000, 2000, ...}\n",
    "    print(\"Iter {} / {}, Loss: {}, CrossEntropy: {}, Accuracy: {}\".format(i+1, num_epochs, sum(losses)/len(losses), sum(cross_losses)/len(cross_losses), sum(accuracy)/len(accuracy)))\n",
    "\n",
    "    end = time.time()\n",
    "    print('Elapsed time for the training:', end - start)\n",
    "    \n",
    "    tmp_acc = evaluation(lvi_model, testloader)\n",
    "    total_acc.append(tmp_acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "chains = []\n",
    "for name, chain in lvi_model.get_chains().items():\n",
    "    chains.append(chain.view(chain.shape[0], -1).detach().cpu())\n",
    "w_all = torch.cat(chains, dim=-1).numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# torch.save(lvi_model.state_dict(), \"./cnn_svhn_non_lvi.pt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "plt.plot(range(len(mses)), mses)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lvi_model.use_dropout, lvi_model.num_samples_per_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "sns.set_style(\"white\")\n",
    "\n",
    "#to_plot = sgld_model.tensor_dict.W_2.theta_chains.view(sgld_model.chain_length, -1).cpu().detach().numpy()\n",
    "#to_plot = lvi_model.tensor_dict.W_2.theta_chains.view(lvi_model.chain_length, -1).cpu().detach().numpy()\n",
    "to_plot = lvi_model.get_chains()[\"W_0\"].squeeze().cpu().detach().numpy()\n",
    "\n",
    "g = sns.PairGrid(pd.DataFrame(to_plot))\n",
    "g.map_diag(plt.hist, bins=100)\n",
    "\n",
    "def pairgrid_heatmap(x, y, **kws):\n",
    "    cmap = sns.light_palette(kws.pop(\"color\"), as_cmap=True)\n",
    "    plt.hist2d(x, y, cmap=cmap, cmin=1, **kws)\n",
    "\n",
    "g.map_offdiag(pairgrid_heatmap, bins=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "shaping = lvi_model.get_chains()[\"W_0\"].squeeze().cpu().detach().numpy().shape[0]\n",
    "w = pd.DataFrame(lvi_model.get_chains()[\"W_0\"].squeeze().cpu().detach().view(shaping, -1).numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "w.to_csv('non_lvi_drop_10_batch_500.csv',header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
