{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "75858edf-817e-45d3-b9af-51123484dbd2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
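    "# Optional one-time setup: uncomment to install the dependencies into the kernel's environment.\n",
    "# %pip install torch torchvision torchaudio\n",
    "# %pip install pytorch-lightning\n",
    "# %pip install scikit-learn"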
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2b32bfa2-58bd-4e4a-ae99-8d428bf92674",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The available device is : cuda:0\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "from torch import nn \n",
    "from torchvision.datasets import FashionMNIST, MNIST, EMNIST, CIFAR100\n",
    "from torchvision import transforms\n",
    "from pytorch_lightning import Trainer\n",
    "from torch.utils.data import TensorDataset, DataLoader\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "import pytorch_lightning as pl\n",
    "from models import *\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from scipy.optimize import minimize\n",
    "from joblib import Parallel, delayed\n",
    "from tqdm import tqdm\n",
    "from collections import defaultdict\n",
    "\n",
    "\n",
    "import matplotlib as mpl\n",
    "\n",
    "# Figure styling: start from matplotlib's \"classic\" sheet, then apply compact overrides in one call.\n",
    "mpl.style.use(\"classic\")\n",
    "mpl.rcParams.update({\n",
    "    # small figures on a white background\n",
    "    \"figure.figsize\": [5, 3],\n",
    "    \"figure.facecolor\": \"w\",\n",
    "    # thin strokes for axes, grid, lines and patches\n",
    "    \"axes.linewidth\": 0.75,\n",
    "    \"grid.linewidth\": 0.75,\n",
    "    \"lines.linewidth\": 0.75,\n",
    "    \"patch.linewidth\": 0.75,\n",
    "    # short major ticks\n",
    "    \"xtick.major.size\": 3,\n",
    "    \"ytick.major.size\": 3,\n",
    "    # Type 42 (TrueType) fonts in PDF/PS exports\n",
    "    \"pdf.fonttype\": 42,\n",
    "    \"ps.fonttype\": 42,\n",
    "    # font sizes\n",
    "    \"font.size\": 9,\n",
    "    \"axes.titlesize\": \"medium\",\n",
    "    \"legend.fontsize\": \"medium\",\n",
    "})\n",
    "\n",
    "\n",
    "import os\n",
    "import warnings\n",
    "\n",
    "# NOTE(review): blanket filter -- this hides every warning, including deprecation notices.\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "\n",
    "# Reproducibility: seed_everything seeds Python's `random`, NumPy and PyTorch together.\n",
    "seed = 0\n",
    "pl.seed_everything(seed)\n",
    "\n",
    "# Use the first CUDA device when one is available, otherwise fall back to the CPU.\n",
    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
    "print('The available device is :', device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d1942cb8-c098-4c10-bf9d-d4aa438b3a16",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Registry mapping a dataset name to the torchvision dataset class that loads it.\n",
    "dict_datasets = dict(MNIST=MNIST, FashionMNIST=FashionMNIST, EMNIST=EMNIST)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "be30e9be-b9f2-4d91-8d45-3540d8630c4d",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset used is : EMNIST\n"
     ]
    }
   ],
   "source": [
    "# Name of the benchmark to run; it must be one of the keys of `dict_datasets`.\n",
    "# NOTE(review): the outputs saved in this notebook were produced with name = 'EMNIST'\n",
    "# (47 actions, 112800 training rows); re-run all cells after changing `name` so that\n",
    "# code and outputs agree.\n",
    "name = 'MNIST'\n",
    "if name not in dict_datasets:\n",
    "    # Fail early with the list of valid choices instead of a bare KeyError.\n",
    "    raise KeyError(f\"unknown dataset {name!r}; choose one of {sorted(dict_datasets)}\")\n",
    "print('dataset used is :', name)\n",
    "dataset = dict_datasets[name]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "833519a4-b3b8-484a-ae6f-7eb3a612838f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Only EMNIST is given a `split` argument (its 'balanced' split).\n",
    "split_kwargs = {'split': 'balanced'} if name == 'EMNIST' else {}\n",
    "mnist_train = dataset(os.getcwd(), train=True, download=True, transform=transforms.ToTensor(), **split_kwargs)\n",
    "mnist_test = dataset(os.getcwd(), train=False, download=True, transform=transforms.ToTensor(), **split_kwargs)\n",
    "\n",
    "\n",
    "def _flatten_and_normalize(images):\n",
    "    \"\"\"Return `images` as float rows of unit L2 norm, one flattened row per image.\n",
    "\n",
    "    Rows whose norm is zero stay at zero instead of turning into NaN, because\n",
    "    `normalize` clamps the denominator at a small epsilon before dividing.\n",
    "    \"\"\"\n",
    "    flat = torch.as_tensor(images).float().reshape(len(images), -1)\n",
    "    return torch.nn.functional.normalize(flat, p=2, dim=-1)\n",
    "\n",
    "\n",
    "# Contexts are the flattened, rescaled images; labels are copied so the dataset objects stay untouched.\n",
    "X_train, Y_train = _flatten_and_normalize(mnist_train.data), torch.as_tensor(mnist_train.targets).clone()\n",
    "X_test, Y_test = _flatten_and_normalize(mnist_test.data), torch.as_tensor(mnist_test.targets).clone()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8a7168fd-dcc4-4372-95f8-5cf3fbc3cd88",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train : dimension of X is : torch.Size([112800, 784]) dimension of Y is : 112800\n",
      "Test : dimension of X_test is : torch.Size([18800, 784]) dimension of Y is : 18800\n",
      "num_actions:  47\n"
     ]
    }
   ],
   "source": [
    "# data subsampling to learn a logging Policy\n",
    "\n",
    "x0, X_log, y0, Y_log = train_test_split(X_train, Y_train, train_size = 0.05)\n",
    "\n",
    "N = len(X_log)\n",
    "N_test = len(X_test)\n",
    "\n",
    "print('Train : dimension of X is :', X_train.shape, 'dimension of Y is :', len(X_train))\n",
    "print('Test : dimension of X_test is :', X_test.shape, 'dimension of Y is :', N_test)\n",
    "context_dim = X_log.shape[1]\n",
    "num_actions = len(np.unique(Y_log))\n",
    "print('num_actions: ', num_actions)\n",
    "\n",
    "subsample_pt = TensorDataset(x0, y0)\n",
    "subsample_dataloader = DataLoader(subsample_pt, batch_size=128, shuffle=True)\n",
    "\n",
    "# create the logging split\n",
    "logging_split = TensorDataset(X_log, Y_log)\n",
    "logging_split_dataloader = DataLoader(logging_split, batch_size=128, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5c26c6d8-c41e-4afb-a795-921b02621567",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ad6f923bdeb0450fb080ab245161e0fd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Training a logging policy\n",
    "# Inverse temperature parameter (the higher eta the better the performance of the logging policy)\n",
    "etas = np.round(np.linspace(0, 1, 10), 2)\n",
    "\n",
    "dict_results = defaultdict(list)\n",
    "epochs_logging = 10  # epochs used to fit the logging policy below\n",
    "epochs = 20  # not used in this cell -- presumably the epoch budget of later training runs (confirm)\n",
    "\n",
    "logging_policy = SupervisedPolicy(n_actions=num_actions, context_dim=context_dim, softmax = True, reg=1e-6, device = device)\n",
    "# Request a GPU only when `device` is CUDA: a hard-coded gpus=1 fails on CPU-only machines\n",
    "# even though `device` already falls back to the CPU.\n",
    "n_gpus = 1 if device.type == 'cuda' else 0\n",
    "trainer = Trainer(max_epochs=epochs_logging, gpus=n_gpus, checkpoint_callback=False, weights_summary=None, logger=None)\n",
    "trainer.fit(logging_policy, subsample_dataloader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3d69660f-8266-4496-82d2-f60b3a62dea2",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eta parameter :  0.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2798.08it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.021276595072543366\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 571.28it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.021276595070958138\n",
      "min 0.021276595070958138\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f68ccc076a30400fb19d9d79133e70d7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.09it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.025437003513599964\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e96ba3a188b24a32abcead99029c0e50",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.20it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.021878887668569037\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e1d295c8e49497bbd5f75055123652e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 44.87it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.02142250509972268\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f2bec62390334fe0a442601d7738d323",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 87.43it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.021312499617008453\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "070a70265e5640bdace32ca02f3dd3ce",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.07it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.13859928785486425\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "04cd62aff72848b7b2e1b4be119739c8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 95.42it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.024251130755911482\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f41d50609d1d454d852abc272791c12b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.23it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.022573957671510412\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a9d62eb17b2243c589974a8d388fef39",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.63it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.02147309272847277\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0], 'logging_reward': [0.021276595072543366], 'ours, gaussian': [0.025437003513599964], 'ours, mixed-logit': [0.021878887668569037], 'london, gaussian': [0.02142250509972268], 'london, mixed-logit': [0.021312499617008453], 'sakhi1, gaussian': [0.13859928785486425], 'sakhi1, mixed-logit': [0.024251130755911482], 'sakhi2, gaussian': [0.022573957671510412], 'sakhi2, mixed-logit': [0.02147309272847277]})\n",
      "eta parameter :  0.11\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2758.21it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.05399596183858019\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 606.26it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.10748770833015442\n",
      "min 0.002750969026237726\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7c307fdf66664348a1733438135a7715",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.66it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.08116453873350266\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "162e63d26f5941b78ea7abee82d51eb6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.06it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.041966592250986305\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d3d61b32135e43efa13d2f736b13807c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.18it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.11683310336255012\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9bf871f0beae40a5820ab6bedf56f986",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.13it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.054623325048608985\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "60379cc2d3374187ba51c5917e3a9b3a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.53it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.3059420185900749\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "deef9e853ac646fab5ecc3733236c7ce",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.34it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.19140858112497533\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b0f914d3102e40f2ada83c482f29e580",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.87it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.15966477262212875\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "466a04f3d0f4452283651cb0857ee98f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.71it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.05796380212966432\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11], 'logging_reward': [0.021276595072543366, 0.05399596183858019], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305], 'london, gaussian': [0.02142250509972268, 0.11683310336255012], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432]})\n",
      "eta parameter :  0.22\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2896.20it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.11383750940891023\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 597.79it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.3338525593280792\n",
      "min 0.000311591342324391\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "47aac353dbcd47beb8d41f768b13bb29",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.94it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.10488492514224762\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "696d9fb14a9c491bb88a075451035884",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.29it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.07576388414869917\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e560658bf8aa4ad69ca226e5ef12b512",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.72it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.24720970630645753\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "351367f5c26c4afe927eb2afd2805583",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.29it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.11627728157855095\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "555b811ab4a848b0a65b51369efe78aa",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 43.70it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.3463822172043171\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "81ff36371ee84a02b2a17de103cd5a56",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.30it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.241775030988328\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "159e3731e50740d0a26d6447b381c5a7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 43.47it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.2906203171547423\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "971dbc0f6f4f4cf6b95ea153ea71d5a9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 96.14it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.13371665594425608\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 0.13371665594425608]})\n",
      "eta parameter :  0.33\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2844.12it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.19573704922452886\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 655.62it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.6145292520523071\n",
      "min 2.795837826852221e-05\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d73ebfab051c4672bb4b4eed671a9c66",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.05it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.5204365821594887\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "57720e1a21644f9c92f4427b71c9323f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.16it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.1502942757403597\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "754b9118fe94450290da6c1bc293a5e6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.12it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.35098636059050864\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a954e39354a1437d8d8b6db2cb965046",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.24it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.20099180272284975\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "61d5a1c1e3f64baebb53c9725863c2c9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 47.74it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.39708701965656684\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b369e2d8f14542bba91b2f1ee6d5d376",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 82.13it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.27554483231077803\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fb9177bcad6a40d2be6ef12fcab821a2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.11it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.3819904073755792\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5afaf272c73c4b77bea2e5cbdf6d497e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 98.89it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.22882444148368022\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22, 0.33], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023, 0.19573704922452886], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762, 0.5204365821594887], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917, 0.1502942757403597], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753, 0.35098636059050864], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095, 0.20099180272284975], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171, 0.39708701965656684], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328, 0.27554483231077803], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423, 0.3819904073755792], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 0.13371665594425608, 0.22882444148368022]})\n",
      "eta parameter :  0.44\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2950.58it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.2802060965274243\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 661.50it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.8056713938713074\n",
      "min 2.0764407508977456e-06\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2f7d1886d4fa4a5f815db3c13a6f558a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.97it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.535456029810804\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5a540dbc1bd04ce3bacbced661702b8c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 98.78it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.2481902222937726\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "050e4230ffda408c8684cdcb6091b204",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.35it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.4231144736675506\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4a0b8c35bfd54f979214d28cd195e72e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.71it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.2855892888535845\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1f4c5aef0c1d425bbcb138b060079d32",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.88it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.4502077628196554\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ce63f2d56d4e4de7b318ed86dc5782e3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 88.82it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.32917025789301446\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e2fa43eb3a25439692a4c0d3337811c3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.74it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.447867104347716\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0e2a9d4f37c34a8e983a8fa521a186d6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.31it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.3136669290826676\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22, 0.33, 0.44], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023, 0.19573704922452886, 0.2802060965274243], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762, 0.5204365821594887, 0.535456029810804], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917, 0.1502942757403597, 0.2481902222937726], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753, 0.35098636059050864, 0.4231144736675506], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095, 0.20099180272284975, 0.2855892888535845], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171, 0.39708701965656684, 0.4502077628196554], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328, 0.27554483231077803, 0.32917025789301446], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423, 0.3819904073755792, 0.447867104347716], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 0.13371665594425608, 0.22882444148368022, 0.3136669290826676]})\n",
      "eta parameter :  0.56\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2926.79it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.35780007382656664\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 642.93it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.9108126759529114\n",
      "min 1.0573683084658114e-07\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1fd673b1885444fd8e469e1186cbce0d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.24it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.5527785167288273\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c10e8ae33df54412a09b3f356c209413",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.46it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.5204665646654494\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "681ff91f9a044526b2de668c0323d59d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.93it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.47439444278148896\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c68b29c0ab7b46c4a72dea4cafae6416",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.08it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.36279718013519935\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b4c5e85226ca43c1a3ccda3b270036f4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.93it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.4928121879252982\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8cf30bc331af43f0a4b2f53265ed73a2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.51it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.3882931591602082\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6d6f5ca7a26c476797ce6232ee90de48",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.07it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.49427708199683656\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "afb49893086e4bc8a0fb6ac867c37286",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 85.09it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.38513455127147916\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22, 0.33, 0.44, 0.56], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023, 0.19573704922452886, 0.2802060965274243, 0.35780007382656664], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762, 0.5204365821594887, 0.535456029810804, 0.5527785167288273], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917, 0.1502942757403597, 0.2481902222937726, 0.5204665646654494], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753, 0.35098636059050864, 0.4231144736675506, 0.47439444278148896], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095, 0.20099180272284975, 0.2855892888535845, 0.36279718013519935], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171, 0.39708701965656684, 0.4502077628196554, 0.4928121879252982], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328, 0.27554483231077803, 0.32917025789301446, 0.3882931591602082], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423, 0.3819904073755792, 0.447867104347716, 0.49427708199683656], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 0.13371665594425608, 0.22882444148368022, 0.3136669290826676, 0.38513455127147916]})\n",
      "eta parameter :  0.67\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2938.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.41177391579810607\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 822.15it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.9549104571342468\n",
      "min 6.63556676272492e-09\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6d64b8bc8d1a4eb3932edd9ebca48ebe",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.90it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.5560591959446034\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9ce816f7fba940f4ab71839bcca7680d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.00it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.5304813033976453\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "194d86187333415e8df54a3e1e50534a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.91it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.5046713200021298\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3eee58b02aaf422e90adf6bd88ef4e6f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 89.70it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.4155505156009755\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a31096a1ded24421a96dd782119a9295",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.62it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.518886721590732\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a20fe4cff7424ffebe53c841b30eb969",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.63it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.43274993896484376\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dd55eb4f8c9644c8a3c84be90a569834",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.84it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.5196405668461577\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3835f83ff4a445b88c5408711b1f13b5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 86.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.43334786415100096\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22, 0.33, 0.44, 0.56, 0.67], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023, 0.19573704922452886, 0.2802060965274243, 0.35780007382656664, 0.41177391579810607], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762, 0.5204365821594887, 0.535456029810804, 0.5527785167288273, 0.5560591959446034], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917, 0.1502942757403597, 0.2481902222937726, 0.5204665646654494, 0.5304813033976453], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753, 0.35098636059050864, 0.4231144736675506, 0.47439444278148896, 0.5046713200021298], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095, 0.20099180272284975, 0.2855892888535845, 0.36279718013519935, 0.4155505156009755], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171, 0.39708701965656684, 0.4502077628196554, 0.4928121879252982, 0.518886721590732], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328, 0.27554483231077803, 0.32917025789301446, 0.3882931591602082, 0.43274993896484376], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423, 0.3819904073755792, 0.447867104347716, 0.49427708199683656, 0.5196405668461577], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 0.13371665594425608, 0.22882444148368022, 0.3136669290826676, 0.38513455127147916, 0.43334786415100096]})\n",
      "eta parameter :  0.78\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2916.27it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.4516965019956548\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 820.28it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.9764773845672607\n",
      "min 4.1661610272747396e-10\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "98bf7c3f974444abaaf68432c7e7f36a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.25it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.5595704981621276\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fe9554354a7342d085d6024aa697430b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.66it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.5339105261133072\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5a5ce1aec7ba454bb23a5a4daca6230b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.19it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.5250325491073283\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6aa09b991a0149d580d1e332c8b28f73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 89.99it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.45494402073799295\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f6cdcc94a2c7497b86e81192a24c8e31",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 44.86it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.5382738395447426\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9d88f6812ce44d8b85483b3b79973bbd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.86it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.46888084634821464\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "75201f6336374f0bb34691a5d709dbbc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.59it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.5404147921217248\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8132182ad3dd4f06beed665b7308b05e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.46986117362976076\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22, 0.33, 0.44, 0.56, 0.67, 0.78], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023, 0.19573704922452886, 0.2802060965274243, 0.35780007382656664, 0.41177391579810607, 0.4516965019956548], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762, 0.5204365821594887, 0.535456029810804, 0.5527785167288273, 0.5560591959446034, 0.5595704981621276], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917, 0.1502942757403597, 0.2481902222937726, 0.5204665646654494, 0.5304813033976453, 0.5339105261133072], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753, 0.35098636059050864, 0.4231144736675506, 0.47439444278148896, 0.5046713200021298, 0.5250325491073283], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095, 0.20099180272284975, 0.2855892888535845, 0.36279718013519935, 0.4155505156009755, 0.45494402073799295], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171, 0.39708701965656684, 0.4502077628196554, 0.4928121879252982, 0.518886721590732, 0.5382738395447426], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328, 0.27554483231077803, 0.32917025789301446, 0.3882931591602082, 0.43274993896484376, 0.46888084634821464], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423, 0.3819904073755792, 0.447867104347716, 0.49427708199683656, 0.5196405668461577, 0.5404147921217248], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 0.13371665594425608, 0.22882444148368022, 0.3136669290826676, 0.38513455127147916, 0.43334786415100096, 0.46986117362976076]})\n",
      "eta parameter :  0.89\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2847.09it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.4810397967886417\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 650.04it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.9872491955757141\n",
      "min 2.6052769316886604e-11\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "670e19b1106c466d854bc94abbf9f277",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.85it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.5654494165866933\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7f37d049d4484197bb410b18e29bd1fc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 98.75it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.5363754982643939\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f83232cc703246da896d31b35caf888d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.75it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.5399714303523936\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3699a49e2f0e43af90e213b1c64c229a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.17it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.4838527024045904\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c43cd4c11ab94a958d77b78578553ae7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.83it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.5517038724777547\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8bcc2a8815e140adb9dda4fdd386e553",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 98.50it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.4957585089257423\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0c90a317d6dc4494a1761960e1b6e6d2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.82it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.5545369763069965\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "be54a670756145c298a1163d6a0cd34f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 84.57it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.49787267908136895\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22, 0.33, 0.44, 0.56, 0.67, 0.78, 0.89], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023, 0.19573704922452886, 0.2802060965274243, 0.35780007382656664, 0.41177391579810607, 0.4516965019956548, 0.4810397967886417], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762, 0.5204365821594887, 0.535456029810804, 0.5527785167288273, 0.5560591959446034, 0.5595704981621276, 0.5654494165866933], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917, 0.1502942757403597, 0.2481902222937726, 0.5204665646654494, 0.5304813033976453, 0.5339105261133072, 0.5363754982643939], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753, 0.35098636059050864, 0.4231144736675506, 0.47439444278148896, 0.5046713200021298, 0.5250325491073283, 0.5399714303523936], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095, 0.20099180272284975, 0.2855892888535845, 0.36279718013519935, 0.4155505156009755, 0.45494402073799295, 0.4838527024045904], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171, 0.39708701965656684, 0.4502077628196554, 0.4928121879252982, 0.518886721590732, 0.5382738395447426, 0.5517038724777547], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328, 0.27554483231077803, 0.32917025789301446, 0.3882931591602082, 0.43274993896484376, 0.46888084634821464, 0.4957585089257423], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423, 0.3819904073755792, 0.447867104347716, 0.49427708199683656, 0.5196405668461577, 0.5404147921217248, 0.5545369763069965], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 0.13371665594425608, 0.22882444148368022, 0.3136669290826676, 0.38513455127147916, 0.43334786415100096, 0.46986117362976076, 0.49787267908136895]})\n",
      "eta parameter :  1.0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:00, 2896.73it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The reward of the logging policy:  0.5027762337948414\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 838/838 [00:01<00:00, 673.32it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max 0.9928460717201233\n",
      "min 1.639082207634257e-12\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "43a35c7932b8475fbd15ddc7402347df",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.85it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with Gaussian policies after training  : 0.5727778747233939\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a74a53dd13fe4be28c8c7d8318506216",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 83.83it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Ours with MixedLogit policies after training  : 0.5405956093808438\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "269b179b1efc49cbaa3e6885465d331b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.10it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with Gaussian policies after training  : 0.5507168138788101\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "86e01815c45f4dcca0ab988046775576",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 88.69it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of London with MixedLogit policies after training  : 0.5055363320289774\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cb705f0595ca4f8ea934299a00c3287e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 46.24it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with Gaussian policies after training  : 0.5614986099080836\n",
      "################################################################################\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dbb1639a311740c5a239f5133dcbf1a6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 99.01it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 1 with MixedLogit policies after training  : 0.5150394492453717\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "063a0c805aaa459ea324584abe765566",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:03, 45.45it/s]\n",
      "GPU available: True, used: True\n",
      "TPU available: False, using: 0 TPU cores\n",
      "IPU available: False, using: 0 IPUs\n",
      "LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with Gaussian policies after training  : 0.5634863958967493\n",
      "################################################################################\n",
      "S_lmbd is defined by these two bounds 0.000786160201153927 0.11054060047948723\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8810c0c75a324ef9898ff11ed3a3a7ec",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: 0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "147it [00:01, 91.09it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reward of Sakhi et al. 2 with MixedLogit policies after training  : 0.5179282644961742\n",
      "################################################################################\n",
      "defaultdict(<class 'list'>, {'eta': [0.0, 0.11, 0.22, 0.33, 0.44, 0.56, 0.67, 0.78, 0.89, 1.0], 'logging_reward': [0.021276595072543366, 0.05399596183858019, 0.11383750940891023, 0.19573704922452886, 0.2802060965274243, 0.35780007382656664, 0.41177391579810607, 0.4516965019956548, 0.4810397967886417, 0.5027762337948414], 'ours, gaussian': [0.025437003513599964, 0.08116453873350266, 0.10488492514224762, 0.5204365821594887, 0.535456029810804, 0.5527785167288273, 0.5560591959446034, 0.5595704981621276, 0.5654494165866933, 0.5727778747233939], 'ours, mixed-logit': [0.021878887668569037, 0.041966592250986305, 0.07576388414869917, 0.1502942757403597, 0.2481902222937726, 0.5204665646654494, 0.5304813033976453, 0.5339105261133072, 0.5363754982643939, 0.5405956093808438], 'london, gaussian': [0.02142250509972268, 0.11683310336255012, 0.24720970630645753, 0.35098636059050864, 0.4231144736675506, 0.47439444278148896, 0.5046713200021298, 0.5250325491073283, 0.5399714303523936, 0.5507168138788101], 'london, mixed-logit': [0.021312499617008453, 0.054623325048608985, 0.11627728157855095, 0.20099180272284975, 0.2855892888535845, 0.36279718013519935, 0.4155505156009755, 0.45494402073799295, 0.4838527024045904, 0.5055363320289774], 'sakhi1, gaussian': [0.13859928785486425, 0.3059420185900749, 0.3463822172043171, 0.39708701965656684, 0.4502077628196554, 0.4928121879252982, 0.518886721590732, 0.5382738395447426, 0.5517038724777547, 0.5614986099080836], 'sakhi1, mixed-logit': [0.024251130755911482, 0.19140858112497533, 0.241775030988328, 0.27554483231077803, 0.32917025789301446, 0.3882931591602082, 0.43274993896484376, 0.46888084634821464, 0.4957585089257423, 0.5150394492453717], 'sakhi2, gaussian': [0.022573957671510412, 0.15966477262212875, 0.2906203171547423, 0.3819904073755792, 0.447867104347716, 0.49427708199683656, 0.5196405668461577, 0.5404147921217248, 0.5545369763069965, 0.5634863958967493], 'sakhi2, mixed-logit': [0.02147309272847277, 0.05796380212966432, 
0.13371665594425608, 0.22882444148368022, 0.3136669290826676, 0.38513455127147916, 0.43334786415100096, 0.46986117362976076, 0.49787267908136895, 0.5179282644961742]})\n",
      "    eta  logging_reward  ours, gaussian  ours, mixed-logit  london, gaussian  \\\n",
      "0  0.00        0.021277        0.025437           0.021879          0.021423   \n",
      "1  0.11        0.053996        0.081165           0.041967          0.116833   \n",
      "2  0.22        0.113838        0.104885           0.075764          0.247210   \n",
      "3  0.33        0.195737        0.520437           0.150294          0.350986   \n",
      "4  0.44        0.280206        0.535456           0.248190          0.423114   \n",
      "5  0.56        0.357800        0.552779           0.520467          0.474394   \n",
      "6  0.67        0.411774        0.556059           0.530481          0.504671   \n",
      "7  0.78        0.451697        0.559570           0.533911          0.525033   \n",
      "8  0.89        0.481040        0.565449           0.536375          0.539971   \n",
      "9  1.00        0.502776        0.572778           0.540596          0.550717   \n",
      "\n",
      "   london, mixed-logit  sakhi1, gaussian  sakhi1, mixed-logit  \\\n",
      "0             0.021312          0.138599             0.024251   \n",
      "1             0.054623          0.305942             0.191409   \n",
      "2             0.116277          0.346382             0.241775   \n",
      "3             0.200992          0.397087             0.275545   \n",
      "4             0.285589          0.450208             0.329170   \n",
      "5             0.362797          0.492812             0.388293   \n",
      "6             0.415551          0.518887             0.432750   \n",
      "7             0.454944          0.538274             0.468881   \n",
      "8             0.483853          0.551704             0.495759   \n",
      "9             0.505536          0.561499             0.515039   \n",
      "\n",
      "   sakhi2, gaussian  sakhi2, mixed-logit  \n",
      "0          0.022574             0.021473  \n",
      "1          0.159665             0.057964  \n",
      "2          0.290620             0.133717  \n",
      "3          0.381990             0.228824  \n",
      "4          0.447867             0.313667  \n",
      "5          0.494277             0.385135  \n",
      "6          0.519641             0.433348  \n",
      "7          0.540415             0.469861  \n",
      "8          0.554537             0.497873  \n",
      "9          0.563486             0.517928  \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "SEPARATOR = '################################################################################'\n",
    "\n",
    "# Create the output directory up front: DataFrame.to_csv does not create missing\n",
    "# directories, and failing only after the whole sweep would lose every result.\n",
    "results_path = 'results/bound_optimization/' + name +'.csv'\n",
    "os.makedirs(os.path.dirname(results_path), exist_ok=True)\n",
    "\n",
    "# Value passed to every method (as `beta` or `tau`); it does not depend on eta.\n",
    "bound_scale = 1 / (N ** (1 / 4))\n",
    "\n",
    "\n",
    "def train_and_evaluate(model, label, train_dataloader):\n",
    "    \"\"\"Fit `model` on `train_dataloader` and return its reward on the test set.\n",
    "\n",
    "    The reward is the negated value returned by `test_risk_exact_probit` on\n",
    "    (X_test, Y_test). `label` is only used in the printed progress message.\n",
    "    \"\"\"\n",
    "    trainer = Trainer(max_epochs=epochs, gpus=1, checkpoint_callback=False, weights_summary=None, logger=None)\n",
    "    trainer.fit(model, train_dataloader)\n",
    "\n",
    "    # Move the model to `device` and evaluate without tracking gradients.\n",
    "    model = model.to(device)\n",
    "    with torch.no_grad():\n",
    "        risk_after_train = test_risk_exact_probit(X_test, Y_test, model)\n",
    "\n",
    "    print('Reward of ' + label + ' policies after training  :', -risk_after_train)\n",
    "    print(SEPARATOR)\n",
    "    return -risk_after_train\n",
    "\n",
    "\n",
    "for eta in etas:\n",
    "    logging_policy = logging_policy.to(device)\n",
    "    print('eta parameter : ', eta)\n",
    "    logging_policy.alpha = eta\n",
    "    risk_logging = test_risk_exact_probit(X_test, Y_test, logging_policy)\n",
    "    print('The reward of the logging policy: ', -risk_logging)\n",
    "\n",
    "    dict_results['eta'].append(eta)\n",
    "    dict_results['logging_reward'].append(-risk_logging)\n",
    "\n",
    "    # Collect a bandit dataset with the current logging policy\n",
    "    f, a, p, c = build_bandit_dataset(logging_split_dataloader, logging_policy, replay_count = 1)\n",
    "\n",
    "    print('max', p.max(dim = 0)[0].mean().item())\n",
    "    print('min', p.min(dim = 0)[0].mean().item())\n",
    "\n",
    "    bandit_train_posterior = TensorDataset(f, a, p, c)\n",
    "    bandit_train_posterior_dataloader = DataLoader(bandit_train_posterior, batch_size=128, shuffle=True)\n",
    "\n",
    "    # Every learned policy receives the eta-scaled logging weights as `loc_weight`.\n",
    "    mu_0 = eta * logging_policy.linear.weight.data\n",
    "\n",
    "    common_kwargs = dict(n_actions=num_actions, context_dim=context_dim, N=N, loc_weight=mu_0, device=device)\n",
    "    bernstein_kwargs = dict(num_p=100, rc=1, xi=0)\n",
    "\n",
    "    # (result key, label used in the printed message, model factory).\n",
    "    # Factories are lambdas so that each model is only constructed right before\n",
    "    # it is trained, in the same order as the methods are listed here.\n",
    "    experiments = [\n",
    "        ('ours, gaussian', 'Ours with Gaussian', lambda: OurGaussian(beta=bound_scale, **common_kwargs)),\n",
    "        ('ours, mixed-logit', 'Ours with MixedLogit', lambda: OurMixedLogit(beta=bound_scale, **common_kwargs)),\n",
    "        ('london, gaussian', 'London with Gaussian', lambda: LondonGaussian(tau=bound_scale, **common_kwargs)),\n",
    "        ('london, mixed-logit', 'London with MixedLogit', lambda: LondonMixedLogit(tau=bound_scale, **common_kwargs)),\n",
    "        ('sakhi1, gaussian', 'Sakhi et al. 1 with Gaussian', lambda: CatoniGaussian(tau=bound_scale, **common_kwargs)),\n",
    "        ('sakhi1, mixed-logit', 'Sakhi et al. 1 with MixedLogit', lambda: CatoniMixedLogit(tau=bound_scale, **common_kwargs)),\n",
    "        ('sakhi2, gaussian', 'Sakhi et al. 2 with Gaussian', lambda: BernsteinGaussian(tau=bound_scale, **common_kwargs, **bernstein_kwargs)),\n",
    "        ('sakhi2, mixed-logit', 'Sakhi et al. 2 with MixedLogit', lambda: BernsteinMixedLogit(tau=bound_scale, **common_kwargs, **bernstein_kwargs)),\n",
    "    ]\n",
    "\n",
    "    for result_key, label, make_model in experiments:\n",
    "        reward = train_and_evaluate(make_model(), label, bandit_train_posterior_dataloader)\n",
    "        dict_results[result_key].append(reward)\n",
    "\n",
    "    print(dict_results)\n",
    "\n",
    "df = pd.DataFrame(dict_results)\n",
    "print(df)\n",
    "df.to_csv(results_path, index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python Kernel (MOAB #58857)",
   "language": "python",
   "name": "python-kernel-58857"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
