{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "KZIfDThxMJs9",
        "tIq9bhSYXEDe",
        "DDKETcIFkgx7"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# **Stochastic Bandits for Egalitarian Assignment - ClusterUsage Experiments**\n",
        "---"
      ],
      "metadata": {
        "id": "udvmtNUmA3E2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "rootdir = 'drive/MyDrive/egalucb/clusterusage' # @param {type: \"string\"}\n",
        "\n",
        "from google.colab import drive\n",
        "drive.mount('drive', force_remount=True)\n",
        "! mkdir -p {rootdir}"
      ],
      "metadata": {
        "cellView": "form",
        "id": "UBBOzLf7UpiH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KZIfDThxMJs9"
      },
      "source": [
        "## Setup"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "! sudo apt install cm-super dvipng texlive-latex-extra texlive-latex-recommended"
      ],
      "metadata": {
        "id": "7ZVkwpF93mRP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "M4BUF4p2styx"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import matplotlib\n",
        "import matplotlib.pyplot as plt\n",
        "import pandas as pd\n",
        "import os.path\n",
        "\n",
        "from collections import defaultdict\n",
        "from tqdm import tqdm"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "h6LFbGr5xZj2"
      },
      "outputs": [],
      "source": [
        "olderr = np.seterr(all='ignore')\n",
        "\n",
        "matplotlib.rcParams[\"text.usetex\"] = True\n",
        "matplotlib.rcParams[\"font.size\"] = \"7\"\n",
        "matplotlib.rcParams['mathtext.fontset'] = 'stix'\n",
        "matplotlib.rcParams['font.family'] = 'STIXGeneral'\n",
        "\n",
        "def pt2inches(width, height):\n",
        "    return (width / 72.27, height / 72.27)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "class MultiUserClusterDataBanditInstance:\n",
        "\n",
        "    def __init__(self, K, U, data, mus):\n",
        "        self.K = K\n",
        "        self.U = U\n",
        "        self.data = data[:K]\n",
        "        self.mus = mus[:K]\n",
        "        self.mustar = np.sum(np.sort(self.mus)[-U:])\n",
        "\n",
        "    def pull(self, arms):\n",
        "        rewards = np.zeros(self.U)\n",
        "        for u in range(self.U):\n",
        "            rewards[u] = self.sample_from(arms[u])\n",
        "        return rewards\n",
        "\n",
        "    def sample_from(self, arm):\n",
        "        return np.random.choice(self.data[arm])\n",
        "\n",
        "    def simulate_egalucb(self, T):\n",
        "        cumrewards = np.zeros(self.K)\n",
        "        numplays = np.zeros(self.K, dtype=int)\n",
        "        expregrets = np.zeros((self.U, T + 1))\n",
        "        B = int(T / self.U)\n",
        "        t = 0\n",
        "        for b in tqdm(range(B)):\n",
        "            muhats = cumrewards / numplays\n",
        "            ucbs = muhats + np.sqrt(6 * np.log(b) / numplays)\n",
        "            ucbs = np.nan_to_num(ucbs, nan=np.inf)\n",
        "            blockarms = ucbs.argsort()[-self.U:][::-1]\n",
        "            blockarms = np.concatenate([blockarms, blockarms])\n",
        "            for i in range(self.U):\n",
        "                t += 1\n",
        "                arms = blockarms[i:self.U+i]\n",
        "                rewards = self.pull(arms)\n",
        "                cumrewards[arms] += rewards\n",
        "                numplays[arms] += 1\n",
        "                expregrets[:,t] = expregrets[:,t-1] + (self.mustar / self.U - self.mus[arms])\n",
        "        expregrets = expregrets[:, 1:]\n",
        "        return cumrewards, numplays, expregrets"
      ],
      "metadata": {
        "id": "IcB_Hk280nAJ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Load Dataset"
      ],
      "metadata": {
        "id": "YiYKStpY4vTE"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Due to the size of the raw dataset, it exceeds the resource limit imposed by Google Colab. You may execute the following instructions outside of Google Colab to download and extract the relevant data for this experiment.\n",
        "\n",
        "1. Download and decompress the raw dataset.\n",
        "```\n",
        "wget https://storage.googleapis.com/clusterdata_2019_a/instance_usage-000000000000.json.gz\n",
        "gunzip -d instance_usage-000000000000.json.gz\n",
        "```\n",
        "\n",
        "2. Run the following code to extract only the `machine_id` and `cycles_per_instruction` fields from the first 4 million entries from the raw dataset.\n",
        "```python\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "with pd.read_json('instance_usage-000000000000.json', lines=True, chunksize=4000000) as reader:\n",
        "    data = next(reader)\n",
        "data = data[['machine_id', 'cycles_per_instruction']].to_numpy()\n",
        "machine_ids = np.unique(data[:, 0])\n",
        "machine_data = [data[data[:, 0] == machine_id, 1] for machine_id in machine_ids]\n",
        "np.save('machine_data.npy', np.array(machine_data, dtype=object))\n",
        "```\n",
        "\n",
        "3. Save the npz file as `{rootdir}/data.npz` and run the following instruction to load them."
      ],
      "metadata": {
        "id": "tIq9bhSYXEDe"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "machine_data = np.load(f'{rootdir}/machine_data.npy', allow_pickle=True)\n",
        "machine_lens = np.zeros(machine_data.shape[0])\n",
        "machine_mus = np.zeros(machine_data.shape[0])\n",
        "\n",
        "for i in range(machine_data.shape[0]):\n",
        "    machine_data[i] = -machine_data[i][~np.isnan(machine_data[i])] # negate and remove nan\n",
        "    machine_lens[i] = machine_data[i].shape[0]\n",
        "    machine_mus[i] = machine_data[i].mean() if machine_lens[i] > 0 else 0\n",
        "\n",
        "machine_data = machine_data[machine_lens.argsort()][::-1]\n",
        "machine_mus = machine_mus[machine_lens.argsort()][::-1]\n",
        "machine_lens = machine_lens[machine_lens.argsort()][::-1]"
      ],
      "metadata": {
        "id": "2LBK9e9h_7hM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Experiment 1: How regret evolves over time?"
      ],
      "metadata": {
        "id": "DDKETcIFkgx7"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "K = 100\n",
        "T = 150000\n",
        "Us = [5, 10, 15, 20, 25]"
      ],
      "metadata": {
        "id": "4X7pJxqAX7u4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "instances = []\n",
        "expregrets_list = []\n",
        "\n",
        "for i in range(len(Us)):\n",
        "    instances.append(MultiUserClusterDataBanditInstance(K, Us[i], machine_data, machine_mus))\n",
        "\n",
        "for i in range(len(Us)):\n",
        "    _, _, expregrets = instances[i].simulate_egalucb(T)\n",
        "    expregrets_list.append(expregrets)"
      ],
      "metadata": {
        "id": "umb56dJ91M7Y"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "np.save(f'{rootdir}/ex1-clusterusage.npy', np.array(expregrets_list, dtype=object))"
      ],
      "metadata": {
        "id": "iZlLAoqi1uDd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "expregrets_list = np.load(f'{rootdir}/ex1-clusterusage.npy', allow_pickle=True)"
      ],
      "metadata": {
        "id": "1ih75vqP1xEd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "fig, ax = plt.subplots(figsize=(pt2inches(200, 120)), dpi=120)\n",
        "linestyles = [(0, ()), (0, (1, 1)), (0, (5, 1)), (0, (5, 5)), (0, (3, 1, 1, 1))]\n",
        "colors = ['#EE6677', '#228833', '#4477AA', '#AA3377', '#CCBB44']\n",
        "\n",
        "res = 1000\n",
        "time = np.arange(1, T + 1)\n",
        "for i in range(len(Us)):\n",
        "    ax.plot(time[::res], expregrets_list[i][0,::res], color=colors[i], label=f'{Us[i]:d} users', linestyle=linestyles[i], linewidth=0.5)\n",
        "\n",
        "xticks = matplotlib.ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x / 1000))\n",
        "yticks = matplotlib.ticker.FuncFormatter(lambda y, pos: '{0:g}'.format(y / 1000))\n",
        "\n",
        "ax.xaxis.set_major_formatter(xticks)\n",
        "ax.yaxis.set_major_formatter(yticks)\n",
        "ax.set_xticks(np.arange(0, 150001, 30000))\n",
        "ax.set_yticks(np.arange(0, 2401, 600))\n",
        "ax.tick_params(axis='both', which='both', length=0)\n",
        "\n",
        "ax.spines['top'].set_visible(False)\n",
        "ax.spines['right'].set_visible(False)\n",
        "ax.spines['bottom'].set_visible(False)\n",
        "ax.spines['left'].set_visible(False)\n",
        "\n",
        "ax.set_xlabel('Timestep (in thousands)')\n",
        "ax.set_ylabel('Regret (in thousands)')\n",
        "ax.legend(fontsize='6')\n",
        "ax.grid(alpha=0.25, axis='y', color='#BBBBBB', linewidth=0.5)\n",
        "\n",
        "fig.show()\n",
        "fig.savefig(f'{rootdir}/ex1-clusterusage.pdf', bbox_inches='tight')"
      ],
      "metadata": {
        "id": "5CItfxCB62J0"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}