{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a538dfd8",
   "metadata": {},
   "source": [
    "# Memory Usage: RoBERTa Baseline Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42de525b",
   "metadata": {},
   "source": [
    "In this notebooks we builds benchmarking pipeline for RoBERTa model with\n",
    "HuggingFace amazing libraries."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "357d4dc5",
   "metadata": {},
   "source": [
    "## Prerequisites"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "511b40ae",
   "metadata": {},
   "source": [
    "First of all, some Python packages are required.\n",
    "Also, it is needed to set (at least RO) access token to HuggingFace Hub."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc0579a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ee79bcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login  # noqa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c709d61",
   "metadata": {},
   "outputs": [],
   "source": [
    "notebook_login()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "beebf430",
   "metadata": {},
   "source": [
    "## Benchmark"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa5634a1",
   "metadata": {},
   "source": [
    "In fact we needs only a few imports which are more technical (measure memory\n",
    "usage) then ml-ish."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4eb4ad5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch as T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0258bcf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from functools import partial\n",
    "from os import environ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad9c44b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Suppress HuggingFace tokenizer warnings.\n",
    "environ['TOKENIZERS_PARALLELISM'] = 'false'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b31476e",
   "metadata": {},
   "source": [
    "As we told above, all significant imports and ml-ish code are incapsulated\n",
    "inside object in order to prevent occasional resource acquisition.\n",
    "Lifecycle of a benchmark can be described as an ordered chain of method calls\n",
    "`setup()`, `run()`, and `teardown()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c7c009b",
   "metadata": {},
   "outputs": [],
   "source": [
    "class BenchRoBERTa:\n",
    "\n",
    "    TASK_TO_KEYS = {\n",
    "        'cola': ('sentence', None),\n",
    "        'mnli': ('premise', 'hypothesis'),\n",
    "        'mnli-mm': ('premise', 'hypothesis'),\n",
    "        'mrpc': ('sentence1', 'sentence2'),\n",
    "        'qnli': ('question', 'sentence'),\n",
    "        'qqp': ('question1', 'question2'),\n",
    "        'rte': ('sentence1', 'sentence2'),\n",
    "        'sst2': ('sentence', None),\n",
    "        'stsb': ('sentence1', 'sentence2'),\n",
    "        'wnli': ('sentence1', 'sentence2'),\n",
    "    }\n",
    "\n",
    "    @staticmethod\n",
    "    def compute_metric(task, metric, inputs):\n",
    "        predictions, references = inputs\n",
    "        if task != 'stsb':\n",
    "            predictions = predictions.argmax(axis=1)\n",
    "        else:\n",
    "            predictions = predictions[..., 0]\n",
    "        return metric.compute(predictions=predictions, references=references)\n",
    "\n",
    "    @staticmethod\n",
    "    def preprocess(tokenizer, lhs, rhs, sample):\n",
    "        if rhs is None:\n",
    "            return tokenizer(sample[lhs], truncation=True)\n",
    "        return tokenizer(sample[lhs], sample[rhs], truncation=True)\n",
    "\n",
    "    def setup(self):\n",
    "        from datasets import load_dataset, load_metric\n",
    "        from transformers import (RobertaTokenizerFast as Tokenizer,\n",
    "                                  RobertaForSequenceClassification as Model,\n",
    "                                  Trainer, TrainingArguments)\n",
    "\n",
    "        # Load and configure model output head.\n",
    "        if self.task in ('mnli', 'mnli-mm'):\n",
    "            num_labels = 3\n",
    "        elif self.task == 'stsb':\n",
    "            num_labels = 1\n",
    "        else:\n",
    "            num_labels = 2\n",
    "        model_path = 'roberta-base'\n",
    "        model = Model.from_pretrained(model_path, num_labels=num_labels)\n",
    "\n",
    "        # Load tokenizer from checkpoint.\n",
    "        tokenizer = Tokenizer.from_pretrained(model_path)\n",
    "\n",
    "        # Make dataset preprocessor.\n",
    "        keys = BenchRoBERTa.TASK_TO_KEYS[self.task]\n",
    "        func = partial(BenchRoBERTa.preprocess, tokenizer, *keys)\n",
    "\n",
    "        # Load and preprocess dataset.\n",
    "        dataset_path = 'glue'\n",
    "        dataset_name = 'mnli' if self.task == 'mnli-mm' else self.task\n",
    "        dataset = load_dataset(dataset_path, dataset_name)\n",
    "        dataset_encoded = dataset.map(func, batched=True)\n",
    "\n",
    "        # Load dataset metric.\n",
    "        metric = load_metric(dataset_path, dataset_name)\n",
    "        metric_compute = partial(BenchRoBERTa.compute_metric, self.task,\n",
    "                                 metric)\n",
    "\n",
    "        # Initialize training driver.\n",
    "        args = TrainingArguments(output_dir='roberta',\n",
    "                                 save_strategy='no',\n",
    "                                 learning_rate=2e-5,\n",
    "                                 per_device_train_batch_size=self.batch,\n",
    "                                 num_train_epochs=1,\n",
    "                                 weight_decay=0.01,\n",
    "                                 load_best_model_at_end=False,\n",
    "                                 push_to_hub=False)\n",
    "        self.trainer = Trainer(model=model,\n",
    "                               args=args,\n",
    "                               train_dataset=dataset_encoded['train'],\n",
    "                               tokenizer=tokenizer,\n",
    "                               compute_metrics=metric_compute)\n",
    "\n",
    "    def run(self):\n",
    "        self.trainer.train()\n",
    "\n",
    "    def teardown(self):\n",
    "        pass"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ff677e2",
   "metadata": {},
   "source": [
    "Actual model fitting happens to be here.\n",
    "In order to get metrics we need to gather statistics of interests before\n",
    "invocation of `.run()` and after it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2faffd1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "bench = BenchRoBERTa()\n",
    "bench.task = 'cola'\n",
    "bench.batch = 16\n",
    "bench.setup()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aed12242",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "memory = -T.cuda.memory_allocated()\n",
    "bench.run()\n",
    "memory += T.cuda.max_memory_allocated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc0302b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "bench.teardown()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c64ccf39",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Memory Usage: {memory / 1024 ** 2:.1f} Mb')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
