{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a749cada",
   "metadata": {},
   "source": [
    "# Few-Bit: Postprocessing of GLUE fine-tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23be7160",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import tensorboard as tb\n",
    "import tensorboard.data_compat\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e283818c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "from typing import List, Optional"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2916d89",
   "metadata": {},
   "source": [
    "First of all, define some globals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d574cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "METRICS = {'eval/accuracy', 'eval/matthews_correlation', 'eval/pearson'}\n",
    "MDASH = '—'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f94a3fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert(srcs: List[Path], dst: Optional[Path] = None) -> pd.DataFrame:\n",
    "    \"\"\"Function convert reads TensorBoard (written by HuggingFace transformers\n",
    "    library), filters it contents, and write it to CSV file.\n",
    "    \"\"\"\n",
    "    records = []\n",
    "    for i, el in enumerate(tf.data.TFRecordDataset(srcs)):\n",
    "        event = tb.compat.proto.event_pb2.Event.FromString(el.numpy())\n",
    "        event = tb.data_compat.migrate_event(event)\n",
    "        if event.summary is None:\n",
    "            continue\n",
    "        for value in event.summary.value:\n",
    "            prefix, _ = value.tag.split('/', 1)\n",
    "            if prefix not in ('eval', 'train'):\n",
    "                continue\n",
    "            records.append((event.step, value.tag, value.tensor.float_val[0]))\n",
    "\n",
    "    df = pd.DataFrame(data=records, columns=('step', 'tag', 'value'))\n",
    "    df = df.set_index(['tag', 'step'])\n",
    "    df = df.sort_index()\n",
    "    if dst:\n",
    "        df.to_csv(dst)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c777024",
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_index(x: str) -> str:\n",
    "    try:\n",
    "        return f'{int(x)}%'\n",
    "    except Exception:\n",
    "        return x.capitalize()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a600214d",
   "metadata": {},
   "source": [
    "Find all summary files in specific directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69b8fd57",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirname = Path('../log')\n",
    "filenames = !find \"$dirname\" -maxdepth 3 -type f -printf \"%P\\n\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f97ff3b",
   "metadata": {},
   "source": [
    "Read all TensorBoard logs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d7b651f",
   "metadata": {},
   "outputs": [],
   "source": [
    "frames = []\n",
    "for filename in filenames:\n",
    "    param, task, _ = filename.split('/', 2)\n",
    "    path = dirname / filename\n",
    "    frame = convert(path)\n",
    "    frame = frame.reset_index()\n",
    "    frame['task'] = task.upper()\n",
    "    frame['param'] = param\n",
    "    frames.append(frame)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd6f3dcc",
   "metadata": {},
   "source": [
    "Select neccesary columns and filter rows by metric name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c8165f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat(frames) \\\n",
    "    .set_index(['task', 'param', 'tag', 'step']) \\\n",
    "    .sort_index() \\\n",
    "    .reset_index()\n",
    "df = df[df.tag.isin(METRICS)]\n",
    "df.to_csv('summary.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "01798ea7",
   "metadata": {},
   "source": [
    "Make summary table and export it to LaTeX."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cec5e80d",
   "metadata": {},
   "outputs": [],
   "source": [
    "summary = df \\\n",
    "    .groupby(['task', 'param'])[['value']] \\\n",
    "    .max()\n",
    "summary['value'] = summary.value * 100\n",
    "summary = summary\\\n",
    "        .pivot_table(['value'], ['param'], ['task']) \\\n",
    "        .sort_index(ascending=False)\n",
    "summary.columns = summary.columns.levels[1]\n",
    "summary.columns.rename(None, inplace=True)\n",
    "summary.index = summary.index.map(format_index)\n",
    "summary.index.rename(None, inplace=True)\n",
    "summary.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2cc9972",
   "metadata": {},
   "outputs": [],
   "source": [
    "summary.to_latex(buf='table.tex',\n",
    "                 na_rep=f'{MDASH:^5s}',\n",
    "                 float_format=lambda x: f'{x:5.2f}',\n",
    "                 caption='Fine-tuning on GLUE tasks.',\n",
    "                 label='tab:glue-fine-tuning')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69f2ea17",
   "metadata": {},
   "outputs": [],
   "source": [
    "!cat 'table.tex'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "edc542db",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat(frames)\n",
    "df = df[df.tag == 'train/train_samples_per_second'] \\\n",
    "    .set_index(['task', 'param']) \\\n",
    "    .sort_index()\n",
    "df = df[['value']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c963f05",
   "metadata": {},
   "outputs": [],
   "source": [
    "colors = plt.rcParams['axes.prop_cycle'].by_key()['color']\n",
    "ratios = [1, 5, 10, 20, 50, 90, 100]\n",
    "index = ['01', '05', '10', '20', '50', '90', '100', 'baseline']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc30c75d",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(6.7, 3), dpi=150)\n",
    "fig.suptitle('Throughput of model with Gaussian randomized matmul')\n",
    "\n",
    "for color, (key, group) in zip(colors, df.groupby(level=[0])):\n",
    "    values = group \\\n",
    "        .reset_index(level=[0], drop=True) \\\n",
    "        .reindex(index=index) \\\n",
    "        .value\n",
    "    throughput, baseline = values[:-1].values, values[-1]\n",
    "    ax.plot(ratios, throughput / baseline, '-', color=color, label=key)\n",
    "\n",
    "ax.grid()\n",
    "ax.legend()\n",
    "ax.set_xlim(0, 100)\n",
    "ax.set_xticks([0, 10, 20, 30, 40, 50, 60, 70])\n",
    "ax.set_xlabel('Compression ratio, %')\n",
    "ax.set_ylabel('Speed up in throughput')"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_metadata_filter": "-all",
   "encoding": "# -*- coding: utf-8 -*-",
   "main_language": "python",
   "notebook_metadata_filter": "-all"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
