{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark.sql.functions as F\n",
    "import pyspark.sql.types as T\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql import Window\n",
    "from pyspark.sql import Row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "APP_NAME = \"gender_read_dataset_spark\"\n",
    "\n",
    "config = {\n",
    "    'spark.driver.memory': '200G',\n",
    "    'spark.sql.shuffle.partitions': '1000',\n",
    "    'spark.ui.port': '4041',\n",
    "    'spark.local.dir': '/mnt/data/kireev/spark.local_dir/',\n",
    "}\n",
    "\n",
    "spark = SparkSession.builder\n",
    "for k, v in config.items():\n",
    "    spark = spark.config(k, v)\n",
    "spark = spark.appName(APP_NAME).master('local[16]').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "            <div>\n",
       "                <p><b>SparkSession - in-memory</b></p>\n",
       "                \n",
       "        <div>\n",
       "            <p><b>SparkContext</b></p>\n",
       "\n",
       "            <p><a href=\"http://tdl-gpu:4041\">Spark UI</a></p>\n",
       "\n",
       "            <dl>\n",
       "              <dt>Version</dt>\n",
       "                <dd><code>v2.4.5</code></dd>\n",
       "              <dt>Master</dt>\n",
       "                <dd><code>local[16]</code></dd>\n",
       "              <dt>AppName</dt>\n",
       "                <dd><code>gender_read_dataset_spark</code></dd>\n",
       "            </dl>\n",
       "        </div>\n",
       "        \n",
       "            </div>\n",
       "        "
      ],
      "text/plain": [
       "<pyspark.sql.session.SparkSession at 0x7f02b8ea4400>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "N_CLIENTS = 5e6\n",
    "N_TRX_PER_CLIENT = 250\n",
    "N_CAT_FIELDS = 6\n",
    "N_AMOUNTS = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.range(0, N_CLIENTS * N_TRX_PER_CLIENT).repartition(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.withColumn('client_id', (F.rand() * N_CLIENTS).cast('int'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.withColumn('event_time', (F.rand() * 365).cast('float'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(N_CAT_FIELDS):\n",
    "    df = df.withColumn(f'cat_{i}', F.abs((F.pow(F.randn(), 2) * 300)).cast('int') + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(N_AMOUNTS):\n",
    "    df = df.withColumn(f'amnt_{i}', (F.randn() * 1000).cast('int') * 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.drop('id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def collect_lists(df, col_id):\n",
    "    col_list = [col for col in df.columns if col != col_id]\n",
    "    df = df \\\n",
    "        .withColumn('trx_count', F.count(F.lit(1)).over(Window.partitionBy(col_id))) \\\n",
    "        .withColumn('_rn', F.row_number().over(Window.partitionBy(col_id).orderBy('event_time')))\n",
    "\n",
    "    w = Window.partitionBy(col_id).orderBy('_rn')\n",
    "    for col in col_list:\n",
    "        df = df.withColumn(col, F.collect_list(col).over(w)) \\\n",
    "\n",
    "    df = df.filter('_rn = trx_count').drop('_rn')\n",
    "    return df\n",
    "\n",
    "df = collect_lists(df, 'client_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 422 ms, sys: 233 ms, total: 656 ms\n",
      "Wall time: 42min 6s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df.repartition(50).write \\\n",
    "    .option('compression', 'uncompressed') \\\n",
    "    .mode('overwrite') \\\n",
    "    .parquet('/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.uncompressed.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.parquet('/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.uncompressed.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 15.4 ms, sys: 13.5 ms, total: 28.9 ms\n",
      "Wall time: 1min 57s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df.write \\\n",
    "    .option('compression', 'snappy') \\\n",
    "    .mode('overwrite') \\\n",
    "    .parquet('/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.snappy.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 29.9 ms, sys: 8.6 ms, total: 38.5 ms\n",
      "Wall time: 2min 52s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df.write \\\n",
    "    .option('compression', 'gzip') \\\n",
    "    .mode('overwrite') \\\n",
    "    .parquet('/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.gzip.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_array(x):\n",
    "    if type(x) is not list:\n",
    "        return x\n",
    "    \n",
    "    x = np.array(x)\n",
    "    if x.dtype.kind == 'i':\n",
    "        x = x.astype(np.int32)      \n",
    "    if x.dtype.kind == 'f':\n",
    "        x = x.astype(np.float32)      \n",
    "    \n",
    "    return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.rdd \\\n",
    "    .map(lambda x: {k: to_array(v) for k, v in x.asDict().items()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -r /mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.sparkpickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 33.9 ms, sys: 13.8 ms, total: 47.7 ms\n",
      "Wall time: 2min 47s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "df.saveAsPickleFile('/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.sparkpickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17G\t/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.gzip.parquet\r\n",
      "19G\t/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.snappy.parquet\r\n",
      "40G\t/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.sparkpickle\r\n",
      "19G\t/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx.uncompressed.parquet\r\n"
     ]
    }
   ],
   "source": [
    "!du -sh /mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/df_trx*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_json('/mnt/data/kireev/pycharm_1/dltranz/experiments/pyarrow_datasets/results.json', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data_path</th>\n",
       "      <th>n_process</th>\n",
       "      <th>cpu_count</th>\n",
       "      <th>return_data</th>\n",
       "      <th>engine</th>\n",
       "      <th>output_file</th>\n",
       "      <th>elapsed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>sparkpickle</td>\n",
       "      <td>results.json</td>\n",
       "      <td>171</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>sparkpickle</td>\n",
       "      <td>results.json</td>\n",
       "      <td>146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>sparkpickle</td>\n",
       "      <td>results.json</td>\n",
       "      <td>143</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>sparkpickle</td>\n",
       "      <td>results.json</td>\n",
       "      <td>73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "      <td>sparkpickle</td>\n",
       "      <td>results.json</td>\n",
       "      <td>36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>16</td>\n",
       "      <td>8</td>\n",
       "      <td>True</td>\n",
       "      <td>pyarrow</td>\n",
       "      <td>results.json</td>\n",
       "      <td>251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>18</td>\n",
       "      <td>8</td>\n",
       "      <td>True</td>\n",
       "      <td>pyarrow</td>\n",
       "      <td>results.json</td>\n",
       "      <td>242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>117</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>20</td>\n",
       "      <td>8</td>\n",
       "      <td>True</td>\n",
       "      <td>pyarrow</td>\n",
       "      <td>results.json</td>\n",
       "      <td>243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>118</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>22</td>\n",
       "      <td>8</td>\n",
       "      <td>True</td>\n",
       "      <td>pyarrow</td>\n",
       "      <td>results.json</td>\n",
       "      <td>233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>/mnt/data/kireev/pycharm_1/dltranz/experiments...</td>\n",
       "      <td>24</td>\n",
       "      <td>8</td>\n",
       "      <td>True</td>\n",
       "      <td>pyarrow</td>\n",
       "      <td>results.json</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>120 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             data_path  n_process  cpu_count  \\\n",
       "0    /mnt/data/kireev/pycharm_1/dltranz/experiments...         -1          0   \n",
       "1    /mnt/data/kireev/pycharm_1/dltranz/experiments...          0          0   \n",
       "2    /mnt/data/kireev/pycharm_1/dltranz/experiments...          1          0   \n",
       "3    /mnt/data/kireev/pycharm_1/dltranz/experiments...          2          0   \n",
       "4    /mnt/data/kireev/pycharm_1/dltranz/experiments...          4          0   \n",
       "..                                                 ...        ...        ...   \n",
       "115  /mnt/data/kireev/pycharm_1/dltranz/experiments...         16          8   \n",
       "116  /mnt/data/kireev/pycharm_1/dltranz/experiments...         18          8   \n",
       "117  /mnt/data/kireev/pycharm_1/dltranz/experiments...         20          8   \n",
       "118  /mnt/data/kireev/pycharm_1/dltranz/experiments...         22          8   \n",
       "119  /mnt/data/kireev/pycharm_1/dltranz/experiments...         24          8   \n",
       "\n",
       "     return_data       engine   output_file  elapsed  \n",
       "0          False  sparkpickle  results.json      171  \n",
       "1          False  sparkpickle  results.json      146  \n",
       "2          False  sparkpickle  results.json      143  \n",
       "3          False  sparkpickle  results.json       73  \n",
       "4          False  sparkpickle  results.json       36  \n",
       "..           ...          ...           ...      ...  \n",
       "115         True      pyarrow  results.json      251  \n",
       "116         True      pyarrow  results.json      242  \n",
       "117         True      pyarrow  results.json      243  \n",
       "118         True      pyarrow  results.json      233  \n",
       "119         True      pyarrow  results.json      260  \n",
       "\n",
       "[120 rows x 7 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_read = df[lambda x: ~x['return_data']].copy()\n",
    "df_load = df[lambda x: x['return_data']].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"6\" halign=\"left\">elapsed</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>engine</th>\n",
       "      <th>sparkpickle</th>\n",
       "      <th colspan=\"5\" halign=\"left\">pyarrow</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cpu_count</th>\n",
       "      <th>0</th>\n",
       "      <th>0</th>\n",
       "      <th>2</th>\n",
       "      <th>4</th>\n",
       "      <th>6</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>n_process</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>-1</th>\n",
       "      <td>171</td>\n",
       "      <td>135</td>\n",
       "      <td>79</td>\n",
       "      <td>50</td>\n",
       "      <td>50</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>146</td>\n",
       "      <td>123</td>\n",
       "      <td>73</td>\n",
       "      <td>49</td>\n",
       "      <td>47</td>\n",
       "      <td>37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>143</td>\n",
       "      <td>115</td>\n",
       "      <td>69</td>\n",
       "      <td>46</td>\n",
       "      <td>44</td>\n",
       "      <td>34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>73</td>\n",
       "      <td>58</td>\n",
       "      <td>35</td>\n",
       "      <td>23</td>\n",
       "      <td>23</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>36</td>\n",
       "      <td>30</td>\n",
       "      <td>18</td>\n",
       "      <td>12</td>\n",
       "      <td>12</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>24</td>\n",
       "      <td>20</td>\n",
       "      <td>12</td>\n",
       "      <td>9</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>18</td>\n",
       "      <td>16</td>\n",
       "      <td>10</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>15</td>\n",
       "      <td>12</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>12</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>11</td>\n",
       "      <td>9</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>10</td>\n",
       "      <td>9</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>9</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>8</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              elapsed                        \n",
       "engine    sparkpickle pyarrow                \n",
       "cpu_count           0       0   2   4   6   8\n",
       "n_process                                    \n",
       "-1                171     135  79  50  50  39\n",
       " 0                146     123  73  49  47  37\n",
       " 1                143     115  69  46  44  34\n",
       " 2                 73      58  35  23  23  18\n",
       " 4                 36      30  18  12  12   9\n",
       " 6                 24      20  12   9   8   9\n",
       " 8                 18      16  10   7   8   8\n",
       " 10                15      12   7   6   7   7\n",
       " 12                13      12   7   6   6   7\n",
       " 14                11       9   6   6   6   7\n",
       " 16                10       9   6   6   6   6\n",
       " 18                 9       7   6   5   6   6\n",
       " 20                 8       7   6   6   6   6\n",
       " 22                 7       7   6   6   6   6\n",
       " 24                 7       7   6   6   6   6"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_read \\\n",
    "    .drop(columns=['data_path', 'output_file', 'return_data']) \\\n",
    "    .set_index(['n_process', 'cpu_count', 'engine']).unstack(level=[2, 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">elapsed</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>engine</th>\n",
       "      <th>sparkpickle</th>\n",
       "      <th>pyarrow</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cpu_count</th>\n",
       "      <th>0</th>\n",
       "      <th>8</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>n_process</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>-1</th>\n",
       "      <td>176</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>475</td>\n",
       "      <td>419</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>248</td>\n",
       "      <td>274</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>227</td>\n",
       "      <td>232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>245</td>\n",
       "      <td>227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>246</td>\n",
       "      <td>261</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>255</td>\n",
       "      <td>237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>221</td>\n",
       "      <td>240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>226</td>\n",
       "      <td>247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>225</td>\n",
       "      <td>251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>233</td>\n",
       "      <td>242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>224</td>\n",
       "      <td>243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>215</td>\n",
       "      <td>233</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>236</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              elapsed        \n",
       "engine    sparkpickle pyarrow\n",
       "cpu_count           0       8\n",
       "n_process                    \n",
       "-1                176      38\n",
       " 0                199      39\n",
       " 1                475     419\n",
       " 2                248     274\n",
       " 4                227     232\n",
       " 6                245     227\n",
       " 8                246     261\n",
       " 10               255     237\n",
       " 12               221     240\n",
       " 14               226     247\n",
       " 16               225     251\n",
       " 18               233     242\n",
       " 20               224     243\n",
       " 22               215     233\n",
       " 24               236     260"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_load \\\n",
    "    .drop(columns=['data_path', 'output_file', 'return_data']) \\\n",
    "    .set_index(['n_process', 'cpu_count', 'engine']).unstack(level=[2, 1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (TDL)",
   "language": "python",
   "name": "tdl_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
