{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3d339bf1-4c2a-4a20-ae87-9485d8e3d1bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reload edited modules automatically before each cell execution\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e8297d0e-6a30-4651-a806-e7ed684da042",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# if a `notebooks` folder exists next to the working directory, move into\n",
    "# the sibling `badseeds` folder (presumably so that the local `preprocess`\n",
    "# module can be imported below -- confirm against the repository layout)\n",
    "notebooks_dir = \"../notebooks/\"\n",
    "package_dir = \"../badseeds/\"\n",
    "if os.path.isdir(notebooks_dir):\n",
    "    os.chdir(package_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3cf86f9d-063d-4cc8-bd25-19d9bcda2176",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import json\n",
    "\n",
    "import spacy\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "\n",
    "import preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a26fe4a9-2834-4488-b424-026f46753013",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the JSON config file that holds the dataset paths; adjust it if\n",
    "# your config file lives somewhere else.\n",
    "CONFIG_PATH = \"../config.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c7c1c5ec-fb18-4282-8797-b6c7d633015e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON is UTF-8 by specification: do not depend on the platform's default encoding\n",
    "with open(CONFIG_PATH, \"r\", encoding=\"utf-8\") as f:\n",
    "    config = json.load(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15509f51-cbe6-49a1-90bb-0159c54e54aa",
   "metadata": {},
   "source": [
    "## Preprocessing\n",
    "\n",
    "If you have already preprocessed and saved the datasets, preprocessing can be skipped and the preprocessed results are read from disk (default). Otherwise, set the `PREPROC_NOW` flag to `True` to preprocess the data now. Note that preprocessing will take a long time.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a676a87b-5284-4c14-b7d5-d6164e88d1e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "reading nyt\n",
      "reading wikitext\n",
      "reading goodreads romance\n",
      "Directory detected, reading and concatenating all containing files\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████| 8/8 [00:01<00:00,  5.26it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "reading goodreads history/biography\n",
      "Directory detected, reading and concatenating all containing files\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████| 5/5 [00:01<00:00,  4.82it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 4274.16 MiB, increment: 3862.75 MiB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# set to True to run the preprocessing now instead of reusing saved results\n",
    "PREPROC_NOW = False\n",
    "# locations of the preprocessed files; adjust them if your layout differs\n",
    "pproc_data_path = config[\"preprocessed\"][\"dir_path\"]\n",
    "NYT_PATH, WIKI_PATH, GRR_PATH, GRHB_PATH = (\n",
    "    os.path.join(pproc_data_path, file_name)\n",
    "    for file_name in (\n",
    "        \"nytimes_news_articles.bin\",\n",
    "        \"wiki.train.tokens.bin\",\n",
    "        \"romance\",\n",
    "        \"history_biography\",\n",
    "    )\n",
    ")\n",
    "# optional preprocessing run; this will save results to disk\n",
    "if PREPROC_NOW:\n",
    "    preprocess.preprocess_datasets()\n",
    "# load the preprocessed datasets from disk\n",
    "pproc_data = preprocess.read_pproc_datasets(NYT_PATH, WIKI_PATH, GRR_PATH, GRHB_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99592670-70e6-409f-84aa-02bb753e4447",
   "metadata": {},
   "source": [
    "## Table 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "59b40cc9-6127-44e7-904b-b98dca49e72e",
   "metadata": {},
   "source": [
    "### Prepare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6281930f-5ae8-4384-a5d4-595f699f566c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# statistics reported in the original paper, one value per dataset (row order\n",
    "# matches the index below)\n",
    "paper_stats = {\n",
    "    \"Total Documents\": [8888, 28472, 197000, 136000],\n",
    "    \"Total Words\": [7244457, 99197146, 24856924, 14324947],\n",
    "    \"Vocabulary Size\": [162998, 546828, 214572, 163171],\n",
    "    \"Mean Document Length\": [815, 3484, 126, 105],\n",
    "}\n",
    "dataset_names = pd.Series(\n",
    "    [\"NYT\", \"WikiText\", \"Goodreads (Romance)\", \"Goodreads (History/Biography)\"],\n",
    "    name=\"Dataset\",\n",
    ")\n",
    "# one \"theirs\" and one \"ours\" column per metric, still empty at this point\n",
    "table_2 = pd.DataFrame(\n",
    "    index=dataset_names,\n",
    "    columns=[\n",
    "        f\"{metric}_{source}\" for metric in paper_stats for source in (\"theirs\", \"ours\")\n",
    "    ],\n",
    ")\n",
    "# turn the flat \"<metric>_<source>\" names into a two-level column index\n",
    "table_2.columns = table_2.columns.str.split(\"_\", expand=True)\n",
    "for metric, values in paper_stats.items():\n",
    "    table_2[(metric, \"theirs\")] = values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "abd62205-09f2-4665-8495-c73b780345b5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr:last-of-type th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">Total Documents</th>\n",
       "      <th colspan=\"2\" halign=\"left\">Total Words</th>\n",
       "      <th colspan=\"2\" halign=\"left\">Vocabulary Size</th>\n",
       "      <th colspan=\"2\" halign=\"left\">Mean Document Length</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>theirs</th>\n",
       "      <th>ours</th>\n",
       "      <th>theirs</th>\n",
       "      <th>ours</th>\n",
       "      <th>theirs</th>\n",
       "      <th>ours</th>\n",
       "      <th>theirs</th>\n",
       "      <th>ours</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Dataset</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>NYT</th>\n",
       "      <td>8888</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7244457</td>\n",
       "      <td>NaN</td>\n",
       "      <td>162998</td>\n",
       "      <td>NaN</td>\n",
       "      <td>815</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>WikiText</th>\n",
       "      <td>28472</td>\n",
       "      <td>NaN</td>\n",
       "      <td>99197146</td>\n",
       "      <td>NaN</td>\n",
       "      <td>546828</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3484</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Goodreads (Romance)</th>\n",
       "      <td>197000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24856924</td>\n",
       "      <td>NaN</td>\n",
       "      <td>214572</td>\n",
       "      <td>NaN</td>\n",
       "      <td>126</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Goodreads (History/Biography)</th>\n",
       "      <td>136000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14324947</td>\n",
       "      <td>NaN</td>\n",
       "      <td>163171</td>\n",
       "      <td>NaN</td>\n",
       "      <td>105</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              Total Documents      Total Words       \\\n",
       "                                       theirs ours      theirs ours   \n",
       "Dataset                                                               \n",
       "NYT                                      8888  NaN     7244457  NaN   \n",
       "WikiText                                28472  NaN    99197146  NaN   \n",
       "Goodreads (Romance)                    197000  NaN    24856924  NaN   \n",
       "Goodreads (History/Biography)          136000  NaN    14324947  NaN   \n",
       "\n",
       "                              Vocabulary Size      Mean Document Length       \n",
       "                                       theirs ours               theirs ours  \n",
       "Dataset                                                                       \n",
       "NYT                                    162998  NaN                  815  NaN  \n",
       "WikiText                               546828  NaN                 3484  NaN  \n",
       "Goodreads (Romance)                    214572  NaN                  126  NaN  \n",
       "Goodreads (History/Biography)          163171  NaN                  105  NaN  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table_2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed8f6f7d-88af-4bd6-af9d-911d953a9da6",
   "metadata": {},
   "source": [
    "### Helper Class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "03ed9c1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "class StatGetter:\n",
    "    \"\"\"Corpus statistics (documents, words, vocabulary, mean length) of one dataset.\n",
    "\n",
    "    Documents are obtained by iterating `preprocess.docbin_to_docs(docbin)`; each\n",
    "    document is iterated token by token and only `token.text` is read\n",
    "    (presumably spaCy docs -- confirm against the `preprocess` module).\n",
    "\n",
    "    Token texts are normalized by deleting every character that is neither a\n",
    "    word character nor whitespace; tokens that become empty are ignored.\n",
    "\n",
    "    Args:\n",
    "        docbin: dataset handle passed on to `preprocess.docbin_to_docs`.\n",
    "        min_count: minimum number of occurrences a normalized token needs to\n",
    "            survive `filter_counter` (default: 10).\n",
    "    \"\"\"\n",
    "\n",
    "    # every character that is neither a word character nor whitespace\n",
    "    _NON_WORD = re.compile(r\"[^\\w\\s]\")\n",
    "\n",
    "    def __init__(self, docbin, min_count=10):\n",
    "        self.counter = Counter()\n",
    "        self.docbin = docbin\n",
    "        self.length = 0\n",
    "        self.min_count = min_count\n",
    "        # set by count_tokens(); an explicit flag avoids recounting the whole\n",
    "        # corpus when filtering legitimately leaves the counter empty\n",
    "        self._counted = False\n",
    "\n",
    "    def _iter_docs(self):\n",
    "        \"\"\"Return a progress-bar wrapped iterator over all documents.\"\"\"\n",
    "        # length is still 0 until get_total_documents() ran; pass None in that\n",
    "        # case so tqdm does not display a bogus total of 0\n",
    "        return tqdm(preprocess.docbin_to_docs(self.docbin), total=self.length or None)\n",
    "\n",
    "    def _normalize(self, text):\n",
    "        \"\"\"Delete all non-word, non-whitespace characters from `text`.\"\"\"\n",
    "        return self._NON_WORD.sub(\"\", text)\n",
    "\n",
    "    def _ensure_counted(self):\n",
    "        \"\"\"Count and filter the tokens once, on first use.\"\"\"\n",
    "        if not self._counted:\n",
    "            self.count_and_filter()\n",
    "\n",
    "    def get_total_documents(self):\n",
    "        \"\"\"Count the documents, store the result in `self.length` and return it.\"\"\"\n",
    "        print(\"Computing total docs\")\n",
    "        self.length = sum(1 for _ in preprocess.docbin_to_docs(self.docbin))\n",
    "        return self.length\n",
    "\n",
    "    def count_tokens(self):\n",
    "        \"\"\"Count the occurrences of every normalized token into `self.counter`.\n",
    "\n",
    "        The counter is rebuilt from scratch, so calling this method again does\n",
    "        not double the counts.\n",
    "        \"\"\"\n",
    "        print(\"Counting word occurrences\")\n",
    "        counts = Counter()\n",
    "        for doc in self._iter_docs():\n",
    "            for token in doc:\n",
    "                # strip punctuation and symbols; drop tokens that end up empty\n",
    "                word = self._normalize(token.text)\n",
    "                if word:\n",
    "                    counts[word] += 1\n",
    "        self.counter = counts\n",
    "        self._counted = True\n",
    "\n",
    "    def filter_counter(self):\n",
    "        \"\"\"Drop all words that occur fewer than `self.min_count` times.\"\"\"\n",
    "        print(f\"Only keeping words that appear at least {self.min_count} times\")\n",
    "        # stay a Counter (not a plain dict) so that later updates keep working\n",
    "        self.counter = Counter(\n",
    "            {word: count for word, count in self.counter.items() if count >= self.min_count}\n",
    "        )\n",
    "\n",
    "    def count_and_filter(self):\n",
    "        \"\"\"Run `count_tokens` followed by `filter_counter`.\"\"\"\n",
    "        self.count_tokens()\n",
    "        self.filter_counter()\n",
    "\n",
    "    def get_total_words(self):\n",
    "        \"\"\"Return the sum of all counts in `self.counter`.\n",
    "\n",
    "        Counting and filtering are triggered first if no counting happened yet.\n",
    "        \"\"\"\n",
    "        print(\"Computing total words\")\n",
    "        self._ensure_counted()\n",
    "        return sum(self.counter.values())\n",
    "\n",
    "    def get_vocab_size(self, filter_flag=False):\n",
    "        \"\"\"Return the number of distinct words.\n",
    "\n",
    "        Args:\n",
    "            filter_flag: if True, return the number of entries in the counted\n",
    "                (and filtered) vocabulary; otherwise count the distinct raw\n",
    "                token texts without any normalization or filtering.\n",
    "        \"\"\"\n",
    "        print(\"Computing vocab size\")\n",
    "        if filter_flag:\n",
    "            self._ensure_counted()\n",
    "            return len(self.counter)\n",
    "        vocab = set()\n",
    "        for doc in self._iter_docs():\n",
    "            vocab.update(token.text for token in doc)\n",
    "        return len(vocab)\n",
    "\n",
    "    def get_mean_document_length(self):\n",
    "        \"\"\"Return the mean number of tokens per document whose normalized text is in the counter.\n",
    "\n",
    "        Returns 0.0 when there are no documents.\n",
    "        \"\"\"\n",
    "        print(\"Computing mean document length\")\n",
    "        self._ensure_counted()\n",
    "        lengths = [\n",
    "            sum(1 for token in doc if self.counter.get(self._normalize(token.text), 0) > 0)\n",
    "            for doc in self._iter_docs()\n",
    "        ]\n",
    "        # np.mean([]) would emit a RuntimeWarning and return nan\n",
    "        return np.mean(lengths) if lengths else 0.0"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2df31d5-a8c9-4324-8d90-0ae0c631c97c",
   "metadata": {},
   "source": [
    "### Stats Computation\n",
    "\n",
    "This will take a while (~15 mins)! Run the cells below and go get a coffee or something."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7c6642ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# one result list per statistic; each receives one entry per dataset\n",
    "amount_documents, word_counts, vocab_counts, document_lengths = [], [], [], []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7d76a523",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Getting statistics for NYT\n",
      "Computing total docs\n",
      "Computing total words\n",
      "Counting word occurrences\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████| 8888/8888 [00:12<00:00, 727.73it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only keeping words that appear at least 10 times\n",
      "Computing vocab size\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████| 8888/8888 [00:04<00:00, 1784.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computing mean document length\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████| 8888/8888 [00:09<00:00, 927.17it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Getting statistics for WikiText\n",
      "Computing total docs\n",
      "Computing total words\n",
      "Counting word occurrences\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████| 28472/28472 [02:08<00:00, 222.30it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only keeping words that appear at least 10 times\n",
      "Computing vocab size\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████| 28472/28472 [00:53<00:00, 527.55it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computing mean document length\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████| 28472/28472 [01:57<00:00, 241.96it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Getting statistics for Goodreads (Romance)\n",
      "Computing total docs\n",
      "Computing total words\n",
      "Counting word occurrences\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 194500/194500 [01:00<00:00, 3195.69it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only keeping words that appear at least 10 times\n",
      "Computing vocab size\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 194500/194500 [00:39<00:00, 4934.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computing mean document length\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 194500/194500 [00:59<00:00, 3289.11it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Getting statistics for Goodreads (History/Biography)\n",
      "Computing total docs\n",
      "Computing total words\n",
      "Counting word occurrences\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 135000/135000 [00:38<00:00, 3492.78it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Only keeping words that appear at least 10 times\n",
      "Computing vocab size\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 135000/135000 [00:25<00:00, 5397.20it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computing mean document length\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 135000/135000 [00:37<00:00, 3608.11it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "peak memory: 3560.87 MiB, increment: 207.52 MiB\n"
     ]
    }
   ],
   "source": [
    "# start from empty result lists so that re-running this cell does not append a\n",
    "# second set of rows (the table assignment expects one entry per dataset)\n",
    "amount_documents, word_counts, vocab_counts, document_lengths = [], [], [], []\n",
    "# iterate dataset by dataset\n",
    "for key, docbin in pproc_data.items():\n",
    "    print(f\"Getting statistics for {key}\")\n",
    "    # one StatGetter object per dataset\n",
    "    stats = StatGetter(docbin)\n",
    "    # the document count comes first: it also sets the progress-bar total\n",
    "    amount_documents.append(stats.get_total_documents())\n",
    "    word_counts.append(stats.get_total_words())\n",
    "    vocab_counts.append(stats.get_vocab_size())\n",
    "    document_lengths.append(stats.get_mean_document_length())\n",
    "    print(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22ece920",
   "metadata": {},
   "source": [
    "### Insert data in table 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4031dd0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill the \"ours\" columns with the statistics computed above\n",
    "our_stats = {\n",
    "    \"Total Documents\": amount_documents,\n",
    "    \"Total Words\": word_counts,\n",
    "    \"Vocabulary Size\": vocab_counts,\n",
    "    \"Mean Document Length\": document_lengths,\n",
    "}\n",
    "for metric, values in our_stats.items():\n",
    "    table_2[(metric, \"ours\")] = values"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ceb2e422",
   "metadata": {},
   "source": [
    "### Format and show table 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f02db25a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style type=\"text/css\">\n",
       "</style>\n",
       "<table id=\"T_59061_\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th class=\"blank level0\" >&nbsp;</th>\n",
       "      <th class=\"col_heading level0 col0\" colspan=\"2\">Total Documents</th>\n",
       "      <th class=\"col_heading level0 col2\" colspan=\"2\">Total Words</th>\n",
       "      <th class=\"col_heading level0 col4\" colspan=\"2\">Vocabulary Size</th>\n",
       "      <th class=\"col_heading level0 col6\" colspan=\"2\">Mean Document Length</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"blank level1\" >&nbsp;</th>\n",
       "      <th class=\"col_heading level1 col0\" >theirs</th>\n",
       "      <th class=\"col_heading level1 col1\" >ours</th>\n",
       "      <th class=\"col_heading level1 col2\" >theirs</th>\n",
       "      <th class=\"col_heading level1 col3\" >ours</th>\n",
       "      <th class=\"col_heading level1 col4\" >theirs</th>\n",
       "      <th class=\"col_heading level1 col5\" >ours</th>\n",
       "      <th class=\"col_heading level1 col6\" >theirs</th>\n",
       "      <th class=\"col_heading level1 col7\" >ours</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th class=\"index_name level0\" >Dataset</th>\n",
       "      <th class=\"blank col0\" >&nbsp;</th>\n",
       "      <th class=\"blank col1\" >&nbsp;</th>\n",
       "      <th class=\"blank col2\" >&nbsp;</th>\n",
       "      <th class=\"blank col3\" >&nbsp;</th>\n",
       "      <th class=\"blank col4\" >&nbsp;</th>\n",
       "      <th class=\"blank col5\" >&nbsp;</th>\n",
       "      <th class=\"blank col6\" >&nbsp;</th>\n",
       "      <th class=\"blank col7\" >&nbsp;</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th id=\"T_59061_level0_row0\" class=\"row_heading level0 row0\" >NYT</th>\n",
       "      <td id=\"T_59061_row0_col0\" class=\"data row0 col0\" >8,888</td>\n",
       "      <td id=\"T_59061_row0_col1\" class=\"data row0 col1\" >8,888</td>\n",
       "      <td id=\"T_59061_row0_col2\" class=\"data row0 col2\" >7,244,457</td>\n",
       "      <td id=\"T_59061_row0_col3\" class=\"data row0 col3\" >7,217,851</td>\n",
       "      <td id=\"T_59061_row0_col4\" class=\"data row0 col4\" >162,998</td>\n",
       "      <td id=\"T_59061_row0_col5\" class=\"data row0 col5\" >109,713</td>\n",
       "      <td id=\"T_59061_row0_col6\" class=\"data row0 col6\" >815</td>\n",
       "      <td id=\"T_59061_row0_col7\" class=\"data row0 col7\" >812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_59061_level0_row1\" class=\"row_heading level0 row1\" >WikiText</th>\n",
       "      <td id=\"T_59061_row1_col0\" class=\"data row1 col0\" >28,472</td>\n",
       "      <td id=\"T_59061_row1_col1\" class=\"data row1 col1\" >28,472</td>\n",
       "      <td id=\"T_59061_row1_col2\" class=\"data row1 col2\" >99,197,146</td>\n",
       "      <td id=\"T_59061_row1_col3\" class=\"data row1 col3\" >87,073,910</td>\n",
       "      <td id=\"T_59061_row1_col4\" class=\"data row1 col4\" >546,828</td>\n",
       "      <td id=\"T_59061_row1_col5\" class=\"data row1 col5\" >228,318</td>\n",
       "      <td id=\"T_59061_row1_col6\" class=\"data row1 col6\" >3,484</td>\n",
       "      <td id=\"T_59061_row1_col7\" class=\"data row1 col7\" >3,058</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_59061_level0_row2\" class=\"row_heading level0 row2\" >Goodreads (Romance)</th>\n",
       "      <td id=\"T_59061_row2_col0\" class=\"data row2 col0\" >197,000</td>\n",
       "      <td id=\"T_59061_row2_col1\" class=\"data row2 col1\" >194,500</td>\n",
       "      <td id=\"T_59061_row2_col2\" class=\"data row2 col2\" >24,856,924</td>\n",
       "      <td id=\"T_59061_row2_col3\" class=\"data row2 col3\" >24,649,361</td>\n",
       "      <td id=\"T_59061_row2_col4\" class=\"data row2 col4\" >214,572</td>\n",
       "      <td id=\"T_59061_row2_col5\" class=\"data row2 col5\" >252,816</td>\n",
       "      <td id=\"T_59061_row2_col6\" class=\"data row2 col6\" >126</td>\n",
       "      <td id=\"T_59061_row2_col7\" class=\"data row2 col7\" >127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th id=\"T_59061_level0_row3\" class=\"row_heading level0 row3\" >Goodreads (History/Biography)</th>\n",
       "      <td id=\"T_59061_row3_col0\" class=\"data row3 col0\" >136,000</td>\n",
       "      <td id=\"T_59061_row3_col1\" class=\"data row3 col1\" >135,000</td>\n",
       "      <td id=\"T_59061_row3_col2\" class=\"data row3 col2\" >14,324,947</td>\n",
       "      <td id=\"T_59061_row3_col3\" class=\"data row3 col3\" >14,126,407</td>\n",
       "      <td id=\"T_59061_row3_col4\" class=\"data row3 col4\" >163,171</td>\n",
       "      <td id=\"T_59061_row3_col5\" class=\"data row3 col5\" >192,073</td>\n",
       "      <td id=\"T_59061_row3_col6\" class=\"data row3 col6\" >105</td>\n",
       "      <td id=\"T_59061_row3_col7\" class=\"data row3 col7\" >105</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n"
      ],
      "text/plain": [
       "<pandas.io.formats.style.Styler at 0x7fdf0488b760>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# for a LaTeX version, paste the table into https://www.tablesgenerator.com/latex_tables\n",
    "# round to whole numbers first, then render with thousands separators\n",
    "table_2_int = table_2.round().astype(int)\n",
    "table_2_int.style.format(\"{:,d}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
