{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "adfa9e34",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0c44e809",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "import gensim_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3e80a47f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# path to config json file containing paths to datasets. change if necessary\n",
    "CONFIG_PATH = \"../config.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f1031912",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(CONFIG_PATH, \"r\") as f:\n",
    "    config = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f4ed17bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "arguments = {\n",
    "    \"models_dir\": \"/data/models/\",\n",
    "    \"number_samples\": 20,\n",
    "    \"vector_size\": 100,\n",
    "    \"window\": 5,\n",
    "    \"min_count\": 10,\n",
    "    \"negative\": 5,\n",
    "    \"workers\": 10,\n",
    "    \"seed\": 42,\n",
    "    \"epochs\": 5,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3bb7b945",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'spacy.tokens._serialize.DocBin'>\n",
      "Sample 1, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 971.56documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 87839/87839 [00:00<00:00, 831650.05vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "models/nytimes_news_articles_min10/vectors_sample1.kv already exists, skipping. Delete this file to save re-trained vectors.\n",
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 2, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 986.58documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89144/89144 [00:00<00:00, 826953.35vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 3, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████| 8888/8888 [00:08<00:00, 1019.94documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 88564/88564 [00:00<00:00, 836861.34vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 4, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 919.11documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89293/89293 [00:00<00:00, 843056.59vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 5, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 972.73documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 88215/88215 [00:00<00:00, 845752.22vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 6, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:08<00:00, 996.37documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89405/89405 [00:00<00:00, 808393.14vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 7, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:08<00:00, 991.53documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 87414/87414 [00:00<00:00, 831407.86vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 8, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 959.61documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89736/89736 [00:00<00:00, 842467.78vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 9, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:08<00:00, 993.50documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 88582/88582 [00:00<00:00, 830574.00vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 10, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 965.33documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89715/89715 [00:00<00:00, 783176.40vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 11, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:08<00:00, 998.88documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89590/89590 [00:00<00:00, 866744.85vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 12, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:08<00:00, 991.00documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89041/89041 [00:00<00:00, 855689.95vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 13, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:08<00:00, 992.45documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89245/89245 [00:00<00:00, 863431.02vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 14, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 958.68documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 88538/88538 [00:00<00:00, 848901.34vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 15, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 971.31documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 89006/89006 [00:00<00:00, 849837.28vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 16, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████| 8888/8888 [00:08<00:00, 1000.91documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 87700/87700 [00:00<00:00, 787515.17vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 17, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 951.50documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 90450/90450 [00:00<00:00, 875339.57vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 18, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████| 8888/8888 [00:08<00:00, 1031.13documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 86065/86065 [00:00<00:00, 834723.31vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 19, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 984.56documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 86719/86719 [00:00<00:00, 843545.49vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 20, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 8888/8888 [00:09<00:00, 936.52documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████| 88987/88987 [00:00<00:00, 855147.49vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/nytimes_news_articles_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'spacy.tokens._serialize.DocBin'>\n",
      "Sample 1, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 269.60documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211535/211535 [00:00<00:00, 553592.12vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 2, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:47<00:00, 263.86documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211421/211421 [00:00<00:00, 552020.08vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 3, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:44<00:00, 271.24documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211721/211721 [00:00<00:00, 547979.22vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 4, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:46<00:00, 267.55documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211906/211906 [00:00<00:00, 492279.96vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 5, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:46<00:00, 266.98documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211209/211209 [00:00<00:00, 552801.06vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 6, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 270.20documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211388/211388 [00:00<00:00, 550097.06vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 7, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 269.78documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211272/211272 [00:00<00:00, 543312.57vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 8, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:46<00:00, 268.43documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211270/211270 [00:00<00:00, 548544.35vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 9, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 269.45documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211263/211263 [00:00<00:00, 543493.03vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 10, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 269.15documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211677/211677 [00:00<00:00, 555381.04vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 11, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:46<00:00, 268.34documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211812/211812 [00:00<00:00, 544017.80vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 12, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 269.25documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211643/211643 [00:00<00:00, 557304.76vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 13, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:46<00:00, 267.59documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211612/211612 [00:00<00:00, 556035.19vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 14, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:44<00:00, 272.62documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211919/211919 [00:00<00:00, 539186.17vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 15, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:46<00:00, 267.96documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 212045/212045 [00:00<00:00, 548580.50vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 16, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 269.69documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211337/211337 [00:00<00:00, 553207.19vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 17, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 270.69documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211518/211518 [00:00<00:00, 558888.93vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 18, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:45<00:00, 268.88documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211076/211076 [00:00<00:00, 556463.26vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 19, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:44<00:00, 271.25documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211120/211120 [00:00<00:00, 508014.90vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Sample 20, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████| 28472/28472 [01:44<00:00, 271.43documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 211447/211447 [00:00<00:00, 549953.06vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/wiki.train.tokens_min10.\n",
      "Directory detected, reading and concatenating all containing files\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  5.68it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'list'>\n",
      "Sample 1, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 7943.04documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 149652/149652 [00:00<00:00, 1147980.67vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 2, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:17<00:00, 7865.68documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 148119/148119 [00:00<00:00, 1159124.79vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 3, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:17<00:00, 7870.39documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 149181/149181 [00:00<00:00, 1184268.19vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 4, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8063.25documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 146968/146968 [00:00<00:00, 1159048.72vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 5, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8046.42documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 147346/147346 [00:00<00:00, 1160580.80vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 6, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:17<00:00, 7734.19documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 151016/151016 [00:00<00:00, 1179370.76vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 7, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8043.48documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 149415/149415 [00:00<00:00, 1157278.80vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 8, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:17<00:00, 7732.92documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 149588/149588 [00:00<00:00, 1177616.43vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 9, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8111.61documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 147934/147934 [00:00<00:00, 1199874.24vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 10, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8042.19documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 147373/147373 [00:00<00:00, 1174088.68vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 11, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:17<00:00, 7920.38documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 149439/149439 [00:00<00:00, 1201866.47vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 12, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 7986.25documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 149151/149151 [00:00<00:00, 1194103.09vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 13, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8067.31documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 148744/148744 [00:00<00:00, 1167031.54vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 14, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 7986.78documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 147622/147622 [00:00<00:00, 1043776.83vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 15, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:17<00:00, 7605.13documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 147795/147795 [00:00<00:00, 1133789.53vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 16, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8015.45documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 148210/148210 [00:00<00:00, 1170536.14vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 17, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8080.86documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 150191/150191 [00:00<00:00, 1182654.27vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 18, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8006.71documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 148933/148933 [00:00<00:00, 1164224.13vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 19, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:16<00:00, 8019.73documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 148543/148543 [00:00<00:00, 1105659.13vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 20, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 135000/135000 [00:17<00:00, 7809.97documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 149672/149672 [00:00<00:00, 1187424.33vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/history_biography_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Directory detected, reading and concatenating all containing files\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████| 8/8 [00:01<00:00,  5.78it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'list'>\n",
      "Sample 1, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6674.51documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 193945/193945 [00:00<00:00, 1214470.21vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 2, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6705.07documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 193707/193707 [00:00<00:00, 1196199.75vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 3, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6675.98documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 192948/192948 [00:00<00:00, 1168618.84vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n",
      "Sample 4, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6618.65documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 193099/193099 [00:00<00:00, 1185482.78vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n",
      "Sample 5, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6669.71documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 196760/196760 [00:00<00:00, 1196293.22vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n",
      "Sample 6, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6554.66documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 193761/193761 [00:00<00:00, 1202802.18vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 7, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:30<00:00, 6471.38documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 194058/194058 [00:00<00:00, 1209123.25vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 8, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6661.42documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 194897/194897 [00:00<00:00, 1195072.78vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n",
      "Sample 9, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:28<00:00, 6750.57documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 194159/194159 [00:00<00:00, 1180576.79vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n",
      "Sample 10, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6594.76documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 195213/195213 [00:00<00:00, 1221401.94vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 11, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:28<00:00, 6769.15documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 194010/194010 [00:00<00:00, 1217494.68vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 12, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6567.84documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 194006/194006 [00:00<00:00, 1209275.30vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 13, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6629.96documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 194074/194074 [00:00<00:00, 1203872.12vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 14, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6615.36documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 192930/192930 [00:00<00:00, 1214141.46vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 15, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6600.29documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 192114/192114 [00:00<00:00, 1193507.30vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 16, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6687.41documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 193661/193661 [00:00<00:00, 1204688.26vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 17, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6615.50documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 193870/193870 [00:00<00:00, 1198127.13vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample 18, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:28<00:00, 6711.02documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 193397/193397 [00:00<00:00, 1061010.42vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n",
      "Sample 19, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6645.56documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 195639/195639 [00:00<00:00, 1186740.64vectors/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n",
      "Sample 20, building dataset:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████| 194500/194500 [00:29<00:00, 6498.89documents/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Training word2vec...\n",
      "\n",
      "Attaching pos tags to word vectors:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████| 195728/195728 [00:00<00:00, 1216128.58vectors/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Saved in models/romance_min10.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "models_dir = arguments.pop(\"models_dir\")\n",
    "n = arguments.pop(\"number_samples\")\n",
    "seed = arguments.pop(\"seed\")\n",
    "\n",
    "# train word2vec models\n",
    "# if paths don't work, change the paths accordingly\n",
    "gensim_train.bootstrap_train(\n",
    "    \"data/preprocessed_data/nytimes_news_articles.bin\", models_dir, arguments, seed, n\n",
    ")\n",
    "gensim_train.bootstrap_train(\n",
    "    \"data/preprocessed_data/wiki.train.tokens.bin\", models_dir, arguments, seed, n\n",
    ")\n",
    "gensim_train.bootstrap_train(\n",
    "    \"data/preprocessed_data/history_biography\", models_dir, arguments, seed, n\n",
    ")\n",
    "gensim_train.bootstrap_train(\n",
    "    \"data/preprocessed_data/romance\", models_dir, arguments, seed, n\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e2acee6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
