{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3ec7e241",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# If the parent of the current working directory contains a `notebooks`\n",
    "# folder, switch into the sibling `badseeds` folder.\n",
    "# NOTE(review): presumably this lets `import dataloader` resolve - confirm.\n",
    "has_sibling_notebooks_dir = os.path.isdir(\"../notebooks/\")\n",
    "if has_sibling_notebooks_dir:\n",
    "    os.chdir(\"../badseeds/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f2076ade",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "import dataloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7427cadd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the JSON config file that holds the dataset paths; edit this if\n",
    "# your config file lives somewhere else.\n",
    "CONFIG_PATH = \"../config.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c100bfd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read and parse the JSON config file at CONFIG_PATH.\n",
    "# The encoding is pinned to UTF-8 so the result does not depend on the\n",
    "# platform's default text encoding.\n",
    "with open(CONFIG_PATH, \"r\", encoding=\"utf-8\") as config_file:\n",
    "    config = json.load(config_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "42cdf2af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flags consulted by the download cell below to decide which\n",
    "# loadthedata.download_*() method to call.\n",
    "DOWNLOAD_RAW = False\n",
    "DOWNLOAD_PREPROCESSED = False\n",
    "DOWNLOAD_SEEDS = False\n",
    "DOWNLOAD_MODELS = True\n",
    "DOWNLOAD_ALL = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6d8a403e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the loader object from the parsed config; the download cell calls its\n",
    "# download_*() methods.\n",
    "loadthedata = dataloader.LoadTheData(config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c0335edd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=14eLHQ7oo1_V6DT8h_cd69-j4e-dF7Ymg\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/nytimes_news_articles_min0.zip\n",
      "100%|████████████████████████████████████████| 677M/677M [00:54<00:00, 12.4MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the  dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/nytimes_news_articles_min0.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1JXzX0Egg0Hw8YpoQexc1qJG6KTK929jE\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/nytimes_news_articles_min10.zip\n",
      "100%|████████████████████████████████████████| 213M/213M [00:17<00:00, 12.5MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the  dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/nytimes_news_articles_min10.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1LHdwfpvPKI02kYpzTqepMKkK3xOHEyXD\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/nytimes_news_articles_min100.zip\n",
      "100%|██████████████████████████████████████| 48.7M/48.7M [00:03<00:00, 12.7MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the  dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/nytimes_news_articles_min100.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1COecvAc3pjcIG7vpy6mGYTB28wc0gu4F\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/history_biography_min0.zip\n",
      "100%|██████████████████████████████████████| 1.13G/1.13G [01:31<00:00, 12.4MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the  dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/history_biography_min0.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1yD3Q4dfWRfQIa6VSMwqgmKD5i91KoFEL\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/history_biography_min10.zip\n",
      "100%|████████████████████████████████████████| 233M/233M [00:18<00:00, 12.4MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the  dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/history_biography_min10.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1l1W9VKjmJVUtzE6dZYfgh0RzVRszYPPU\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/romance_min0.zip\n",
      "100%|██████████████████████████████████████| 1.46G/1.46G [01:58<00:00, 12.4MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the  dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/romance_min0.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1LZOiQSWvl82qglCTZSp-nVurUmSO6qPj\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/romance_min10.zip\n",
      "100%|████████████████████████████████████████| 275M/275M [00:22<00:00, 12.4MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the  dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/romance_min10.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1Y4_dQE_tbXun2YSFHllePv4IrVPtOTJB\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/wiki.train.tokens_min0.zip\n",
      "100%|██████████████████████████████████████| 1.62G/1.62G [02:10<00:00, 12.4MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the train dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/wiki.train.tokens_min0.zip dataset\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1KNRsNnIdtic-kch8XzE3s0ukix6gt4Dp\n",
      "To: /home/jille/Desktop/mlrc-2021/data/models/wiki.train.tokens_min10.zip\n",
      "100%|████████████████████████████████████████| 882M/882M [01:11<00:00, 12.4MB/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unzipping the train dataset\n",
      "Extracting all the files now...\n",
      "Finished unzipping the ./data/models/wiki.train.tokens_min10.zip dataset\n",
      "peak memory: 416.06 MiB, increment: 1.35 MiB\n"
     ]
    }
   ],
   "source": [
    "# Run the downloads selected by the DOWNLOAD_* flags.\n",
    "# DOWNLOAD_ALL is checked first so it is never shadowed by another flag;\n",
    "# otherwise every enabled flag triggers its own download, so several groups\n",
    "# can be fetched in a single run.\n",
    "# NOTE(review): assumes download_all() covers raw, preprocessed, seeds and\n",
    "# models - confirm against the dataloader module.\n",
    "if DOWNLOAD_ALL:\n",
    "    loadthedata.download_all()\n",
    "else:\n",
    "    if DOWNLOAD_RAW:\n",
    "        loadthedata.download_raw()\n",
    "    if DOWNLOAD_PREPROCESSED:\n",
    "        loadthedata.download_preprocessed()\n",
    "    if DOWNLOAD_SEEDS:\n",
    "        loadthedata.download_seeds()\n",
    "    if DOWNLOAD_MODELS:\n",
    "        loadthedata.download_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfdaf748",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
