{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "import random\n",
    "from omegaconf import OmegaConf\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.utils.data import IterableDataset\n",
    "from torch.nn.utils.rnn import pad_sequence\n",
    "\n",
    "from tokenizers import ByteLevelBPETokenizer\n",
    "\n",
    "from torchfly.rl.env import Env\n",
    "from torchfly.flydata import FlyDataLoader\n",
    "from torchfly.flyconfig import GlobalFlyConfig\n",
    "from torchfly.rl.vector import AsyncVectorEnv\n",
    "from torchfly.common import set_random_seed, get_rank\n",
    "\n",
    "from typing import Iterator, Tuple, List\n",
    "\n",
    "from dataloaders.text_tasks import TextTask, TextContinuation, TextInfilling, TextRecall, MultiTextTask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_file = \"/home/username/cloud-data/tokenizers/MemformerABPE/vocab.json\"\n",
    "merges_file = \"/home/username/cloud-data/tokenizers/MemformerABPE/merges.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file, merges_file=merges_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = GlobalFlyConfig(config_path=\"config/base_time_1.yml\", \n",
    "                         disable_chdir=True, \n",
    "                         disable_logging=True)\n",
    "config = config.user_config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "task = MultiTextTask(config.flydata.training, tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "document = \"\"\"re-implement strong pre-training approaches recently\n",
    "proposed for discriminative and generation tasks. We\n",
    "aim, as much as possible, to control for differences unrelated to the pre-training objective. However, we do\n",
    "make minor changes to the learning rate and usage of\n",
    "layer normalisation in order to improve performance\n",
    "(tuning these separately for each objective). For reference, we compare our implementations with published\n",
    "numbers from BERT, which was also trained for 1M\n",
    "steps on a combination of books and Wikipedia data.\n",
    "We compare the following approaches:\n",
    "Language Model Similarly to GPT (Radford et al.,\n",
    "2018), we train a left-to-right Transformer language\n",
    "model. This model is equivalent to the BART decoder,\n",
    "without cross-attention.\n",
    "Permuted Language Model Based on XLNet (Yang\n",
    "et al., 2019), we sample 1/6 of the tokens, and generate them in a random order autoregressively. For consistency with other models, we do not implement the\n",
    "relative positional embeddings or attention across segments from XLNet.\n",
    "Masked Language Model Following BERT (Devlin\n",
    "et al., 2019), we replace 15% of tokens with [MASK]\n",
    "symbols, and train the model to independently predict\n",
    "the original tokens.\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ratio 0.26865671641791045\n",
      "ratio 0.26785714285714285\n",
      "ratio 0.2857142857142857\n",
      "ratio 0.2727272727272727\n",
      "ratio 0.17647058823529413\n",
      "source <TextInfill><|startoftext|>re-implement strong<mask> for discriminative and generation tasks. We\n",
      "aim, as much as possible, to control for differences<mask>-training objective. However, we do\n",
      "<mask> rate and usage of\n",
      "layer normalisation in order to improve performance</s>\n",
      "target <TextInfill> pre-training approaches recently\n",
      "proposed</s> unrelated to the pre</s>make minor changes to the learning</s>\n",
      "\n",
      "source <TextInfill>\n",
      "(tuning<mask> each objective).<mask> we compare our implementations with published\n",
      "numbers from B<mask> was also trained for 1M\n",
      "steps on<mask> data.\n",
      "We compare the following approaches:\n",
      "Language Model Similarly to</s>\n",
      "target <TextInfill> these separately for</s> For reference,</s>ERT, which</s> a combination of books and Wikipedia</s>\n",
      "\n",
      "source <TextInfill> GPT (Radford et al<mask> train a left-to<mask>\n",
      "model. This model is equivalent to the BART decoder,\n",
      "<mask>attention.\n",
      "Permuted Language Model Based on XLNet (Yang<mask>), we sample 1</s>\n",
      "target <TextInfill>.,\n",
      "2018), we</s>-right Transformer language</s>without cross-</s>\n",
      "et al., 2019</s>\n",
      "\n",
      "source <TextInfill>/6 of the tokens, and generate them in a random order<mask> consistency with other models, we do not implement the\n",
      "<mask>ings or attention across segments from XLNet.\n",
      "Masked Language<mask>Devlin\n",
      "</s>\n",
      "target <TextInfill> autoregressively. For</s>relative positional embedd</s> Model Following BERT (</s>\n",
      "\n",
      "source <TextInfill>et al., 2019), we replace 15% of tokens with [MASK<mask> and train the model to independently predict\n",
      "the original tokens.<|endoftext|></s>\n",
      "target <TextInfill>]\n",
      "symbols,</s>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for item in list(task.process_document(document)):\n",
    "    print(\"source\", tokenizer.decode(item[0]))\n",
    "    print(\"target\", tokenizer.decode(item[1]))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
