{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "e7cb530e-fb09-435a-97fe-2931f2743465",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_from_disk, load_dataset, DatasetDict, Dataset, concatenate_datasets\n",
    "import os\n",
    "os.chdir('/workspace/FutureGPT2/src/')\n",
    "from data.utils import get_tokenizer, get_token_dict\n",
    "import secrets\n",
    "import numpy as np\n",
    "from itertools import islice"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "3a80bd75-5914-448d-a9e5-2188499656d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_from_disk('/workspace/corpus/msmarco/msmarco_LLAMA2_64tokens_1m/val')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e36739b9-462b-45c6-a952-38947348c4fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subset names to use; entries that are commented out have fewer than 100k rows.\n",
    "SUBSETS = [\n",
    "    'ArXiv',\n",
    "    #'BookCorpus2',\n",
    "    #'Books3',\n",
    "    'DM Mathematics',\n",
    "    'Enron Emails',\n",
    "    #'EuroParl',\n",
    "    'FreeLaw',\n",
    "    'Github',\n",
    "    #'Gutenberg (PG-19)',\n",
    "    'HackerNews',\n",
    "    'NIH ExPorter',\n",
    "    #'OpenSubtitles',\n",
    "    'OpenWebText2',\n",
    "    #'PhilPapers',\n",
    "    'Pile-CC',\n",
    "    'PubMed Abstracts',\n",
    "    'PubMed Central',\n",
    "    #'StackExchange',\n",
    "    'UPSTO Backgrounds',   # NOTE(review): spelled 'UPSTO', not 'USPTO' - presumably mirrors the dataset's own config name; confirm before changing\n",
    "    #'Ubuntu IRC',\n",
    "    'Wikipedia (en)',\n",
    "    'YoutubeSubtitles',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "5f694957-e53b-463a-96e6-280890131f5e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'065747e4dd8c63bf1b878054fe0f00c4'"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "secrets.token_hex(16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0352cadf-f825-468c-99b7-68b92ae01e9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_NAME_SHORT = 'MISTRAL'\n",
    "MODEL_NAME = {\n",
    "    'GPT2': 'gpt2',\n",
    "    'LLAMA2': 'daryl149/llama-2-7b-hf',\n",
    "    'MISTRAL': 'mistralai/Mistral-7B-v0.1',\n",
    "}[MODEL_NAME_SHORT]\n",
    "SAVE_PATH = '/workspace/corpus/msmarco/'\n",
    "MAX_LENGTH = 64"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "efe0f97d-25c2-4be2-912d-28266b0caa28",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "279ff18e3d4a433584a74e26aff01cf4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/7.03k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "57ea3388589745228c1a6ad36a1f0aa3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/3408 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "05c2d8f127ee4548ab1e0074141e2c24",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/2112 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b7040bd69b7a4ee4a0351a924229e03b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/222 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fa2e516ca282413e88d3c9d423814cf4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ds = load_dataset('ArmelR/the-pile-splitted', 'ArXiv', split='train', streaming=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "a2307d4e-cc00-44f2-ac90-d3d014836af6",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = get_tokenizer(MODEL_NAME)\n",
    "\n",
    "def tokenize_chunks(examples, seq_length=MAX_LENGTH, max_chunks=5):\n",
    "    \"\"\"Tokenize a batch of texts and split each one into fixed-length chunks.\n",
    "\n",
    "    Every text is truncated to ``seq_length * max_chunks`` tokens and then cut\n",
    "    into consecutive windows of exactly ``seq_length`` tokens. A trailing\n",
    "    remainder shorter than ``seq_length`` is dropped, so a text shorter than\n",
    "    one window contributes no rows at all.\n",
    "\n",
    "    Args:\n",
    "        examples: batch mapping whose 'text' entry is passed to the tokenizer\n",
    "            (assumed to be a list of strings, as produced by a batched map).\n",
    "        seq_length: number of tokens per chunk; defaults to MAX_LENGTH.\n",
    "        max_chunks: upper bound on the number of chunks taken from one text.\n",
    "\n",
    "    Returns:\n",
    "        dict of equal-length lists with the keys 'input_ids',\n",
    "        'attention_mask', 'text' (the decoded chunk) and 'id' (a random\n",
    "        32-character hex string).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if seq_length or max_chunks is smaller than 1.\n",
    "    \"\"\"\n",
    "    # A zero window would never consume any tokens, so reject it up front.\n",
    "    if seq_length < 1 or max_chunks < 1:\n",
    "        raise ValueError('seq_length and max_chunks must both be >= 1')\n",
    "\n",
    "    tokens = tokenizer(\n",
    "        examples['text'],\n",
    "        truncation=True,\n",
    "        max_length=seq_length * max_chunks,\n",
    "        padding=False,\n",
    "        add_special_tokens=False,   # no BOS/EOS tokens are added (hopefully that is fine)\n",
    "    )\n",
    "    chunks = {\n",
    "        'input_ids': [],\n",
    "        'attention_mask': [],\n",
    "        'text': [],   # decoded text for convenience\n",
    "        'id': [],     # unique hex string for convenience\n",
    "    }\n",
    "    for input_ids, att_mask in zip(tokens['input_ids'], tokens['attention_mask']):\n",
    "        # Visit only the starts of complete windows; this drops the short tail\n",
    "        # without re-copying the remaining tokens on every step.\n",
    "        for start in range(0, len(input_ids) - seq_length + 1, seq_length):\n",
    "            stop = start + seq_length\n",
    "            cur_ids = input_ids[start:stop]\n",
    "            chunks['input_ids'].append(cur_ids)\n",
    "            chunks['attention_mask'].append(att_mask[start:stop])\n",
    "            chunks['text'].append(tokenizer.decode(cur_ids))\n",
    "            chunks['id'].append(secrets.token_hex(16))\n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "64e49a8d-03f9-4f75-9e3c-15331fce8e40",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_iter = iter(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "0a169730-916a-4aba-80b9-28e4ab739605",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef3e577cd21049e6b87845d2531df0d9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "494\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7c8525afcedd4d7cbc01c1b78f0f1430",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "990\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e8d5b6b874cb465fbc4b9809ece80d5e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1482\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0226958c8b6345029477758024a05312",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1975\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "89c455e1f00c403085ebcee312211017",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2467\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b3100762738c4335967f8b4da9ef8437",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2962\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f31bee7845864a059b6ddd7db9cc59d2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3458\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dd0ac2194021451f8b26c1f115c32f3c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3939\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7db9ead950f94d76aa46cd8f1ba064a3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4436\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1f3ff0a2df794f93b8ce4422305fe9a5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4923\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6bf0e7c3f19d4023923481de091b822f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5417\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e9a94254e85041aa8a19d15f0974d531",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5913\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b38b01c7f2364956b7dc360fe4202dd9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6410\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3d99453d6ade43cfae789a5125f63cd1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6904\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0fcc6e8c6ba74ef799d019e136f484ae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7398\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "476640dad39a412285ed7130503b0b48",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7885\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ccf760a9d75349069cdf0a3fb001f110",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8375\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "70b3736fecb94cfa9c87c418c68d52cb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8871\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "df19cb90311f4de0887fec5f6c730a69",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9370\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fbbf0fe0794845999a53820f81f8bf46",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9863\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "11b7f6fcec4140d986899cbebd8360a9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/100 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10347\n"
     ]
    }
   ],
   "source": [
    "TARGET_CHUNKS = 10000   # stop once at least this many chunks are collected\n",
    "DOCS_PER_BATCH = 100    # documents pulled from the stream per iteration\n",
    "\n",
    "# Collect the per-batch results and concatenate once at the end instead of\n",
    "# rebuilding the growing dataset on every iteration.\n",
    "tokenized_parts = []\n",
    "n_chunks = 0\n",
    "while n_chunks < TARGET_CHUNKS:\n",
    "    # islice simply yields fewer (or zero) documents when the stream runs dry,\n",
    "    # where a bare next() would abort the cell with StopIteration.\n",
    "    docs = list(islice(ds_iter, DOCS_PER_BATCH))\n",
    "    if not docs:\n",
    "        print(f'Stream exhausted after {n_chunks} chunks.')\n",
    "        break\n",
    "    batch = Dataset.from_list(docs)\n",
    "    part = batch.map(\n",
    "        tokenize_chunks,\n",
    "        batched=True,\n",
    "        batch_size=DOCS_PER_BATCH,\n",
    "        remove_columns=['text', 'meta', 'domain'],\n",
    "    )\n",
    "    tokenized_parts.append(part)\n",
    "    n_chunks += len(part)\n",
    "    print(n_chunks)\n",
    "\n",
    "# concatenate_datasets rejects an empty list, so fall back to an empty dataset.\n",
    "tokenized = concatenate_datasets(tokenized_parts) if tokenized_parts else Dataset.from_list([])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "b9fb89fe-d3e1-4b70-90aa-32666bb37bf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = iter(tokenized['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "id": "9932b320-7d33-4401-bd4a-fda19ac7bd13",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"---\\nabstract: 'Spectral triples (of compact type) are constructed on arbitrary separable quasidiagonal $C^*$-algebras. On the other hand an example of a spectral triple on a non-quasidiagonal algebra is presented.'\\naddress:\\n- 'Department of\""
      ]
     },
     "execution_count": 171,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "3e09dc9f-b87c-42cf-8fb0-de427accaf59",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.eos_token_id in a['input_ids'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "94417423-19d4-4043-ada7-9cb96502c7b8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11940"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a['input_ids'][0][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "abb4cf77-4ead-4ad5-b6b3-0b583302a6ef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'</s>'"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "f73edda7-42a3-4d60-9bf6-8478f00def3c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<s>'"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "80909028-0628-44eb-9b60-e6f2bdf78e9a",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'list' object has no attribute 'strip'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[63], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstrip\u001b[49m(\u001b[38;5;241m0\u001b[39m)\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'strip'"
     ]
    }
   ],
   "source": [
    "[0, 1, 2, 0].strip(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "1df9611a-2985-4efa-8bbd-ab82318df78b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(a['attention_mask'][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "657ad238-fd7b-493d-9e8d-6e303ad077b2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "48d24347-bb7a-4c09-8d54-51bf8a323151",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10661"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(a['input_ids'][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5499d69-e209-4c83-8599-75cd46b8186b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): unfinished draft - this cell does not parse as saved.\n",
    "#   * the closing parenthesis of len(...) sits after '>= 64', so the comparison\n",
    "#     is applied to the list itself rather than to its length;\n",
    "#   * the literal 64 is used where the seq_len parameter was presumably meant;\n",
    "#   * the loop body stops in the middle of a statement.\n",
    "# tokenize_chunks (defined in an earlier cell) looks like the finished version\n",
    "# of this idea - confirm, then delete this cell.\n",
    "def chunk(examples, seq_len=64):\n",
    "    chunks = []\n",
    "    while len(examples['input_ids'] >= 64):\n",
    "        cur_ids, remain_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "1940a5be-c424-42f4-aecf-22ca6a542a79",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['input_ids', 'attention_mask'])"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "9b3d837f-eac9-4b17-8014-4d19cf1c6e7c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TOKENIZE\n"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "'dict' object has no attribute 'map'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[37], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTOKENIZE\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m tokenized \u001b[38;5;241m=\u001b[39m \u001b[43mds\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m(tokenize_text, batched\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1024\u001b[39m)\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTokenized \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(tokenized)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m texts.\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'map'"
     ]
    }
   ],
   "source": [
    "# NOTE(review): this cell fails as saved. The stored traceback shows that\n",
    "# ds[:100] evaluated to a plain dict, which has no .map(); tokenize_text is also\n",
    "# not defined in any cell of this notebook (tokenize_chunks may be the intended\n",
    "# function - confirm).\n",
    "print('TOKENIZE')\n",
    "tokenized = ds[:100].map(tokenize_text, batched=True, batch_size=1024)\n",
    "print(f'Tokenized {len(tokenized)} texts.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9c11793e-f994-4967-878b-8617f6b95bcc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tokenized[0]['input_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1d5b6dd9-6a16-42a3-9bed-4b2f1f16ee85",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FILTER\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e18512f2bcb94e85b8f8f964cf47e183",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Truncated to 64 tokens and removed shorter texts.\n",
      "Filtered from 100000 to 100000 rows.\n"
     ]
    }
   ],
   "source": [
    "print('FILTER')\n",
    "# Keep only rows whose first and last token are not the pad token, i.e. rows\n",
    "# that fill the whole window without padding.\n",
    "# LLAMA2 tokenizer: shorter texts are padded at the *beginning* with tokenizer.pad_token\n",
    "filtered_msmarco = tokenized.filter(\n",
    "    lambda x: x['input_ids'][-1] != tokenizer.pad_token_id and x['input_ids'][0] != tokenizer.pad_token_id\n",
    ")\n",
    "print(f'Truncated to {MAX_LENGTH} tokens and removed shorter texts.')\n",
    "# Report the row count before and after filtering (the second value used to\n",
    "# repeat len(tokenized), so the message always claimed nothing was removed).\n",
    "print(f'Filtered from {len(tokenized)} to {len(filtered_msmarco)} rows.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c880b708-af09-4db6-b4da-2781ae47d03b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19ca1e117f54403ca2681d04fe8eba28",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fd763c7eeaf04267bdc6e2823f2e2f75",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "DatasetDict({'train': ds, 'test': ds}).save_to_disk('test_dataset')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "70706c28-d84d-426c-85c6-c40bca792552",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    }
   ],
   "source": [
    "!rm -r test_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9e3f81c9-fee4-467e-bde1-abb0af10d1da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': 'this guy  cracks me up \\n---------------------- Forwarded by Drew Fossum/ET&S/Enron on 05/19/2000 \\n09:09 AM ---------------------------\\n\\n05/19/2000 08:29 AM\\nLouis Soldano\\nLouis Soldano\\nLouis Soldano\\n05/19/2000 08:29 AM\\n05/19/2000 08:29 AM\\nTo: Drew Fossum/ET&S/Enron@ENRON\\ncc:  \\n\\nSubject: Re: Emma Caplan  \\n\\nWell, if she\\'s \"very bright, personable, and enthusiastic\" she certainly \\nwon\\'t fit in with us.   \\n\\nI\\'ll follow up with mike and shelly on the person we were thiniking of \\nsplitting for the summer - that may already be a done deal, if not it may \\nmake sense to do something more permanent but we will need to look at \\nbudget...\\n\\ni could probably use some part time help with Bret Reich exiled to Phase IV \\nbut i should have a licensed lawyer for that.\\n\\n\\n\\n\\n   \\n\\t\\n\\t\\n\\tFrom:  Drew Fossum                           05/18/2000 05:50 PM\\n\\t\\n\\nTo: Michael Moran/ET&S/Enron@ENRON, Dorothy McCoppin/FGT/Enron@ENRON, Louis \\nSoldano/ET&S/Enron@ENRON, Britt Davis/Corp/Enron@ENRON, Shelley \\nCorman/ET&S/Enron@ENRON\\ncc: Robert Jones/Corp/Enron@ENRON, Emily Sellers/ET&S/Enron@ENRON \\n\\nSubject: Emma Caplan\\n\\nAt the suggestion of Robert Jones (you remember him--one of the growing \\nlegion of ex-GPG HR representatives), I interviewed a lady named Emma Caplan \\nyesterday.  She is the wife of a guy that Enron Networks brought over to \\nHouston from London.  She is a \"Solicitor\" in the English legal system, and \\ngot her law degree at the University of Wales.  She has not taken an American \\nbar exam but may be able to take DC or New York.  For some reason related to \\npractice of foreign lawyers in TX, she can\\'t take the TX bar until she has \\ngotten a US law degree.  This person is very bright, personable, and \\nenthusiastic.  
She is extremely interested in working for Enron, either as a \\nlawyer once she gets her license squared away, or in a paralegal or analyst \\nposition until then. I left a copy of her resume with Emily, and I\\'d ask that \\nshe please forward it to each of you.  If any of you have need for such an \\nindividual or have ideas she should pursue within Enron (or outside Enron), \\nplease call Robert or myself.  Thanks.  DF',\n",
       " 'meta': {'pile_set_name': 'Enron Emails'},\n",
       " 'domain': 'Enron Emails'}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "load_from_disk('test_dataset')['train'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1f1e932d-e7e9-485f-8d1c-a92876cf3a6e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': 'this guy  cracks me up \\n---------------------- Forwarded by Drew Fossum/ET&S/Enron on 05/19/2000 \\n09:09 AM ---------------------------\\n\\n05/19/2000 08:29 AM\\nLouis Soldano\\nLouis Soldano\\nLouis Soldano\\n05/19/2000 08:29 AM\\n05/19/2000 08:29 AM\\nTo: Drew Fossum/ET&S/Enron@ENRON\\ncc:  \\n\\nSubject: Re: Emma Caplan  \\n\\nWell, if she\\'s \"very bright, personable, and enthusiastic\" she certainly \\nwon\\'t fit in with us.   \\n\\nI\\'ll follow up with mike and shelly on the person we were thiniking of \\nsplitting for the summer - that may already be a done deal, if not it may \\nmake sense to do something more permanent but we will need to look at \\nbudget...\\n\\ni could probably use some part time help with Bret Reich exiled to Phase IV \\nbut i should have a licensed lawyer for that.\\n\\n\\n\\n\\n   \\n\\t\\n\\t\\n\\tFrom:  Drew Fossum                           05/18/2000 05:50 PM\\n\\t\\n\\nTo: Michael Moran/ET&S/Enron@ENRON, Dorothy McCoppin/FGT/Enron@ENRON, Louis \\nSoldano/ET&S/Enron@ENRON, Britt Davis/Corp/Enron@ENRON, Shelley \\nCorman/ET&S/Enron@ENRON\\ncc: Robert Jones/Corp/Enron@ENRON, Emily Sellers/ET&S/Enron@ENRON \\n\\nSubject: Emma Caplan\\n\\nAt the suggestion of Robert Jones (you remember him--one of the growing \\nlegion of ex-GPG HR representatives), I interviewed a lady named Emma Caplan \\nyesterday.  She is the wife of a guy that Enron Networks brought over to \\nHouston from London.  She is a \"Solicitor\" in the English legal system, and \\ngot her law degree at the University of Wales.  She has not taken an American \\nbar exam but may be able to take DC or New York.  For some reason related to \\npractice of foreign lawyers in TX, she can\\'t take the TX bar until she has \\ngotten a US law degree.  This person is very bright, personable, and \\nenthusiastic.  
She is extremely interested in working for Enron, either as a \\nlawyer once she gets her license squared away, or in a paralegal or analyst \\nposition until then. I left a copy of her resume with Emily, and I\\'d ask that \\nshe please forward it to each of you.  If any of you have need for such an \\nindividual or have ideas she should pursue within Enron (or outside Enron), \\nplease call Robert or myself.  Thanks.  DF',\n",
       " 'meta': {'pile_set_name': 'Enron Emails'},\n",
       " 'domain': 'Enron Emails'}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DatasetDict({'train': ds, 'test': ds})['test'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "a34ad275-63ee-4e89-8ba8-57f4530f6ec0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'meta', 'domain'],\n",
       "    num_rows: 100000\n",
       "})"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "be77c6a0-40fa-49e0-a275-05e3680b1df6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tokenized[0]['input_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97ffc786-aeae-427b-be3a-bffb26775d06",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
