{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7ef35ed7-4434-44bf-ab43-af74e55ba492",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('..')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "51c43ed9-b0c3-4be6-8cbb-2a346d40f4a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/llm/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import time \n",
    "import torch\n",
    "import re\n",
    "import argparse\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# from torch import Dataset, DataLoader\n",
    "from tqdm import tqdm\n",
    "from datasets import load_dataset\n",
    "from transformers import T5Tokenizer, T5ForConditionalGeneration, get_cosine_schedule_with_warmup\n",
    "\n",
    "from src.utils import tprint, kl_divergence\n",
    "from src.data_utils import GSM8KCodexAugmentedDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ec2057a3-061b-4abf-aba1-5d89f06fa7c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12/13/2022, 23:53:16 Loading dataset...\n",
      "12/13/2022, 23:55:51 .. finished\n"
     ]
    }
   ],
   "source": [
    "dataset = GSM8KCodexAugmentedDataset(base_path='../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "267af756-b23b-487c-b96b-d17a1ec15f7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = T5Tokenizer.from_pretrained(\"google/flan-t5-xxl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "82aa7ee4-4a3f-4640-99d9-98e798c0d8a3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32100"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tokenizer.get_vocab())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e6003131-6945-477b-9a33-7e712619b4b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = tokenizer.get_vocab()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1b05c71d-5cd7-4bca-a0b9-de557bc113ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['<extra_id_-4>']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.convert_ids_to_tokens([32103])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
