{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c078adf3-b73f-4d4a-a90a-c39743f7c93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "from datasets import load_dataset\n",
    "import unicodedata\n",
    "import tiktoken\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from datasets import get_dataset_config_names, load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8355cc14-553e-454e-bf34-377536fa2e24",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dc11ddc0-c349-4ed9-9c17-c698cb859407",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_list_to_file(data_list, filename):\n",
    "    \"\"\"Write every item of ``data_list`` to ``filename`` as UTF-8 text, one item per line.\n",
    "\n",
    "    Items are interpolated with an f-string, so non-string items are accepted.\n",
    "    An existing file is overwritten.\n",
    "    \"\"\"\n",
    "    with open(filename, 'w', encoding='utf-8') as out:\n",
    "        for entry in data_list:\n",
    "            out.write(f\"{entry}\\n\")\n",
    "\n",
    "def load_list_from_file(filename):\n",
    "    \"\"\"Read ``filename`` (UTF-8) and return its lines as a list of strings.\n",
    "\n",
    "    Each line is passed through ``str.strip``, so the newline and any other leading or\n",
    "    trailing whitespace are removed.\n",
    "    \"\"\"\n",
    "    with open(filename, 'r', encoding='utf-8') as src:\n",
    "        return list(map(str.strip, src))\n",
    "\n",
    "def save_results_to_file(data_list, filename):\n",
    "    \"\"\"Overwrite ``filename`` (UTF-8) with the ``str()`` form of ``data_list``.\n",
    "\n",
    "    Nothing is appended after the text, so the file has no trailing newline.\n",
    "    \"\"\"\n",
    "    with open(filename, mode='w', encoding='utf-8') as out:\n",
    "        serialized = str(data_list)\n",
    "        out.write(serialized)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f2eea52-1de8-4e7f-9ddb-723eea61961f",
   "metadata": {},
   "outputs": [],
   "source": [
    "######### TOKENIZER #########"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1ecb3167-54e8-4cc9-92cf-a74a0f49d370",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import ast\n",
    "\n",
    "from tokenizers import Tokenizer\n",
    "from tokenizers.models import BPE\n",
    "from tokenizers.pre_tokenizers import Whitespace\n",
    "from collections import defaultdict\n",
    "from tokenizers.pre_tokenizers import PreTokenizer\n",
    "from transformers import PreTrainedTokenizerFast\n",
    "\n",
    "from tokenizers import (\n",
    "    decoders,\n",
    "    models,\n",
    "    normalizers,\n",
    "    pre_tokenizers,\n",
    "    processors,\n",
    "    trainers,\n",
    "    Tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "92e896bd-5c5b-4151-a895-cf9face5f558",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that receives the exported tokenizer files, vocabulary dumps and embeddings.\n",
    "output_dir = \"_bvv241-nemo\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# Presumably ISO 639-1 language codes; not referenced by the other cells of this notebook.\n",
    "languages = [\n",
    "    \"en\", \"ru\", \"fr\", \"de\", \"zh\", \"he\", \"it\", \"es\", \"ar\", \"pt\", \"ko\", \"hu\", \"sa\", \"la\", \"ja\", \"el\",\n",
    "    \"sv\", \"nl\", \"pl\", \"vi\", \"fa\", \"no\", \"tr\", \"fi\", \"cs\", \"hy\", \"da\", \"bn\", \"az\", \"ka\", \"hi\", \"id\",\n",
    "]\n",
    "\n",
    "# 906 is optimal for 2-3-3-4-5-grams fitting in unused Unicode ranges (and > 65535);\n",
    "# 1000 for 2-3-4-5, 171 for 2-3-4, 800 for 2 (bigrams).\n",
    "top_cnt = 906\n",
    "\n",
    "UNICODE_LIMIT = 0xD800    # monograms occupy code points 0 through 0xD7FF\n",
    "SURROGATE_START = 0xD800\n",
    "SURROGATE_END = 0xE000\n",
    "\n",
    "SPECIAL_TOKENS_START = 0xE000\n",
    "SPECIAL_TOKENS_LIMIT = SPECIAL_TOKENS_START + 256  # 0xE100 = 57600\n",
    "NGRAM_START = SPECIAL_TOKENS_LIMIT  # 0xE100 = 57600\n",
    "NGRAM_LIMIT = 0xF900\n",
    "\n",
    "NGRAM_START_EXT = 0x10000\n",
    "NGRAM_LIMIT_EXT = 0x1FFFF\n",
    "\n",
    "# 2 * 65536 = 131072 = 0x20000, i.e. one slot per code point in 0x0000..0x1FFFF.\n",
    "# (A preceding assignment of 65536 was immediately overwritten and has been dropped.)\n",
    "VOCAB_SIZE_TARGET = 2 * 65536"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "dcb8ba25-fb7b-40a4-9419-68e97512aae4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the tokenizer published as 'mistralai/Mistral-Nemo-Instruct-2407'.\n",
    "# NOTE(review): presumably this needs network access or a populated local cache - confirm.\n",
    "tokenizer_nemo = AutoTokenizer.from_pretrained('mistralai/Mistral-Nemo-Instruct-2407')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "754abf37-29c6-46ff-ab24-440d64d99e9f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('_bvv241-nemo/tokenizer_config.json',\n",
       " '_bvv241-nemo/special_tokens_map.json',\n",
       " '_bvv241-nemo/tokenizer.json')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Export the tokenizer files into output_dir; the displayed result lists the written paths\n",
    "# (tokenizer_config.json, special_tokens_map.json, tokenizer.json).\n",
    "tokenizer_nemo.save_pretrained(output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4e038761-39e4-4276-bd47-8348833abe45",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total: 131072\n",
      "Sample: ['Ġpainted', 'Ø¬Ø§Ø²', 'ĠAmin', 'ĠProvidence', 'ĠPicture', 'áĢ±áĢ¬áĢ', 'ĠØªØ¹Ø¯Ø§Ø¯', 'Âª', 'ĠBoxscore', 'ĠÑģÑĤÑĥ']\n"
     ]
    }
   ],
   "source": [
    "# Re-read the exported tokenizer definition and pull out its token -> id table.\n",
    "with open(output_dir + \"/tokenizer.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    mistral_tokens = json.loads(f.read())\n",
    "\n",
    "mistral_vocab = mistral_tokens[\"model\"][\"vocab\"]\n",
    "\n",
    "# Iterating a dict yields its keys, so this is the set of all token strings.\n",
    "mistral_tokens_set = set(mistral_vocab)\n",
    "\n",
    "token_total = len(mistral_tokens_set)\n",
    "print(f\"Total: {token_total}\")\n",
    "# A set has no defined order, so which ten tokens appear here is arbitrary.\n",
    "print(\"Sample:\", list(mistral_tokens_set)[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9214200d-c270-44a9-8228-11dc6a3f86f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shorter alias: `vocab` is the same dict object as `mistral_vocab` (no copy is made).\n",
    "vocab = mistral_vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b3181781-2661-49bc-9f91-8e7a62ece80d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "131072"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of entries in the vocabulary (the stored output shows 131072).\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "803f0a9e-2ca6-4d3f-a1ad-e9c1dffb7a94",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (token, id) pairs ordered by id; both dumps below are written in this order.\n",
    "vocab_by_id = sorted(vocab.items(), key=lambda pair: pair[1])\n",
    "\n",
    "# Human-readable listing: right-aligned id, repr of the token, and its length in characters.\n",
    "with open(os.path.join(output_dir, \"vocab.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for token, idx in vocab_by_id:\n",
    "        f.write(f\"{idx:>6} | {token!r:<12} | len={len(token)}\\n\")\n",
    "\n",
    "# Machine-readable token -> id mapping; non-ASCII characters are written unescaped.\n",
    "with open(os.path.join(output_dir, \"vocab.json\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(dict(vocab_by_id), f, indent=2, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6405817d-8f7b-45a8-9205-98421903ef61",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1c82d08e-3f22-463f-b10e-55a3cacff309",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image, ImageDraw, ImageFont, ImageFilter\n",
    "from IPython.display import display\n",
    "import unicodedata\n",
    "import math\n",
    "import random\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "167bdcab-16ac-4f8b-9456-47bb12e305ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "#https://en.wikipedia.org/wiki/GNU_Unifont"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03a5bd39-4ada-4a88-baca-982ae5803f85",
   "metadata": {},
   "source": [
    "> GNU Unifont is a free Unicode bitmap font created by Roman Czyborra. The main Unifont covers all of the Basic Multilingual Plane (BMP). (Source: <https://en.wikipedia.org/wiki/GNU_Unifont>)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0cd665d1-18b0-489a-8ed2-d7c71b4a1647",
   "metadata": {},
   "outputs": [],
   "source": [
    "#we use unifont-14.0.01.ttf for embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38fcd932-fb9c-413b-940c-19b0d27581c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download GNU Unifont 14.0.01 into the working directory. -nc (no-clobber) skips the\n",
    "# download when the file already exists, so re-running this cell does not fetch it again\n",
    "# or leave numbered duplicates next to the original.\n",
    "!wget -q -nc https://ftp.gnu.org/gnu/unifont/unifont-14.0.01/unifont-14.0.01.ttf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "2209e8ee-2663-47ab-8528-e20b37883905",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed both random number generators used in this notebook.\n",
    "np.random.seed(42)\n",
    "random.seed(42)\n",
    "\n",
    "# Every character is drawn into its own glyph_size cell using the Unifont face at font_size.\n",
    "font_size = 32\n",
    "glyph_size = (32, 32)\n",
    "font = ImageFont.truetype('unifont-14.0.01.ttf', font_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "2a4eaa5b-934c-4b01-8c93-5995f219c30b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def describe_char(c):\n",
    "    \"\"\"Print one ``label: value`` line per Unicode property of the single character ``c``.\n",
    "\n",
    "    The properties come from ``unicodedata`` and from ``str`` predicate methods. Values are\n",
    "    computed lazily, line by line, so a failure on one property still leaves the earlier\n",
    "    lines printed.\n",
    "    \"\"\"\n",
    "    properties = (\n",
    "        (\"symbol\", lambda ch: ch),\n",
    "        (\"ord\", lambda ch: f\"U+{ord(ch):04X}\"),\n",
    "        (\"name\", lambda ch: unicodedata.name(ch, 'UNKNOWN')),\n",
    "        (\"category\", unicodedata.category),\n",
    "        (\"Bidirectional\", unicodedata.bidirectional),\n",
    "        (\"Combining\", unicodedata.combining),\n",
    "        (\"Decomposition\", unicodedata.decomposition),\n",
    "        (\"Mirrored\", unicodedata.mirrored),\n",
    "        (\"Decimal value\", lambda ch: unicodedata.decimal(ch, '—')),\n",
    "        (\"isprintable\", str.isprintable),\n",
    "        (\"isspace\", str.isspace),\n",
    "        (\"isalpha\", str.isalpha),\n",
    "        (\"isupper?\", str.isupper),\n",
    "        (\"islower?\", str.islower),\n",
    "        (\"Title-case?\", str.istitle),\n",
    "    )\n",
    "    for label, probe in properties:\n",
    "        print(f\"{label}: {probe(c)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "080319d8-1a86-49d3-89bd-ccb6096dece9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_complex_script(s):\n",
    "    \"\"\"Return True if any character of ``s`` has a Unicode name containing a listed script keyword.\n",
    "\n",
    "    Characters without a Unicode name are looked up as the empty string and never match.\n",
    "    An empty ``s`` gives False.\n",
    "    \"\"\"\n",
    "    complex_keywords = (\n",
    "        'CJK', 'ARABIC', 'HEBREW', 'DEVANAGARI', 'BENGALI', 'THAI', 'KANNADA',\n",
    "        'TAMIL', 'TELUGU', 'KHMER', 'MYANMAR', 'SINHALA', 'SYRIAC',\n",
    "    )\n",
    "    char_names = (unicodedata.name(ch, '') for ch in s)\n",
    "    return any(\n",
    "        keyword in char_name\n",
    "        for char_name in char_names\n",
    "        for keyword in complex_keywords\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a58505d7-8881-4dcf-bf1a-b1f157c5ed15",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_easy_script(s):\n",
    "    \"\"\"Return True when every character of ``s`` lies in one of the listed Unicode blocks.\n",
    "\n",
    "    An empty ``s`` gives True, because no character falls outside the blocks.\n",
    "    \"\"\"\n",
    "    easy_blocks = (\n",
    "        (0x0000, 0x007F),  # Basic Latin\n",
    "        (0x0080, 0x00FF),  # Latin-1 Supplement\n",
    "        (0x0100, 0x024F),  # Latin Extended A/B\n",
    "        (0x0370, 0x03FF),  # Greek and Coptic\n",
    "        (0x0400, 0x04FF),  # Cyrillic\n",
    "        (0x0500, 0x052F),  # Cyrillic Supplement\n",
    "        (0x0530, 0x058F),  # Armenian\n",
    "    )\n",
    "    return all(\n",
    "        any(first <= ord(ch) <= last for first, last in easy_blocks)\n",
    "        for ch in s\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "553bc206-d629-440d-81b6-0903625e3db4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_row_major(img):\n",
    "    \"\"\"Flatten ``img`` into a list of bits, scanning rows top to bottom and left to right.\n",
    "\n",
    "    The image is first converted to mode '1'. A pixel whose value is falsy (0) becomes 1\n",
    "    and any other pixel becomes 0, so the bit list has ``width * height`` entries.\n",
    "    \"\"\"\n",
    "    mono = img.convert('1')\n",
    "    px = mono.load()\n",
    "    n_cols, n_rows = mono.size\n",
    "    return [\n",
    "        0 if px[col, row] else 1\n",
    "        for row in range(n_rows)\n",
    "        for col in range(n_cols)\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "1d4c8061-1fef-4976-b695-5bb730c1d365",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_polar_snake(img):\n",
    "    \"\"\"Flatten ``img`` into a list of bits ordered by distance from the image centre.\n",
    "\n",
    "    The image is converted to mode '1'. Each pixel is keyed by ``(radius, angle, x, y)``\n",
    "    measured from ``(width / 2, height / 2)``, and the pixels are emitted in ascending key\n",
    "    order: nearest first, ties broken by ``atan2`` angle. A falsy (0) pixel becomes 1 and\n",
    "    any other pixel becomes 0.\n",
    "    \"\"\"\n",
    "    mono = img.convert('1')\n",
    "    px = mono.load()\n",
    "    n_cols, n_rows = mono.size\n",
    "    center_x, center_y = n_cols / 2, n_rows / 2\n",
    "\n",
    "    # Tuples compare element by element, so sorting orders by radius, then angle.\n",
    "    keyed_pixels = sorted(\n",
    "        (\n",
    "            math.hypot(col - center_x, row - center_y),\n",
    "            math.atan2(row - center_y, col - center_x),\n",
    "            col,\n",
    "            row,\n",
    "        )\n",
    "        for row in range(n_rows)\n",
    "        for col in range(n_cols)\n",
    "    )\n",
    "    return [0 if px[col, row] else 1 for _radius, _angle, col, row in keyed_pixels]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "903751a3-37ec-4c6b-9c55-e8982264b2b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def augment_image(img, chars):\n",
    "    \"\"\"Distort a rendered glyph image in a fixed, repeatable way and binarise it.\n",
    "\n",
    "    Text made only of \"easy\" scripts (as decided by ``is_easy_script``) is rotated by up to\n",
    "    7 degrees, shifted by up to 3 px, speckled with about 3% black noise and lightly blurred.\n",
    "    Any other text keeps its geometry, gets about 0.1% noise and no blur.\n",
    "\n",
    "    Every call draws from generators freshly seeded with 42, so the random draws do not\n",
    "    depend on how many times the function has been called before.\n",
    "\n",
    "    Args:\n",
    "        img: PIL image to distort; the callers in this notebook pass mode \"L\" images with a\n",
    "            white (255) background.\n",
    "        chars: The text rendered in ``img``; only used to pick the distortion strength.\n",
    "\n",
    "    Returns:\n",
    "        The distorted image, thresholded at 128 and converted to mode '1'.\n",
    "    \"\"\"\n",
    "    is_complex = not is_easy_script(chars)\n",
    "\n",
    "    # Private generators seeded with 42 produce the same draws as re-seeding the global\n",
    "    # `random` / `np.random` state would, without clobbering that global state for callers.\n",
    "    py_rng = random.Random(42)\n",
    "    np_rng = np.random.RandomState(42)\n",
    "\n",
    "    max_rotation = 0 if is_complex else 7\n",
    "    max_shift = 0 if is_complex else 3\n",
    "    noise_level = 0.001 if is_complex else 0.03\n",
    "    apply_blur = not is_complex\n",
    "\n",
    "    # Rotate about the centre, filling uncovered corners with white.\n",
    "    angle = py_rng.uniform(-max_rotation, max_rotation)\n",
    "    img = img.rotate(angle, expand=0, fillcolor=255)\n",
    "\n",
    "    # Translate by pasting onto a fresh white canvas of the same size.\n",
    "    dx = py_rng.randint(-max_shift, max_shift)\n",
    "    dy = py_rng.randint(-max_shift, max_shift)\n",
    "    shifted = Image.new(\"L\", img.size, color=255)\n",
    "    shifted.paste(img, (dx, dy))\n",
    "\n",
    "    # Pepper noise: each pixel turns black with probability noise_level.\n",
    "    np_img = np.array(shifted)\n",
    "    noise_mask = np_rng.rand(*np_img.shape) < noise_level\n",
    "    np_img[noise_mask] = 0\n",
    "\n",
    "    img = Image.fromarray(np_img)\n",
    "    if apply_blur:\n",
    "        img = img.filter(ImageFilter.GaussianBlur(radius=0.5))\n",
    "\n",
    "    # Threshold back to pure black/white.\n",
    "    return img.point(lambda p: 0 if p < 128 else 255, mode='1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "5fc61ccb-b581-4b0a-8918-9804e00a7e54",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 32\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APd7P7YYi175CyNtIjhyRH8o3LuON/zbsNtXggbcjJpxTW2lXUFnMJPPvXRBdvGgN5OImJ3bAPnEcGSSqrjaF9Bchvref7OFk2SXERmjhlUxyFBtydjYYY3qDkcFgDg1YqnqF4um282o3VxHFp9rbyTXGYmZgFAbcCD0Ch8jaScjBGMHy+Lwld+L7fU/GV9DJc61C9xJ4WlEiIpt8LLZuVGBkMWIEnPzneDgY7Dwz4hudR8NGZ547zUrJI7C9h8h0KaioAlV3RWGzc6ZZEKqAzZI+70ljbva2cdvJL5vl5VGJYnYCdoYszMzBcAsT8xBPGcDn/GUP/FvPEl1PbQRX0uiTpOYjv8AuxOQu8gFlBZsZA6k4GTXF+F/CPji98IeHbm1+I8lpbiyglt7ZdIiYRKYcBGO4eYArY+bPIBxkAjQ+DdnNa2/i83d3Jd3g8R3UU85yiysoXLiMHahJYk49hkhRj0ysfXdPvNb0XU9IxBDDfRS2vn+YWZI3hI37Noywc427gNvO7Py1H4c02+0vwrpmnGeNXtbe2hXzbfDKiIiurhZCC52vgq2BuHDbctT8J+Fv7D/ALWllnnZr3W7rU1Q/u9nmfIFIR2Drgbhu9VJVWXjoPJ8iffbW0A8+XfdPnYxwm0NwDvb5UXkj5e/ygH/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAAeklEQVR4AU2OzQmEUAyE5wUP3rQCfaVYipWsrxSb8Kx9CP4cPa23FRTHUVgQwpdkJgkBR9IRgCGI6mi1CvBLgnt14yfMMiSHCM0SbIvXzmIkhTQH45zpgMJWzVh6KJ3Inw24j/OlBdz+G5PawwuRjEFOT+5g1f4fknQBwWAyUQX0jhAAAAAASUVORK5CYII=",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APd7P7YYi175CyNtIjhyRH8o3LuON/zbsNtXggbcjJpxTW2lXUFnMJPPvXRBdvGgN5OImJ3bAPnEcGSSqrjaF9Bchvref7OFk2SXERmjhlUxyFBtydjYYY3qDkcFgDg1YqnqF4um282o3VxHFp9rbyTXGYmZgFAbcCD0Ch8jaScjBGMHy+Lwld+L7fU/GV9DJc61C9xJ4WlEiIpt8LLZuVGBkMWIEnPzneDgY7Dwz4hudR8NGZ547zUrJI7C9h8h0KaioAlV3RWGzc6ZZEKqAzZI+70ljbva2cdvJL5vl5VGJYnYCdoYszMzBcAsT8xBPGcDn/GUP/FvPEl1PbQRX0uiTpOYjv8AuxOQu8gFlBZsZA6k4GTXF+F/CPji98IeHbm1+I8lpbiyglt7ZdIiYRKYcBGO4eYArY+bPIBxkAjQ+DdnNa2/i83d3Jd3g8R3UU85yiysoXLiMHahJYk49hkhRj0ysfXdPvNb0XU9IxBDDfRS2vn+YWZI3hI37Noywc427gNvO7Py1H4c02+0vwrpmnGeNXtbe2hXzbfDKiIiurhZCC52vgq2BuHDbctT8J+Fv7D/ALWllnnZr3W7rU1Q/u9nmfIFIR2Drgbhu9VJVWXjoPJ8iffbW0A8+XfdPnYxwm0NwDvb5UXkj5e/ygH/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAAeklEQVR4AU2OzQmEUAyE5wUP3rQCfaVYipWsrxSb8Kx9CP4cPa23FRTHUVgQwpdkJgkBR9IRgCGI6mi1CvBLgnt14yfMMiSHCM0SbIvXzmIkhTQH45zpgMJWzVh6KJ3Inw24j/OlBdz+G5PawwuRjEFOT+5g1f4fknQBwWAyUQX0jhAAAAAASUVORK5CYII=",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 64\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgAEABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe4I2ht4onmkndECtLIFDOQPvHaAMnrwAPQCs+yjhjFkdJEb2dwgnluwRN56iNUQmQvuZ2GwhzvysZBIJU1YmvF0zSxc6ncR5jRRLJFEwDucDCJlmyzEBUBYkkAZPW5VdF8m8ZUinZZ8yvKZNyIwCKFALZXI5AUbflYnBPzZ+raxYaJeQS6lr9jp8M3AhvpI4w4UNuMZJU7svHkksMKAAC26rkF015cRS2zxyae9uJVnXa6zlzlSjh+gUEnK4O9SG4YVJYzXE9nHJd2v2W45EkQkDgEEjKsOqnGQSAcEZCnIHN+K7hdN8Ga/NDJd/2nFpk1v9vFuySs8du8isZEUAAbmIYYUOxUYY4rl/DXw3tPEPhXSL/XvEfifVYL6yhuLnT7vVHa3kZkDYKgBsBiCPm6gda3PhlfXFxp2vWEsm610jW7jTLFNoHlW0QQRpnq2B3bJPcmuw+0PHP5c8WFkl8uBog0m4bNxL4XEfIYckjhecsFrP8AFl9caZ4N1y/s5PLurXT7iaF9oO11jYqcHg4IHWuP8IeD/Dd94ettfvfCsF9qeqxW1zeT3Mcc3nySoryTIrsVRd0jkgBT8pwpwudTwNo+paQ+tfaII2il1iZLV3ZkMVgq/uo41K5CI5ZVT5VAZmUkEbu0rm/HRm/4QXxSrRxiAaPcFHDksW8qTcCuMAAbcHJzk8DHPD+EfhvpGs+FdGXVPEfiDUIJtMtrifR5tUP2cK6ZQeWoDBAynbz1Tvg1ufDJpbzw9r2h6jL9usdK1W40e3S4jQ5tYkRFRwFAbgnJI5zXaCax08vABHaoHVyTH5cbPNIQMNgKzs5OQCTlhn7wzl+O/wDknniX/sFXX/opqr/Dy/s7vwD4eitruCaSLSrXzEjkDFPk2cgdPmR1+qMOoNHgjUoNQs9XihmnlkstVuLW4MlvFAgnBDSeUqf8syzFgXLOdx3MTzXUVj6/Zv4g8Jazp1ofLmvLS4tIzOjRgOVaPJyM7c9wDkcjIIzw+n6H8WLPQ7DQ4dS8KWNrbRQ2wvII5pZ0jTaNwVxsZto6EAHPbqOw8N+Hn8NWskMTQXMl5dm6vrjDRNJK0QEku3LAs0i52rsVVbjlfm2A9xBZw+an2q4+RJDAoQEkgM4Vm4UZLY3E4GBuOM13tbPU9JutIurj7fGYjZ3m5wHbdGNwfZjazKwPAX7wIAGK4uT4d6m7w2kvjLxBFpcL+Tp9tpDx2YsoAp2rI+S0wAVEBOWzzjliOs8N2Wnadp0lnp0M6+TKY7ia4tmikuZgAGmdiq+azYBMgyGPc1sV/9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEAAAAAgAQAAAACK5kMpAAAAw0lEQVR4ASWPwU3EMBBFn2d9ZlNChCiAEnKgADrYEmgAaY3EnRZSAiWkEETSwXpPICWb4Y8ZeeQ3f77tMT57C2OEBcU56os7vgrXH08eMhTjF606SPEUklHUvBa4bDKXF7djln58xMqhxyvk0qxhbi9sDXq+n7Av2LhfsAf5YFDrNk8wCd7T8K8cRtB0mddeIHPVBgXrGoztwhpsVO74DOiUzwER10VwW3jrVOyczv6hD+5p9QhLe941CDbFgTxCyLPyD4VCWx0yyGQCAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=64x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe5hMyAQSRo+9SS6Fht3DcMAjkrkA9iQcHGCCZWuHgAk3oiuSY2C4YkDDYwT8pyAcjjOMjNPWLlLGzS9m1WDTbe3lR5pbnaImQnaUYsRjO7ggjDbfvDKtYiuJZJVRrKeNT5mXcphdrBVzhifnB3DjoPm2niuD1LwxaeKPHWpabJqOs2Wm2VlbNNp9lO9tb3TTy3DS71AxIHAAZlPJLAnIOOo8M6pNeeCtC1K/eSa4u7K1eV0iJLSSKmTtQcDc2ScAAZJwAay9Q0PS/Evj6WLW7CC/h0vT7ea0inTciPM9wshK9GyIo/vA425GDk1qeELW5svCunW12kfmpbxl5V3hpnZFaSR1dFZXaQyEhhk9TgkgcnrPhu38TfEO6h1fVL6wj/ALPtYfsVjqBijv0aW8PlyDaC+Y4ySoxgF8Egbq6TwvLDfaD4d12+t45dYv8ATIEkvVthvO6PzWDMq/IhYE4OFyQByQDHDf2dv8TtQs5ruCO6utKsvs8LyAPLtkuy21Ty2BycdK1PDU1tc+FdInshILSSyheESRojBCgK5VAEU4xwoAHbiub1jTvGF54hOq+F7vSrOx1HT7e3nl1CCb7TBteVg6REAbgJs7X7jBA5rrNMs102yj0+C3jgs7REgtFWVnPlKigbsjIIOR1bgA5ySBz/AIq8K3esXsV1Y6zqWmpKiW9+umlEnuI1cmIrK5/dCNnkZtoyysRzwK6SwFuunWws4PItREghh8kxeWmBtXYQCmBgbSBjpgV//9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAAe0lEQVR4ATWOyw3CMBBEZyc5cIgogkrSmV0CJVAGN1yKK4gjlAuS8bCL4LAfrd7TrAkAvaJ1cPi0/+0ONQlSFx84Ci+YCi07d/5pNdwb60BGS022vU5XVPecQw4uswS8xuYRYJ8PcMYCd98Zm3bxiQm0ukT4IJKUvm98ADbDMnRQfOpHAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Smoke test: render two sample strings, show each augmented bitmap at full width and\n",
    "# again after squeezing it to 32x32, and compute the polar bit list for both versions.\n",
    "images = []\n",
    "for text in ['A', 'BC']:  # , '꧄']\n",
    "    cell_w, cell_h = glyph_size\n",
    "    img_width = cell_w * len(text)\n",
    "    img_height = cell_h\n",
    "    img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "    draw = ImageDraw.Draw(img)\n",
    "\n",
    "    for i, char in enumerate(text):\n",
    "        try:\n",
    "            left, top, right, bottom = font.getbbox(char)\n",
    "            ink_w = right - left\n",
    "            ink_h = bottom - top\n",
    "            # Centre the glyph's bounding box inside the i-th cell.\n",
    "            x = i * cell_w + (cell_w - ink_w) // 2\n",
    "            y = (cell_h - ink_h) // 2 - top\n",
    "            draw.text((x, y), char, fill=0, font=font)\n",
    "        except Exception as e:\n",
    "            print(f\"Error '{char}': {e}\")\n",
    "\n",
    "    print(img_height, img_width)\n",
    "\n",
    "    img_aug = augment_image(img, text)\n",
    "    images.append(img_aug)\n",
    "    display(img_aug)\n",
    "    snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "\n",
    "    img_aug = img_aug.resize((32, 32), Image.BILINEAR)  # LANCZOS)\n",
    "    display(img_aug)\n",
    "    reduced = image_to_bitlist_polar_snake(img_aug)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "55b6b23f-8107-4f2b-99ca-a00f0907952a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "symbol: 1\n",
      "ord: U+0031\n",
      "name: DIGIT ONE\n",
      "category: Nd\n",
      "Bidirectional: EN\n",
      "Combining: 0\n",
      "Decomposition: \n",
      "Mirrored: 0\n",
      "Decimal value: 1\n",
      "isprintable: True\n",
      "isspace: False\n",
      "isalpha: False\n",
      "isupper?: False\n",
      "islower?: False\n",
      "Title-case?: False\n"
     ]
    }
   ],
   "source": [
    "# Example: print the Unicode properties of the digit one.\n",
    "describe_char('1') #'꧄')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "c1d96ab7-a714-472a-b719-611956d96cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render every vocabulary token to a bitmap, augment it, squeeze it into one glyph cell and\n",
    "# write two lines per token to embeddings_nemo.txt: the token id, then the polar bit list.\n",
    "# Tokens are processed in ascending id order, which is the row order the loader relies on.\n",
    "tokens_by_id = sorted(vocab.items(), key=lambda x: x[1])\n",
    "\n",
    "with open(os.path.join(output_dir, \"embeddings_nemo.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    # tqdm (imported in the first cell) reports progress for this long-running loop.\n",
    "    for token, idx in tqdm(tokens_by_id, desc=\"rendering tokens\"):\n",
    "        text = token\n",
    "        img_width = glyph_size[0] * len(text)\n",
    "        img_height = glyph_size[1]\n",
    "        img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "        draw = ImageDraw.Draw(img)\n",
    "\n",
    "        for i, char in enumerate(text):\n",
    "            try:\n",
    "                bbox = font.getbbox(char)\n",
    "                w = bbox[2] - bbox[0]\n",
    "                h = bbox[3] - bbox[1]\n",
    "                # Centre the glyph's bounding box inside the i-th cell.\n",
    "                x = i * glyph_size[0] + (glyph_size[0] - w) // 2\n",
    "                y = (glyph_size[1] - h) // 2 - bbox[1]\n",
    "                draw.text((x, y), char, fill=0, font=font)\n",
    "            except Exception as e:\n",
    "                # Best effort: report the character and keep whatever was drawn so far.\n",
    "                print(f\"Error '{char}': {e}\")\n",
    "\n",
    "        img_aug = augment_image(img, text)\n",
    "\n",
    "        # Multi-character tokens are wider than one cell; squeeze them to glyph_size so every\n",
    "        # token yields the same number of bits.\n",
    "        if img_aug.size != glyph_size:\n",
    "            # NOTE(review): img_aug is mode '1'; Pillow appears to fall back to NEAREST for\n",
    "            # such images, so BILINEAR may have no effect here - confirm before changing.\n",
    "            img_aug = img_aug.resize(glyph_size, Image.BILINEAR)\n",
    "\n",
    "        snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "\n",
    "        f.write(f\"{idx}\\n{snake_bits}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "3377785a-4fb3-43d5-a16e-de610dcc2838",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_binary_file_np(filename, expected_bits=1024):\n",
    "    \"\"\"Load the bit lists written to an embeddings text file into a 2-D uint8 array.\n",
    "\n",
    "    The file holds two lines per record: an index line followed by a bracketed,\n",
    "    comma-separated list of integers. The index is not used for placement; rows are returned\n",
    "    in file order. Blank lines between records (for example a trailing newline) are skipped.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the UTF-8 text file to read.\n",
    "        expected_bits: Required number of values in every list (default 1024).\n",
    "\n",
    "    Returns:\n",
    "        ``np.ndarray`` of dtype uint8 with one row per record; an empty file gives an\n",
    "        empty array.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a record's list does not contain exactly ``expected_bits`` values.\n",
    "    \"\"\"\n",
    "    data = []\n",
    "    # The file is written with encoding='utf-8', so read it back the same way.\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        while True:\n",
    "            index_line = f.readline()\n",
    "            if not index_line:\n",
    "                break\n",
    "            if not index_line.strip():\n",
    "                continue\n",
    "            list_line = f.readline()\n",
    "            payload = list_line.replace('[', '').replace(']', '').strip()\n",
    "            # An empty payload (truncated record) is reported below instead of being parsed.\n",
    "            bits = np.fromstring(payload, dtype=int, sep=',') if payload else np.empty(0, dtype=int)\n",
    "            if bits.size != expected_bits:\n",
    "                raise ValueError(\n",
    "                    f\"Invalid bit list length for index {index_line.strip()!r}: \"\n",
    "                    f\"expected {expected_bits}, got {bits.size}\"\n",
    "                )\n",
    "            data.append(bits)\n",
    "    return np.array(data, dtype=np.uint8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c67c5539-6edc-4cfa-8516-2d26bfca839f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the bit lists written above back into a uint8 matrix with one row per record.\n",
    "data_embeddings = load_binary_file_np(os.path.join(output_dir, \"embeddings_nemo.txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "902ee687-1805-4348-82c2-71bb1d7e9ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "6053fbc0-ef14-4268-81a3-908cd2a0dd5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_pca(data, n_components=1024):\n",
    "    \"\"\"Fit a PCA on ``data`` and return the projected data together with the fitted model.\n",
    "\n",
    "    Args:\n",
    "        data: 2-D array-like passed to ``PCA.fit_transform``.\n",
    "        n_components: Value forwarded to ``PCA(n_components=...)``. The default of 1024\n",
    "            matches the value that used to be hard-coded, so existing calls are unchanged.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(transformed, pca)``: the result of ``fit_transform`` and the ``PCA`` object.\n",
    "    \"\"\"\n",
    "    # Honour the caller's n_components instead of silently ignoring it.\n",
    "    pca = PCA(n_components=n_components)\n",
    "    transformed = pca.fit_transform(data)\n",
    "    return transformed, pca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "012b9fb5-813c-4e75-b06f-b02d1ef1769c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast the uint8 bit matrix to float32 before fitting; `_pca` keeps the fitted PCA object.\n",
    "data_embeddings_pca, _pca = apply_pca(data_embeddings.astype(np.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "0d60cfca-b0e1-49eb-b1f0-2faecc29e20e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale every row (axis=1) of the PCA output to unit L2 norm.\n",
    "normalized_embeddings = normalize(data_embeddings_pca, norm='l2', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "2612512c-3d9b-4cfe-8687-89bb86110a63",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "1f68ff20-261e-4ca5-a2a1-4ed164547852",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the normalised matrix to a float32 torch tensor and save it next to the\n",
    "# tokenizer files as normalized_embeddings_weights.pt.\n",
    "tensor_normalized_embeddings_weights = torch.tensor(normalized_embeddings, dtype=torch.float32)\n",
    "torch.save(tensor_normalized_embeddings_weights, os.path.join(output_dir, \"normalized_embeddings_weights.pt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7dbd352-7152-41e7-aef9-345cc8c3a19f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "356e06bb-d60f-4f79-8791-6df41815b2d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "7928ec7f-a2db-4b06-a313-0981b494d483",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Round-trip check: reload the tokenizer from the files exported into output_dir.\n",
    "# The trailing expression displays whether it is a fast tokenizer (stored output: True).\n",
    "tokenizer = AutoTokenizer.from_pretrained(output_dir) \n",
    "tokenizer.is_fast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "89d2ccbe-cbc4-499a-a72e-f93e3a2c6a17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode a sample sentence with the reloaded tokenizer.\n",
    "ids = tokenizer.encode('hello the test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "5ede5794-25fb-4d36-886e-904fea1797e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 29706, 1278, 2688]\n"
     ]
    }
   ],
   "source": [
    "# Show the token ids (stored output: [1, 29706, 1278, 2688]).\n",
    "print(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "c0e2cdf1-65b3-43e9-82db-66488a305197",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<s>hello the test\n"
     ]
    }
   ],
   "source": [
    "# Decode the ids back to text; the stored output starts with the '<s>' token.\n",
    "print(tokenizer.decode(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9de6442-d59e-4f23-bba4-5529068dff45",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3_11",
   "language": "python",
   "name": "python3_11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
