{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c078adf3-b73f-4d4a-a90a-c39743f7c93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "from datasets import load_dataset\n",
    "import unicodedata\n",
    "import tiktoken\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from datasets import get_dataset_config_names, load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8355cc14-553e-454e-bf34-377536fa2e24",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dc11ddc0-c349-4ed9-9c17-c698cb859407",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_list_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.writelines(f\"{item}\\n\" for item in data_list)\n",
    "\n",
    "def load_list_from_file(filename):\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        return [line.strip() for line in f]\n",
    "\n",
    "def save_results_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.write(str(data_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f2eea52-1de8-4e7f-9ddb-723eea61961f",
   "metadata": {},
   "outputs": [],
   "source": [
    "######### TOKENIZER #########"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1ecb3167-54e8-4cc9-92cf-a74a0f49d370",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import ast\n",
    "\n",
    "from tokenizers import Tokenizer\n",
    "from tokenizers.models import BPE\n",
    "from tokenizers.pre_tokenizers import Whitespace\n",
    "from collections import defaultdict\n",
    "from tokenizers.pre_tokenizers import PreTokenizer\n",
    "from transformers import PreTrainedTokenizerFast\n",
    "\n",
    "from tokenizers import (\n",
    "    decoders,\n",
    "    models,\n",
    "    normalizers,\n",
    "    pre_tokenizers,\n",
    "    processors,\n",
    "    trainers,\n",
    "    Tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d4baa3a5-b167-4c00-bf4d-aed6beea0457",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the n-gram statistics file; its content is a Python literal.\n",
    "input_file = \"_n-gramms-2-3-4-5.txt\"\n",
    "with open(input_file, mode=\"r\", encoding=\"utf-8\") as f:\n",
    "    raw_text = f.read()\n",
    "\n",
    "# literal_eval only accepts Python literals, so the file content is never executed.\n",
    "ngram_data = ast.literal_eval(raw_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1d78fd3a-6f2d-4a67-b2ba-3c5721db054e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lines of '_n-gramms-intersection.txt', de-duplicated into a set.\n",
    "# NOTE(review): judging by the variable name this is the token intersection of the\n",
    "# o200k_base, cl100k_base, Mistral, Qwen and Qwen3 vocabularies - confirm.\n",
    "n_gramms_intersection_o200k_base_cl100k_base_mistral_tokens_qwen_tokens_qwen_3_tokens = set(load_list_from_file('_n-gramms-intersection.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "92e896bd-5c5b-4151-a895-cf9face5f558",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = \"_bvv241-max\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "languages =     [\"en\", \"ru\", \"fr\", \"de\", \"zh\", \"he\", \"it\", \"es\", \"ar\", \"pt\", \"ko\", \"hu\", \"sa\", \"la\", \"ja\", \"el\",     \"sv\", \"nl\", \"pl\", \"vi\", \"fa\", \"no\", \"tr\", \"fi\", \"cs\", \"hy\", \"da\", \"bn\", \"az\", \"ka\", \"hi\", \"id\"]\n",
    "top_cnt = 906 #906 is optimal for 2-3-3-4-5-gramms fitting in unused unicod ranges (and > 65535). 1000 for 2-3-4-5   171 for 2-3-4  800 for 2 (bigramms)\n",
    "\n",
    "UNICODE_LIMIT =   0xD800 # from 0 до D7FF monograms\n",
    "SURROGATE_START = 0xD800\n",
    "SURROGATE_END =   0xE000\n",
    "\n",
    "SPECIAL_TOKENS_START = 0xE000\n",
    "SPECIAL_TOKENS_LIMIT = SPECIAL_TOKENS_START + 256  # 0xE100 = 57600\n",
    "NGRAM_START = SPECIAL_TOKENS_LIMIT  # 0xE100 = 57600\n",
    "NGRAM_LIMIT = 0xF900\n",
    "\n",
    "VOCAB_SIZE_TARGET = 65536\n",
    "\n",
    "NGRAM_START_EXT = 0x10000\n",
    "NGRAM_LIMIT_EXT = 0x1FFFF\n",
    "\n",
    "VOCAB_SIZE_TARGET = 2 * 65536"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e038761-39e4-4276-bd47-8348833abe45",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9214200d-c270-44a9-8228-11dc6a3f86f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token -> id mapping. Every code point below UNICODE_LIMIT becomes a one-character\n",
    "# token whose id is the code point itself. chr() accepts every value in this range,\n",
    "# so no ValueError handling is needed, and the keys are distinct by construction.\n",
    "vocab = {chr(cp): cp for cp in range(0x0000, UNICODE_LIMIT)}\n",
    "\n",
    "# Ids SURROGATE_START..SURROGATE_END-1 are deliberately left empty in this cell.\n",
    "\n",
    "special_tokens = [\n",
    "    \"<pad>\", \"<s>\", \"</s>\", \"<unk>\",\n",
    "    \"<think>\", \"</think>\", \"<emotions>\", \"</emotions>\",\n",
    "    \"<tool_call>\", \"</tool_call>\", \"<tool_response>\", \"</tool_response>\",\n",
    "    \"[INST]\", \"[/INST]\", \"[EOT]\", \"[USER]\", \"[ASSISTANT]\",\n",
    "    \"[FIM_PREFIX]\", \"[FIM_MIDDLE]\", \"[FIM_SUFFIX]\", \"[FIM_PAD]\",\n",
    "    \"[REPO_NAME]\", \"[FILE_SEP]\",\n",
    "]\n",
    "\n",
    "# Fail loudly if the named tokens would spill past the reserved id range.\n",
    "reserved_special_ids = SPECIAL_TOKENS_LIMIT - SPECIAL_TOKENS_START\n",
    "if len(special_tokens) > reserved_special_ids:\n",
    "    raise ValueError(\n",
    "        f\"{len(special_tokens)} special tokens do not fit into {reserved_special_ids} reserved ids\"\n",
    "    )\n",
    "\n",
    "for i, tok in enumerate(special_tokens):\n",
    "    vocab[tok] = SPECIAL_TOKENS_START + i\n",
    "\n",
    "# Fill the rest of the reserved special-token range with numbered placeholders.\n",
    "for cp in range(SPECIAL_TOKENS_START + len(special_tokens), SPECIAL_TOKENS_LIMIT):\n",
    "    vocab[f\"<special_token_{cp}>\"] = cp\n",
    "\n",
    "# Single code points from NGRAM_LIMIT up to 0xFFFE; range() excludes 0xFFFF itself.\n",
    "for cp in range(NGRAM_LIMIT, 0xFFFF):\n",
    "    vocab.setdefault(chr(cp), cp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b3181781-2661-49bc-9f91-8e7a62ece80d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57343"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary size after single code points, special tokens and reserved placeholders.\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "692ec456-a7ef-4510-a883-1518c0df60a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect, for every selected language, the first `top_cnt` n-grams of each n-gram order.\n",
    "ngram_set = set()\n",
    "for entry in ngram_data:\n",
    "    if entry[\"lang\"] in languages:\n",
    "        top_ngrams = entry[\"top_ngrams\"]\n",
    "        for n in sorted(top_ngrams, reverse=True):\n",
    "            # Each list item is a (gram, value) pair; only the gram is kept.\n",
    "            ngram_set.update(gram for gram, _ in top_ngrams[n][:top_cnt])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c52cca4d-88c1-44ee-9ebc-9572ea14b755",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59955"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct n-grams collected from the per-language top lists.\n",
    "len(ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fa4b4da3-94c2-4ea9-81cc-a532d084947b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Multi-character entries of the intersection set.\n",
    "ngram_list = [\n",
    "    ng\n",
    "    for ng in n_gramms_intersection_o200k_base_cl100k_base_mistral_tokens_qwen_tokens_qwen_3_tokens\n",
    "    if len(ng) > 1\n",
    "]\n",
    "\n",
    "# Shortest first. The token itself is the tie-breaker: the candidates come from a set,\n",
    "# whose iteration order for strings is not stable between interpreter runs, so sorting\n",
    "# by length alone would make the 65536-item cut-off below non-reproducible.\n",
    "ngram_list_sorted = sorted(ngram_list, key=lambda ng: (len(ng), ng))\n",
    "\n",
    "selected_ngrams = set(ngram_list_sorted[0:65536])\n",
    "\n",
    "# Union of the per-language n-grams and the selected intersection tokens.\n",
    "ngram_new = ngram_set.union(selected_ngrams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "048fd57c-b5e3-47f9-8bd7-585bb5703cc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# From here on `ngram_set` refers to the union built in the previous cell.\n",
    "ngram_set = ngram_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2efa8efc-0b4b-458a-a3c9-dba4867c36ea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "73768"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Size of the merged n-gram set.\n",
    "len(ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c90e32c4-9077-443d-966a-51e19b68f75c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SURROGATE_END next_id= 55296 57344 57344\n"
     ]
    }
   ],
   "source": [
    "# Assign ids starting at SURROGATE_START to the n-grams, longest first\n",
    "# (equal lengths in lexicographic order), until SURROGATE_END is reached.\n",
    "next_id = SURROGATE_START\n",
    "added_ng = []\n",
    "ngrams_longest_first = sorted(ngram_set, key=lambda x: (-len(x), x))\n",
    "for token_id, ng in enumerate(ngrams_longest_first, start=SURROGATE_START):\n",
    "    vocab[ng] = token_id\n",
    "    added_ng.append(ng)\n",
    "    next_id = token_id + 1\n",
    "    if next_id < SURROGATE_END:\n",
    "        continue\n",
    "    # The range is full: report it and stop.\n",
    "    print('SURROGATE_END next_id=',SURROGATE_START, SURROGATE_END, next_id)\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3d45c718-3fb4-4ffc-806e-6977ce2690ab",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59391"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary size after filling the surrogate id range with n-grams.\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "919a9a10-0e94-4d4e-8cf3-dcf2e7a9368d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NGRAM_LIMIT next_id= 57600 63744 63744\n"
     ]
    }
   ],
   "source": [
    "# Continue with the n-grams that have no id yet, using ids NGRAM_START..NGRAM_LIMIT-1.\n",
    "next_id = NGRAM_START\n",
    "# Set mirror of `added_ng`: testing membership on the list is a linear scan, which\n",
    "# made this loop quadratic in the number of n-grams.\n",
    "added_ng_lookup = set(added_ng)\n",
    "for ng in sorted(ngram_set, key=lambda x: (-len(x), x)):\n",
    "    if ng in added_ng_lookup:\n",
    "        continue\n",
    "    vocab[ng] = next_id\n",
    "    added_ng.append(ng)\n",
    "    added_ng_lookup.add(ng)\n",
    "    next_id += 1\n",
    "    if next_id >= NGRAM_LIMIT:\n",
    "        print('NGRAM_LIMIT next_id=',NGRAM_START, NGRAM_LIMIT, next_id)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "8811ffed-2eab-4cdc-b740-305488df6731",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# Ids still free below NGRAM_LIMIT (0 when the range was filled completely).\n",
    "print(NGRAM_LIMIT - next_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9f81e9bc-ca7e-4181-afb7-88a8271e700c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65535"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary size after filling the NGRAM_START..NGRAM_LIMIT range.\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "62fd04d7-11b0-4eb9-bfed-530f7d0fee82",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NGRAM_LIMIT_EXT next_id= 65536 131071 131071\n",
      "0\n",
      "131070\n"
     ]
    }
   ],
   "source": [
    "# Place the remaining n-grams in the extended id range starting at NGRAM_START_EXT.\n",
    "next_id = NGRAM_START_EXT\n",
    "# Set mirror of `added_ng`: testing membership on the list is a linear scan, which\n",
    "# made this loop quadratic in the number of n-grams.\n",
    "added_ng_lookup = set(added_ng)\n",
    "for ng in sorted(ngram_set, key=lambda x: (-len(x), x)):\n",
    "    if ng in added_ng_lookup:\n",
    "        continue\n",
    "    vocab[ng] = next_id\n",
    "    added_ng.append(ng)\n",
    "    added_ng_lookup.add(ng)\n",
    "    next_id += 1\n",
    "    if next_id >= NGRAM_LIMIT_EXT:\n",
    "        # NOTE(review): the loop stops before the id NGRAM_LIMIT_EXT itself is\n",
    "        # assigned - confirm that leaving it free is intended.\n",
    "        print('NGRAM_LIMIT_EXT next_id=',NGRAM_START_EXT, NGRAM_LIMIT_EXT, next_id)\n",
    "        break\n",
    "\n",
    "# Free ids left in the extended range, then the vocabulary size so far.\n",
    "print(NGRAM_LIMIT_EXT - next_id)\n",
    "print(len(vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "21425c91-64db-407f-820e-3c4614e62e86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give every id in [0, VOCAB_SIZE_TARGET) that no token uses yet a placeholder token.\n",
    "used_ids = set(vocab.values())\n",
    "free_ids = [fill_id for fill_id in range(0, VOCAB_SIZE_TARGET) if fill_id not in used_ids]\n",
    "vocab.update((f\"<unused_{fill_id}>\", fill_id) for fill_id in free_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "e551523c-ac5c-4dff-b4e2-349fc59e330c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "131072"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary size after adding the `<unused_N>` placeholders.\n",
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "3caa6318-b4d2-4031-8372-38dd45e7681a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_merges_from_ngrams(vocab, ngram_set):\n",
    "    merges = []\n",
    "    for piece in sorted(ngram_set, key=lambda x: (-len(x), x)):\n",
    "        if piece not in vocab:\n",
    "            continue\n",
    "        i = 1\n",
    "        while i <= len(piece):\n",
    "            left = piece[:i]\n",
    "            right = piece[i:]\n",
    "            if left in vocab and right in vocab:\n",
    "                merges.append((left, right))\n",
    "                #break\n",
    "            i += 1\n",
    "    return merges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b6706826-c733-412e-b344-831063070112",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge pairs for the n-grams that received a vocabulary id.\n",
    "merges = generate_merges_from_ngrams(vocab, ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "cebe3051-62d5-415a-ad57-452ceac4a295",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "182678"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of generated merge pairs.\n",
    "len(merges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "803f0a9e-2ca6-4d3f-a1ad-e9c1dffb7a94",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# Build the BPE tokenizer with no pre-tokenizer, no post-processor and an empty\n",
    "# decoder chain, then register the named special tokens.\n",
    "tokenizer = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token=\"<unk>\"))\n",
    "tokenizer.pre_tokenizer = None\n",
    "tokenizer.decoder = decoders.Sequence([])\n",
    "tokenizer.post_processor = None\n",
    "tokenizer.add_special_tokens(special_tokens)\n",
    "tokenizer.save(os.path.join(output_dir, \"tokenizer.json\"))\n",
    "\n",
    "# Plain-text merge list, one \"left right\" pair per line.\n",
    "# NOTE(review): pairs whose halves contain a space are ambiguous in this format.\n",
    "with open(os.path.join(output_dir, \"merges.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for m in merges:\n",
    "        f.write(f\"{m[0]} {m[1]}\\n\")\n",
    "\n",
    "# Both vocabulary dumps list the tokens in id order.\n",
    "vocab_by_id = sorted(vocab.items(), key=lambda x: x[1])\n",
    "\n",
    "# Human-readable listing: id, repr of the token and its length in characters.\n",
    "with open(os.path.join(output_dir, \"vocab.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for token, idx in vocab_by_id:\n",
    "        f.write(f\"{idx:>6} | {repr(token):<12} | len={len(token)}\\n\")\n",
    "\n",
    "with open(os.path.join(output_dir, \"vocab.json\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(dict(vocab_by_id), f, indent=2, ensure_ascii=False)\n",
    "\n",
    "special_tokens_map = {\n",
    "    \"unk_token\": \"<unk>\",\n",
    "    \"pad_token\": \"<pad>\",\n",
    "    \"bos_token\": \"<s>\",\n",
    "    \"eos_token\": \"</s>\",\n",
    "}\n",
    "\n",
    "# Metadata shared by tokenizer_config.json and config.json. The vocabulary size is\n",
    "# taken from the vocabulary itself instead of a hard-coded 131072.\n",
    "tokenizer_metadata = {\n",
    "    \"model_type\": \"gpt2\",\n",
    "    \"bos_token\": \"<s>\",\n",
    "    \"eos_token\": \"</s>\",\n",
    "    \"unk_token\": \"<unk>\",\n",
    "    \"pad_token\": \"<pad>\",\n",
    "    \"vocab_size\": len(vocab),\n",
    "}\n",
    "\n",
    "for metadata_name in (\"tokenizer_config.json\", \"config.json\"):\n",
    "    with open(os.path.join(output_dir, metadata_name), \"w\", encoding=\"utf-8\") as f:\n",
    "        json.dump(tokenizer_metadata, f, indent=2)\n",
    "\n",
    "# Written exactly once. A second write used to replace this file with a copy of the\n",
    "# tokenizer metadata (model_type, vocab_size, ...), which is not a special-tokens map.\n",
    "with open(os.path.join(output_dir, \"special_tokens_map.json\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(special_tokens_map, f, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6405817d-8f7b-45a8-9205-98421903ef61",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "1c82d08e-3f22-463f-b10e-55a3cacff309",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image, ImageDraw, ImageFont, ImageFilter\n",
    "from IPython.display import display\n",
    "import unicodedata\n",
    "import math\n",
    "import random\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a1c86410-783c-4120-a211-5c52d3f2e974",
   "metadata": {},
   "outputs": [],
   "source": [
    "#https://en.wikipedia.org/wiki/GNU_Unifont"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "079d432c-eece-42c2-8735-19ba8e025393",
   "metadata": {},
   "source": [
    "GNU Unifont is a free Unicode bitmap font created by Roman Czyborra. The main Unifont font covers all of the Basic Multilingual Plane (BMP)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "8c24d44f-07d7-4e0d-8bae-15506464978f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#we use unifont-14.0.01.ttf for embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc637349-3a50-4c79-9bed-b148a84f3ed3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the Unifont 14.0.01 TrueType file into the working directory (-q: quiet).\n",
    "!wget -q https://ftp.gnu.org/gnu/unifont/unifont-14.0.01/unifont-14.0.01.ttf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "2209e8ee-2663-47ab-8528-e20b37883905",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rendering setup: font size in pixels and the (width, height) of one glyph cell.\n",
    "font_size = 32\n",
    "glyph_size = (32, 32)\n",
    "font = ImageFont.truetype('unifont-14.0.01.ttf', size=font_size)\n",
    "\n",
    "# Seed the `random` module and NumPy's global generator.\n",
    "random.seed(42)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "2a4eaa5b-934c-4b01-8c93-5995f219c30b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def describe_char(c):\n",
    "    print(f\"symbol: {c}\")\n",
    "    print(f\"ord: U+{ord(c):04X}\")\n",
    "    print(f\"name: {unicodedata.name(c, 'UNKNOWN')}\")\n",
    "    print(f\"category: {unicodedata.category(c)}\")\n",
    "    print(f\"Bidirectional: {unicodedata.bidirectional(c)}\")\n",
    "    print(f\"Combining: {unicodedata.combining(c)}\")\n",
    "    print(f\"Decomposition: {unicodedata.decomposition(c)}\")\n",
    "    print(f\"Mirrored: {unicodedata.mirrored(c)}\")\n",
    "    print(f\"Decimal value: {unicodedata.decimal(c, '—')}\")\n",
    "    print(f\"isprintable: {c.isprintable()}\")\n",
    "    print(f\"isspace: {c.isspace()}\")\n",
    "    print(f\"isalpha: {c.isalpha()}\")\n",
    "    print(f\"isupper?: {c.isupper()}\") \n",
    "    print(f\"islower?: {c.islower()}\") \n",
    "    print(f\"Title-case?: {c.istitle()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "080319d8-1a86-49d3-89bd-ccb6096dece9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_complex_script(s):\n",
    "    complex_keywords = ['CJK', 'ARABIC', 'HEBREW', 'DEVANAGARI', 'BENGALI', 'THAI', 'KANNADA', 'TAMIL', 'TELUGU', 'KHMER', 'MYANMAR', 'SINHALA', 'SYRIAC']\n",
    "    for c in s:\n",
    "        name = unicodedata.name(c, '')\n",
    "        if any(kw in name for kw in complex_keywords):\n",
    "            return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "a58505d7-8881-4dcf-bf1a-b1f157c5ed15",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_easy_script(s):\n",
    "    for c in s:\n",
    "        code = ord(c)\n",
    "        if not (\n",
    "            (0x0000 <= code <= 0x007F) or  # Basic Latin\n",
    "            (0x0080 <= code <= 0x00FF) or  # Latin-1 Supplement\n",
    "            (0x0100 <= code <= 0x024F) or  # Latin Extended A/B\n",
    "            (0x0370 <= code <= 0x03FF) or  # Greek and Coptic\n",
    "            (0x0400 <= code <= 0x04FF) or  # Cyrillic\n",
    "            (0x0500 <= code <= 0x052F) or  # Cyrillic Supplement\n",
    "            (0x0530 <= code <= 0x058F)     # Armenian\n",
    "        ):\n",
    "            return False  \n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "553bc206-d629-440d-81b6-0903625e3db4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_row_major(img):\n",
    "    bw = img.convert('1') \n",
    "    pixels = bw.load()\n",
    "    width, height = bw.size\n",
    "    bits = [0] * (width * height)\n",
    "    for y in range(height):\n",
    "        for x in range(width):\n",
    "            bits[y * width + x] = 0 if pixels[x, y] else 1\n",
    "    return bits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "1d4c8061-1fef-4976-b695-5bb730c1d365",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_polar_snake(img):\n",
    "    bw = img.convert('1')\n",
    "    pixels = bw.load()\n",
    "    width, height = bw.size\n",
    "    cx, cy = width / 2, height / 2\n",
    "    coords = []\n",
    "\n",
    "    for y in range(height):\n",
    "        for x in range(width):\n",
    "            dx = x - cx\n",
    "            dy = y - cy\n",
    "            r = math.hypot(dx, dy)\n",
    "            theta = math.atan2(dy, dx)\n",
    "            coords.append((r, theta, x, y))\n",
    "\n",
    "    coords.sort() \n",
    "    bitlist = []\n",
    "    for r, theta, x, y in coords:\n",
    "        value = 0 if pixels[x, y] else 1\n",
    "        bitlist.append(value)\n",
    "    return bitlist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "902bc751-4da6-4a16-918b-c98d008049bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def augment_image(img, chars):\n",
    "    \"\"\"Distort a rendered glyph image with seeded randomness and binarise it.\n",
    "\n",
    "    Text for which ``is_easy_script(chars)`` is True is rotated (up to 7 degrees),\n",
    "    shifted (up to 3 pixels), sprinkled with black noise pixels (3% probability)\n",
    "    and blurred. Any other text gets no rotation, shift or blur and only sparse\n",
    "    noise (0.1% probability).\n",
    "\n",
    "    Args:\n",
    "        img: Image to distort; it is not modified in place. Presumably mode 'L',\n",
    "            as the callers in this notebook create it - confirm for other callers.\n",
    "        chars: The text rendered in ``img``; only used to pick the settings.\n",
    "\n",
    "    Returns:\n",
    "        A new mode '1' image; pixel values below 128 become 0, all others 255.\n",
    "    \"\"\"\n",
    "    easy = is_easy_script(chars)\n",
    "\n",
    "    # NOTE(review): both global RNGs are re-seeded on every call, so each call draws\n",
    "    # the same random sequence and the global RNG state is reset as a side effect.\n",
    "    random.seed(42)\n",
    "    np.random.seed(42)\n",
    "\n",
    "    if easy:\n",
    "        max_rotation, max_shift, noise_level, apply_blur = 7, 3, 0.03, True\n",
    "    else:\n",
    "        max_rotation, max_shift, noise_level, apply_blur = 0, 0, 0.001, False\n",
    "\n",
    "    # Rotation; the canvas size is kept and uncovered corners are filled with white.\n",
    "    rotated = img.rotate(random.uniform(-max_rotation, max_rotation), expand=0, fillcolor=255)\n",
    "\n",
    "    # Translation: paste the rotated image at an offset onto a white canvas.\n",
    "    dx = random.randint(-max_shift, max_shift)\n",
    "    dy = random.randint(-max_shift, max_shift)\n",
    "    canvas = Image.new(\"L\", rotated.size, color=255)\n",
    "    canvas.paste(rotated, (dx, dy))\n",
    "\n",
    "    # Noise: each pixel is set to black with probability `noise_level`.\n",
    "    pixel_array = np.array(canvas)\n",
    "    pixel_array[np.random.rand(*pixel_array.shape) < noise_level] = 0\n",
    "\n",
    "    noisy = Image.fromarray(pixel_array)\n",
    "    if apply_blur:\n",
    "        noisy = noisy.filter(ImageFilter.GaussianBlur(radius=0.5))\n",
    "\n",
    "    return noisy.point(lambda p: 0 if p < 128 else 255, mode='1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5fc61ccb-b581-4b0a-8918-9804e00a7e54",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 32\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APd7P7YYi175CyNtIjhyRH8o3LuON/zbsNtXggbcjJpxTW2lXUFnMJPPvXRBdvGgN5OImJ3bAPnEcGSSqrjaF9Bchvref7OFk2SXERmjhlUxyFBtydjYYY3qDkcFgDg1YqnqF4um282o3VxHFp9rbyTXGYmZgFAbcCD0Ch8jaScjBGMHy+Lwld+L7fU/GV9DJc61C9xJ4WlEiIpt8LLZuVGBkMWIEnPzneDgY7Dwz4hudR8NGZ547zUrJI7C9h8h0KaioAlV3RWGzc6ZZEKqAzZI+70ljbva2cdvJL5vl5VGJYnYCdoYszMzBcAsT8xBPGcDn/GUP/FvPEl1PbQRX0uiTpOYjv8AuxOQu8gFlBZsZA6k4GTXF+F/CPji98IeHbm1+I8lpbiyglt7ZdIiYRKYcBGO4eYArY+bPIBxkAjQ+DdnNa2/i83d3Jd3g8R3UU85yiysoXLiMHahJYk49hkhRj0ysfXdPvNb0XU9IxBDDfRS2vn+YWZI3hI37Noywc427gNvO7Py1H4c02+0vwrpmnGeNXtbe2hXzbfDKiIiurhZCC52vgq2BuHDbctT8J+Fv7D/ALWllnnZr3W7rU1Q/u9nmfIFIR2Drgbhu9VJVWXjoPJ8iffbW0A8+XfdPnYxwm0NwDvb5UXkj5e/ygH/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAAeklEQVR4AU2OzQmEUAyE5wUP3rQCfaVYipWsrxSb8Kx9CP4cPa23FRTHUVgQwpdkJgkBR9IRgCGI6mi1CvBLgnt14yfMMiSHCM0SbIvXzmIkhTQH45zpgMJWzVh6KJ3Inw24j/OlBdz+G5PawwuRjEFOT+5g1f4fknQBwWAyUQX0jhAAAAAASUVORK5CYII=",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APd7P7YYi175CyNtIjhyRH8o3LuON/zbsNtXggbcjJpxTW2lXUFnMJPPvXRBdvGgN5OImJ3bAPnEcGSSqrjaF9Bchvref7OFk2SXERmjhlUxyFBtydjYYY3qDkcFgDg1YqnqF4um282o3VxHFp9rbyTXGYmZgFAbcCD0Ch8jaScjBGMHy+Lwld+L7fU/GV9DJc61C9xJ4WlEiIpt8LLZuVGBkMWIEnPzneDgY7Dwz4hudR8NGZ547zUrJI7C9h8h0KaioAlV3RWGzc6ZZEKqAzZI+70ljbva2cdvJL5vl5VGJYnYCdoYszMzBcAsT8xBPGcDn/GUP/FvPEl1PbQRX0uiTpOYjv8AuxOQu8gFlBZsZA6k4GTXF+F/CPji98IeHbm1+I8lpbiyglt7ZdIiYRKYcBGO4eYArY+bPIBxkAjQ+DdnNa2/i83d3Jd3g8R3UU85yiysoXLiMHahJYk49hkhRj0ysfXdPvNb0XU9IxBDDfRS2vn+YWZI3hI37Noywc427gNvO7Py1H4c02+0vwrpmnGeNXtbe2hXzbfDKiIiurhZCC52vgq2BuHDbctT8J+Fv7D/ALWllnnZr3W7rU1Q/u9nmfIFIR2Drgbhu9VJVWXjoPJ8iffbW0A8+XfdPnYxwm0NwDvb5UXkj5e/ygH/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAAeklEQVR4AU2OzQmEUAyE5wUP3rQCfaVYipWsrxSb8Kx9CP4cPa23FRTHUVgQwpdkJgkBR9IRgCGI6mi1CvBLgnt14yfMMiSHCM0SbIvXzmIkhTQH45zpgMJWzVh6KJ3Inw24j/OlBdz+G5PawwuRjEFOT+5g1f4fknQBwWAyUQX0jhAAAAAASUVORK5CYII=",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 64\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgAEABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe4I2ht4onmkndECtLIFDOQPvHaAMnrwAPQCs+yjhjFkdJEb2dwgnluwRN56iNUQmQvuZ2GwhzvysZBIJU1YmvF0zSxc6ncR5jRRLJFEwDucDCJlmyzEBUBYkkAZPW5VdF8m8ZUinZZ8yvKZNyIwCKFALZXI5AUbflYnBPzZ+raxYaJeQS6lr9jp8M3AhvpI4w4UNuMZJU7svHkksMKAAC26rkF015cRS2zxyae9uJVnXa6zlzlSjh+gUEnK4O9SG4YVJYzXE9nHJd2v2W45EkQkDgEEjKsOqnGQSAcEZCnIHN+K7hdN8Ga/NDJd/2nFpk1v9vFuySs8du8isZEUAAbmIYYUOxUYY4rl/DXw3tPEPhXSL/XvEfifVYL6yhuLnT7vVHa3kZkDYKgBsBiCPm6gda3PhlfXFxp2vWEsm610jW7jTLFNoHlW0QQRpnq2B3bJPcmuw+0PHP5c8WFkl8uBog0m4bNxL4XEfIYckjhecsFrP8AFl9caZ4N1y/s5PLurXT7iaF9oO11jYqcHg4IHWuP8IeD/Dd94ettfvfCsF9qeqxW1zeT3Mcc3nySoryTIrsVRd0jkgBT8pwpwudTwNo+paQ+tfaII2il1iZLV3ZkMVgq/uo41K5CI5ZVT5VAZmUkEbu0rm/HRm/4QXxSrRxiAaPcFHDksW8qTcCuMAAbcHJzk8DHPD+EfhvpGs+FdGXVPEfiDUIJtMtrifR5tUP2cK6ZQeWoDBAynbz1Tvg1ufDJpbzw9r2h6jL9usdK1W40e3S4jQ5tYkRFRwFAbgnJI5zXaCax08vABHaoHVyTH5cbPNIQMNgKzs5OQCTlhn7wzl+O/wDknniX/sFXX/opqr/Dy/s7vwD4eitruCaSLSrXzEjkDFPk2cgdPmR1+qMOoNHgjUoNQs9XihmnlkstVuLW4MlvFAgnBDSeUqf8syzFgXLOdx3MTzXUVj6/Zv4g8Jazp1ofLmvLS4tIzOjRgOVaPJyM7c9wDkcjIIzw+n6H8WLPQ7DQ4dS8KWNrbRQ2wvII5pZ0jTaNwVxsZto6EAHPbqOw8N+Hn8NWskMTQXMl5dm6vrjDRNJK0QEku3LAs0i52rsVVbjlfm2A9xBZw+an2q4+RJDAoQEkgM4Vm4UZLY3E4GBuOM13tbPU9JutIurj7fGYjZ3m5wHbdGNwfZjazKwPAX7wIAGK4uT4d6m7w2kvjLxBFpcL+Tp9tpDx2YsoAp2rI+S0wAVEBOWzzjliOs8N2Wnadp0lnp0M6+TKY7ia4tmikuZgAGmdiq+azYBMgyGPc1sV/9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEAAAAAgAQAAAACK5kMpAAAAw0lEQVR4ASWPwU3EMBBFn2d9ZlNChCiAEnKgADrYEmgAaY3EnRZSAiWkEETSwXpPICWb4Y8ZeeQ3f77tMT57C2OEBcU56os7vgrXH08eMhTjF606SPEUklHUvBa4bDKXF7djln58xMqhxyvk0qxhbi9sDXq+n7Av2LhfsAf5YFDrNk8wCd7T8K8cRtB0mddeIHPVBgXrGoztwhpsVO74DOiUzwER10VwW3jrVOyczv6hD+5p9QhLe941CDbFgTxCyLPyD4VCWx0yyGQCAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=64x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe5hMyAQSRo+9SS6Fht3DcMAjkrkA9iQcHGCCZWuHgAk3oiuSY2C4YkDDYwT8pyAcjjOMjNPWLlLGzS9m1WDTbe3lR5pbnaImQnaUYsRjO7ggjDbfvDKtYiuJZJVRrKeNT5mXcphdrBVzhifnB3DjoPm2niuD1LwxaeKPHWpabJqOs2Wm2VlbNNp9lO9tb3TTy3DS71AxIHAAZlPJLAnIOOo8M6pNeeCtC1K/eSa4u7K1eV0iJLSSKmTtQcDc2ScAAZJwAay9Q0PS/Evj6WLW7CC/h0vT7ea0inTciPM9wshK9GyIo/vA425GDk1qeELW5svCunW12kfmpbxl5V3hpnZFaSR1dFZXaQyEhhk9TgkgcnrPhu38TfEO6h1fVL6wj/ALPtYfsVjqBijv0aW8PlyDaC+Y4ySoxgF8Egbq6TwvLDfaD4d12+t45dYv8ATIEkvVthvO6PzWDMq/IhYE4OFyQByQDHDf2dv8TtQs5ruCO6utKsvs8LyAPLtkuy21Ty2BycdK1PDU1tc+FdInshILSSyheESRojBCgK5VAEU4xwoAHbiub1jTvGF54hOq+F7vSrOx1HT7e3nl1CCb7TBteVg6REAbgJs7X7jBA5rrNMs102yj0+C3jgs7REgtFWVnPlKigbsjIIOR1bgA5ySBz/AIq8K3esXsV1Y6zqWmpKiW9+umlEnuI1cmIrK5/dCNnkZtoyysRzwK6SwFuunWws4PItREghh8kxeWmBtXYQCmBgbSBjpgV//9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAAe0lEQVR4ATWOyw3CMBBEZyc5cIgogkrSmV0CJVAGN1yKK4gjlAuS8bCL4LAfrd7TrAkAvaJ1cPi0/+0ONQlSFx84Ci+YCi07d/5pNdwb60BGS022vU5XVPecQw4uswS8xuYRYJ8PcMYCd98Zm3bxiQm0ukT4IJKUvm98ADbDMnRQfOpHAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Demo: render two sample strings, augment them, and display each result before and\n",
    "# after resizing to 32x32.\n",
    "images=[]\n",
    "for text in ['A', 'BC']:#, '꧄']\n",
    "    # One glyph_size cell per character on a white ('L' mode, 255) canvas.\n",
    "    img_width = glyph_size[0] * len(text)\n",
    "    img_height = glyph_size[1]\n",
    "    img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "    draw = ImageDraw.Draw(img)\n",
    "\n",
    "    for i, char in enumerate(text):\n",
    "        try:\n",
    "            # Centre the glyph in its cell using the size of its bounding box;\n",
    "            # vertically the bounding-box top offset is subtracted as well.\n",
    "            bbox = font.getbbox(char)\n",
    "            w = bbox[2] - bbox[0]\n",
    "            h = bbox[3] - bbox[1]\n",
    "            x = i * glyph_size[0] + (glyph_size[0] - w) // 2\n",
    "            y = (glyph_size[1] - h) // 2 - bbox[1]\n",
    "            draw.text((x, y), char, fill=0, font=font)\n",
    "        except Exception as e:\n",
    "            # Best effort: report the failing character and leave its cell blank.\n",
    "            print(f\"Error '{char}': {e}\")\n",
    "            pass\n",
    "\n",
    "    \n",
    "    print(img_height,img_width)\n",
    "    \n",
    "    img_aug = augment_image(img, text)\n",
    "    images.append(img_aug)\n",
    "    display(img_aug)\n",
    "    # Bit sequence of the augmented image at its native size (not used further in this cell).\n",
    "    snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "\n",
    "    # Resize to a fixed 32x32 so every text yields the same number of bits.\n",
    "    img_aug = img_aug.resize((32, 32), Image.BILINEAR) #LANCZOS)\n",
    "    display(img_aug)    \n",
    "    reduced = image_to_bitlist_polar_snake(img_aug)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "55b6b23f-8107-4f2b-99ca-a00f0907952a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "symbol: 1\n",
      "ord: U+0031\n",
      "name: DIGIT ONE\n",
      "category: Nd\n",
      "Bidirectional: EN\n",
      "Combining: 0\n",
      "Decomposition: \n",
      "Mirrored: 0\n",
      "Decimal value: 1\n",
      "isprintable: True\n",
      "isspace: False\n",
      "isalpha: False\n",
      "isupper?: False\n",
      "islower?: False\n",
      "Title-case?: False\n"
     ]
    }
   ],
   "source": [
    "# Print the Unicode properties of one sample character (see the cell output).\n",
    "describe_char('1') #'꧄')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "c1d96ab7-a714-472a-b719-611956d96cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render every vocab entry, ordered by its vocab value (idx), and write two lines\n",
    "# per entry to embeddings_max.txt: the idx, then str() of the value returned by\n",
    "# image_to_bitlist_polar_snake.\n",
    "with open(os.path.join(output_dir, \"embeddings_max.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for token, idx in sorted(vocab.items(), key=lambda x: x[1]):\n",
    "        text = token\n",
    "        # White (255) 8-bit grayscale canvas, one glyph_size cell per character.\n",
    "        img_width = glyph_size[0] * len(text)\n",
    "        img_height = glyph_size[1]\n",
    "        img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "        draw = ImageDraw.Draw(img)\n",
    "\n",
    "        for i, char in enumerate(text):\n",
    "            try:\n",
    "                # Centre the glyph's bounding box inside the i-th cell. The vertical\n",
    "                # position subtracts bbox[1] (the box's top offset).\n",
    "                # NOTE(review): the horizontal position does not subtract bbox[0];\n",
    "                # confirm whether that asymmetry is intentional.\n",
    "                bbox = font.getbbox(char)\n",
    "                w = bbox[2] - bbox[0]\n",
    "                h = bbox[3] - bbox[1]\n",
    "                x = i * glyph_size[0] + (glyph_size[0] - w) // 2\n",
    "                y = (glyph_size[1] - h) // 2 - bbox[1]\n",
    "                draw.text((x, y), char, fill=0, font=font)\n",
    "            except Exception as e:\n",
    "                # Best effort: report the failing character; the token is still written\n",
    "                # with whatever was drawn so far.\n",
    "                print(f\"Error '{char}': {e}\")\n",
    "                pass\n",
    "                \n",
    "        # augment_image is defined elsewhere in the notebook.\n",
    "        # NOTE(review): its exact transformation is not visible from this cell.\n",
    "        img_aug = augment_image(img, text)\n",
    "        \n",
    "        # Force a fixed 32x32 size so every entry is converted from the same number of pixels.\n",
    "        if img_aug.size != (32, 32):\n",
    "            img_aug = img_aug.resize((32, 32), Image.BILINEAR)\n",
    "            \n",
    "        snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "        \n",
    "        f.write(f\"{idx}\\n\")   \n",
    "        f.write(f\"{str(snake_bits)}\\n\")    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "3377785a-4fb3-43d5-a16e-de610dcc2838",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_binary_file_np(filename, expected_len=1024):\n",
    "    \"\"\"Load a two-lines-per-record bit-list text file into a uint8 matrix.\n",
    "\n",
    "    Each record is an index line followed by a line holding a comma-separated\n",
    "    list of integers (surrounding square brackets are stripped). The index\n",
    "    line is only used in error messages; rows are returned in file order.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the text file to read.\n",
    "        expected_len: Number of values every list line must contain.\n",
    "            Defaults to 1024, the length this function has always enforced.\n",
    "\n",
    "    Returns:\n",
    "        np.ndarray of dtype uint8 with one row per record.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a list line does not contain exactly ``expected_len``\n",
    "            values (this includes a missing list line at the end of the file).\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    # Read as UTF-8 explicitly rather than relying on the platform default encoding.\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        while True:\n",
    "            index_line = f.readline()\n",
    "            if not index_line:\n",
    "                break\n",
    "            list_line = f.readline()\n",
    "            payload = list_line.replace('[', '').replace(']', '').strip()\n",
    "            bits = np.fromstring(payload, dtype=int, sep=',')\n",
    "            if bits.size != expected_len:\n",
    "                # Put the diagnostics in the exception instead of dumping the whole\n",
    "                # list line to stdout.\n",
    "                raise ValueError(\n",
    "                    f\"Invalid bit list length: got {bits.size}, expected {expected_len} \"\n",
    "                    f\"(index line {index_line.strip()!r})\"\n",
    "                )\n",
    "            rows.append(bits)\n",
    "    return np.array(rows, dtype=np.uint8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "c67c5539-6edc-4cfa-8516-2d26bfca839f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read embeddings_max.txt back from output_dir as a uint8 array, one row per record.\n",
    "data_embeddings = load_binary_file_np(os.path.join(output_dir, \"embeddings_max.txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "902ee687-1805-4348-82c2-71bb1d7e9ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "6053fbc0-ef14-4268-81a3-908cd2a0dd5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_pca(data, n_components=1024):\n",
    "    \"\"\"Fit a PCA on ``data`` and return the projected data with the fitted model.\n",
    "\n",
    "    Args:\n",
    "        data: Input accepted by ``PCA.fit_transform``.\n",
    "        n_components: Value forwarded to ``PCA(n_components=...)``.\n",
    "            Defaults to 1024.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(transformed, pca)``: the output of ``fit_transform`` and the\n",
    "        fitted ``PCA`` instance.\n",
    "    \"\"\"\n",
    "    # Forward the caller's n_components; it was previously ignored in favour of a\n",
    "    # hard-coded 1024.\n",
    "    pca = PCA(n_components=n_components)\n",
    "    transformed = pca.fit_transform(data)\n",
    "    return transformed, pca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "012b9fb5-813c-4e75-b06f-b02d1ef1769c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast the loaded rows to float32 and run them through apply_pca with its default n_components.\n",
    "data_embeddings_pca, _pca = apply_pca(data_embeddings.astype(np.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "0d60cfca-b0e1-49eb-b1f0-2faecc29e20e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# L2-normalize the PCA output along axis=1.\n",
    "normalized_embeddings = normalize(data_embeddings_pca, norm='l2', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "2612512c-3d9b-4cfe-8687-89bb86110a63",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "1f68ff20-261e-4ca5-a2a1-4ed164547852",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the normalized matrix to a float32 tensor and save it in output_dir.\n",
    "tensor_normalized_embeddings_weights = torch.tensor(normalized_embeddings, dtype=torch.float32)\n",
    "torch.save(tensor_normalized_embeddings_weights, os.path.join(output_dir, \"normalized_embeddings_weights.pt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7dbd352-7152-41e7-aef9-345cc8c3a19f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "356e06bb-d60f-4f79-8791-6df41815b2d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "7928ec7f-a2db-4b06-a313-0981b494d483",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the tokenizer from output_dir; the last expression displays its is_fast flag.\n",
    "tokenizer = AutoTokenizer.from_pretrained(output_dir) \n",
    "tokenizer.is_fast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "89d2ccbe-cbc4-499a-a72e-f93e3a2c6a17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode a short sample sentence for the round-trip check in the following cells.\n",
    "ids = tokenizer.encode('hello the test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "5ede5794-25fb-4d36-886e-904fea1797e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[70960, 32, 113757, 32, 99060]\n"
     ]
    }
   ],
   "source": [
    "# Show the ids produced for the sample sentence.\n",
    "print(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "c0e2cdf1-65b3-43e9-82db-66488a305197",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello the test\n"
     ]
    }
   ],
   "source": [
    "# Decode the ids back to text; the recorded output matches the encoded sentence.\n",
    "print(tokenizer.decode(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9de6442-d59e-4f23-bba4-5529068dff45",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3_11",
   "language": "python",
   "name": "python3_11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
