{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c078adf3-b73f-4d4a-a90a-c39743f7c93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "from datasets import load_dataset\n",
    "import unicodedata\n",
    "import tiktoken\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from datasets import get_dataset_config_names, load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d83e8a9-28d0-4b4c-a323-8464e839a2b5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dc11ddc0-c349-4ed9-9c17-c698cb859407",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_list_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.writelines(f\"{item}\\n\" for item in data_list)\n",
    "\n",
    "def load_list_from_file(filename):\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        return [line.strip() for line in f]\n",
    "\n",
    "def save_results_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.write(str(data_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f2eea52-1de8-4e7f-9ddb-723eea61961f",
   "metadata": {},
   "outputs": [],
   "source": [
    "######### TOKENIZER #########"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1ecb3167-54e8-4cc9-92cf-a74a0f49d370",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import ast\n",
    "\n",
    "from tokenizers import Tokenizer\n",
    "from tokenizers.models import BPE\n",
    "from tokenizers.pre_tokenizers import Whitespace\n",
    "from collections import defaultdict\n",
    "from tokenizers.pre_tokenizers import PreTokenizer\n",
    "from transformers import PreTrainedTokenizerFast\n",
    "\n",
    "from tokenizers import (\n",
    "    decoders,\n",
    "    models,\n",
    "    normalizers,\n",
    "    pre_tokenizers,\n",
    "    processors,\n",
    "    trainers,\n",
    "    Tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5041a73a-b318-48be-b9d6-31261660ccdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_file = \"_n-gramms-2-3-4-5.txt\"\n",
    "with open(input_file, \"r\", encoding=\"utf-8\") as f:\n",
    "    raw_text = f.read()\n",
    "    ngram_data = ast.literal_eval(raw_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3a55ca01-6fe1-4bd9-8438-004c3d8d312a",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_gramms_intersection_o200k_base_cl100k_base_mistral_tokens_qwen_tokens_qwen_3_tokens = set(load_list_from_file('_n-gramms-intersection.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "95d6f096-4ca2-4f8c-b117-02ee08074160",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = \"_bvv241-abs\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "languages =     [\"en\", \"ru\", \"fr\", \"de\", \"zh\", \"he\", \"it\", \"es\", \"ar\", \"pt\", \"ko\", \"hu\", \"sa\", \"la\", \"ja\", \"el\",     \"sv\", \"nl\", \"pl\", \"vi\", \"fa\", \"no\", \"tr\", \"fi\", \"cs\", \"hy\", \"da\", \"bn\", \"az\", \"ka\", \"hi\", \"id\"]\n",
    "top_cnt = 906 #906 is optimal for 2-3-3-4-5-gramms fitting in unused unicod ranges (and > 65535). 1000 for 2-3-4-5   171 for 2-3-4  800 for 2 (bigramms)\n",
    "\n",
    "UNICODE_LIMIT =   0xD800 # from 0 до D7FF monograms\n",
    "SURROGATE_START = 0xD800\n",
    "SURROGATE_END =   0xE000\n",
    "\n",
    "SPECIAL_TOKENS_START = 0xE000\n",
    "SPECIAL_TOKENS_LIMIT = SPECIAL_TOKENS_START + 256  # 0xE100 = 57600\n",
    "NGRAM_START = SPECIAL_TOKENS_LIMIT  # 0xE100 = 57600\n",
    "NGRAM_LIMIT = 0xF900\n",
    "\n",
    "VOCAB_SIZE_TARGET = 65536\n",
    "\n",
    "NGRAM_START_EXT = 0x10000\n",
    "NGRAM_LIMIT_EXT = 0x1FFFF\n",
    "\n",
    "VOCAB_SIZE_TARGET = 2 * 65536"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e038761-39e4-4276-bd47-8348833abe45",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9214200d-c270-44a9-8228-11dc6a3f86f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = {}\n",
    "\n",
    "for cp in range(0x0000, UNICODE_LIMIT): \n",
    "    try:\n",
    "        ch = chr(cp)\n",
    "    except ValueError:\n",
    "        continue\n",
    "    if ch not in vocab:\n",
    "        vocab[ch] = cp\n",
    "\n",
    "#for cp in range(SURROGATE_START, SURROGATE_END):\n",
    "#    vocab[f\"<surrogate_{cp}>\"] = cp\n",
    "\n",
    "special_tokens = [\n",
    "    \"<pad>\", \"<s>\", \"</s>\", \"<unk>\", \"<think>\", \"</think>\", \"<emotions>\", \"</emotions>\", \"<tool_call>\", \"</tool_call>\", \"<tool_response>\", \"</tool_response>\", \"[INST]\", \"[/INST]\", \"[EOT]\", \"[USER]\", \"[ASSISTANT]\", \"[FIM_PREFIX]\", \"[FIM_MIDDLE]\", \"[FIM_SUFFIX]\", \"[FIM_PAD]\", \"[REPO_NAME]\", \"[FILE_SEP]\"\n",
    "]\n",
    "for i, tok in enumerate(special_tokens):\n",
    "    vocab[tok] = SPECIAL_TOKENS_START + i\n",
    "\n",
    "for cp in range(SPECIAL_TOKENS_START + len(special_tokens), SPECIAL_TOKENS_LIMIT):\n",
    "    vocab[f\"<special_token_{cp}>\"] = cp\n",
    "\n",
    "for cp in range(NGRAM_LIMIT, 0xFFFF): \n",
    "    try:\n",
    "        ch = chr(cp)\n",
    "    except ValueError:\n",
    "        continue\n",
    "    if ch not in vocab:\n",
    "        vocab[ch] = cp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b3181781-2661-49bc-9f91-8e7a62ece80d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57343"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "692ec456-a7ef-4510-a883-1518c0df60a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "ngram_set = set()\n",
    "for entry in ngram_data:\n",
    "    if entry[\"lang\"] not in languages:\n",
    "        continue\n",
    "    top_ngrams = entry[\"top_ngrams\"]\n",
    "    for n in sorted(top_ngrams.keys(), reverse=True):  # 5 → 2\n",
    "        top_list = top_ngrams[n][:top_cnt]\n",
    "        for gram, _ in top_list:\n",
    "            ngram_set.add(gram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c52cca4d-88c1-44ee-9ebc-9572ea14b755",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59955"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fa4b4da3-94c2-4ea9-81cc-a532d084947b",
   "metadata": {},
   "outputs": [],
   "source": [
    "ngram_list = list(n_gramms_intersection_o200k_base_cl100k_base_mistral_tokens_qwen_tokens_qwen_3_tokens)\n",
    "\n",
    "ngram_list = [ng for ng in ngram_list if len(ng) > 1]\n",
    "\n",
    "ngram_list_sorted = sorted(ngram_list, key=len)\n",
    "\n",
    "selected_ngrams = set(ngram_list_sorted[0:65536])\n",
    "\n",
    "ngram_new = ngram_set.union(selected_ngrams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "048fd57c-b5e3-47f9-8bd7-585bb5703cc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "ngram_set = ngram_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2efa8efc-0b4b-458a-a3c9-dba4867c36ea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "73768"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c90e32c4-9077-443d-966a-51e19b68f75c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SURROGATE_END next_id= 55296 57344 57344\n"
     ]
    }
   ],
   "source": [
    "next_id = SURROGATE_START\n",
    "added_ng = []\n",
    "for ng in sorted(ngram_set, key=lambda x: (-len(x), x)): \n",
    "    vocab[ng] = next_id\n",
    "    added_ng.append(ng)\n",
    "    next_id += 1\n",
    "    if next_id >= SURROGATE_END:\n",
    "        print('SURROGATE_END next_id=',SURROGATE_START, SURROGATE_END, next_id)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3d45c718-3fb4-4ffc-806e-6977ce2690ab",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59391"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "919a9a10-0e94-4d4e-8cf3-dcf2e7a9368d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NGRAM_LIMIT next_id= 57600 63744 63744\n"
     ]
    }
   ],
   "source": [
    "next_id = NGRAM_START\n",
    "for ng in sorted(ngram_set, key=lambda x: (-len(x), x)): \n",
    "    if ng not in added_ng:\n",
    "        vocab[ng] = next_id\n",
    "        added_ng.append(ng)\n",
    "        next_id += 1\n",
    "        if next_id >= NGRAM_LIMIT:\n",
    "            print('NGRAM_LIMIT next_id=',NGRAM_START, NGRAM_LIMIT, next_id)\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "8811ffed-2eab-4cdc-b740-305488df6731",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "print(NGRAM_LIMIT - next_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9f81e9bc-ca7e-4181-afb7-88a8271e700c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65535"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "62fd04d7-11b0-4eb9-bfed-530f7d0fee82",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NGRAM_LIMIT_EXT next_id= 65536 131071 131071\n",
      "0\n",
      "131070\n"
     ]
    }
   ],
   "source": [
    "next_id = NGRAM_START_EXT\n",
    "for ng in sorted(ngram_set, key=lambda x: (-len(x), x)):  \n",
    "    if ng not in added_ng:\n",
    "        vocab[ng] = next_id\n",
    "        added_ng.append(ng)\n",
    "        next_id += 1\n",
    "        if next_id >= NGRAM_LIMIT_EXT:\n",
    "            print('NGRAM_LIMIT_EXT next_id=',NGRAM_START_EXT, NGRAM_LIMIT_EXT, next_id)\n",
    "            break\n",
    "            \n",
    "print(NGRAM_LIMIT_EXT - next_id)\n",
    "print(len(vocab))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "21425c91-64db-407f-820e-3c4614e62e86",
   "metadata": {},
   "outputs": [],
   "source": [
    "used_ids = set(vocab.values())\n",
    "for fill_id in range(0, VOCAB_SIZE_TARGET):\n",
    "    if fill_id not in used_ids:\n",
    "        vocab[f\"<unused_{fill_id}>\"] = fill_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "e551523c-ac5c-4dff-b4e2-349fc59e330c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "131072"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "3caa6318-b4d2-4031-8372-38dd45e7681a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_merges_from_ngrams(vocab, ngram_set):\n",
    "    merges = []\n",
    "    for piece in sorted(ngram_set, key=lambda x: (-len(x), x)):\n",
    "        if piece not in vocab:\n",
    "            continue\n",
    "        i = 1\n",
    "        while i <= len(piece):\n",
    "            left = piece[:i]\n",
    "            right = piece[i:]\n",
    "            if left in vocab and right in vocab:\n",
    "                merges.append((left, right))\n",
    "                #break\n",
    "            i += 1\n",
    "    return merges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b6706826-c733-412e-b344-831063070112",
   "metadata": {},
   "outputs": [],
   "source": [
    "merges = generate_merges_from_ngrams(vocab, ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "cebe3051-62d5-415a-ad57-452ceac4a295",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "182678"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(merges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "803f0a9e-2ca6-4d3f-a1ad-e9c1dffb7a94",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "tokenizer = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token=\"<unk>\"))\n",
    "tokenizer.pre_tokenizer = None             \n",
    "tokenizer.decoder = decoders.Sequence([])  \n",
    "tokenizer.post_processor = None            \n",
    "tokenizer.add_special_tokens(special_tokens)\n",
    "tokenizer.save(os.path.join(output_dir, \"tokenizer.json\"))\n",
    "\n",
    "with open(os.path.join(output_dir, \"merges.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for m in merges:\n",
    "        f.write(f\"{m[0]} {m[1]}\\n\")\n",
    "\n",
    "with open(os.path.join(output_dir, \"vocab.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for token, idx in sorted(vocab.items(), key=lambda x: x[1]):\n",
    "        f.write(f\"{idx:>6} | {repr(token):<12} | len={len(token)}\\n\")\n",
    "\n",
    "with open(os.path.join(output_dir, \"vocab.json\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump({k: v for k, v in sorted(vocab.items(), key=lambda x: x[1])},\n",
    "              f, indent=2, ensure_ascii=False)\n",
    "\n",
    "with open(os.path.join(output_dir, \"tokenizer_config.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"model_type\": \"gpt2\",\n",
    "        \"bos_token\": \"<s>\",\n",
    "        \"eos_token\": \"</s>\",\n",
    "        \"unk_token\": \"<unk>\",\n",
    "        \"pad_token\": \"<pad>\",\n",
    "        \"vocab_size\": 131072\n",
    "    }, f, indent=2)\n",
    "\n",
    "with open(os.path.join(output_dir, \"special_tokens_map.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"unk_token\": \"<unk>\",\n",
    "        \"pad_token\": \"<pad>\",\n",
    "        \"bos_token\": \"<s>\",\n",
    "        \"eos_token\": \"</s>\"\n",
    "    }, f, indent=2)\n",
    "\n",
    "with open(os.path.join(output_dir, \"special_tokens_map.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"model_type\": \"PreTrainedTokenizerFast\",\n",
    "          \"model_type\": \"gpt2\",\n",
    "          \"bos_token\": \"<s>\",\n",
    "          \"eos_token\": \"</s>\",\n",
    "          \"unk_token\": \"<unk>\",\n",
    "          \"pad_token\": \"<pad>\",\n",
    "          \"vocab_size\": 131072\n",
    "    }, f, indent=2)\n",
    "\n",
    "with open(os.path.join(output_dir, \"config.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"model_type\": \"gpt2\",\n",
    "        \"bos_token\": \"<s>\",\n",
    "        \"eos_token\": \"</s>\",\n",
    "        \"unk_token\": \"<unk>\",\n",
    "        \"pad_token\": \"<pad>\",\n",
    "        \"vocab_size\": 131072\n",
    "    }, f, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6405817d-8f7b-45a8-9205-98421903ef61",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "1c82d08e-3f22-463f-b10e-55a3cacff309",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image, ImageDraw, ImageFont, ImageFilter\n",
    "from IPython.display import display\n",
    "import unicodedata\n",
    "import math\n",
    "import random\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "3bb46f67-c54c-423f-8347-a7ba75d6bf99",
   "metadata": {},
   "outputs": [],
   "source": [
    "#https://en.wikipedia.org/wiki/GNU_Unifont"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f531040-da3b-477a-8b82-2036c8513086",
   "metadata": {},
   "source": [
    "''' GNU Unifont is a free Unicode bitmap font created by Roman Czyborra. The main Unifont covers all of the Basic Multilingual Plane (BMP) '''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "858ea1a9-9d2f-48ac-9d46-e33706389654",
   "metadata": {},
   "outputs": [],
   "source": [
    "#we use unifont-14.0.01.ttf for embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bdbff05c-9523-4b58-b581-0a083a94b585",
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget -q https://ftp.gnu.org/gnu/unifont/unifont-14.0.01/unifont-14.0.01.ttf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "2209e8ee-2663-47ab-8528-e20b37883905",
   "metadata": {},
   "outputs": [],
   "source": [
    "font_size = 64                     \n",
    "glyph_size = (64, 64)              \n",
    "font = ImageFont.truetype('unifont-14.0.01.ttf', font_size)\n",
    "random.seed(42)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "2a4eaa5b-934c-4b01-8c93-5995f219c30b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def describe_char(c):\n",
    "    print(f\"symbol: {c}\")\n",
    "    print(f\"ord: U+{ord(c):04X}\")\n",
    "    print(f\"name: {unicodedata.name(c, 'UNKNOWN')}\")\n",
    "    print(f\"category: {unicodedata.category(c)}\")\n",
    "    print(f\"Bidirectional: {unicodedata.bidirectional(c)}\")\n",
    "    print(f\"Combining: {unicodedata.combining(c)}\")\n",
    "    print(f\"Decomposition: {unicodedata.decomposition(c)}\")\n",
    "    print(f\"Mirrored: {unicodedata.mirrored(c)}\")\n",
    "    print(f\"Decimal value: {unicodedata.decimal(c, '—')}\")\n",
    "    print(f\"isprintable: {c.isprintable()}\")\n",
    "    print(f\"isspace: {c.isspace()}\")\n",
    "    print(f\"isalpha: {c.isalpha()}\")\n",
    "    print(f\"isupper?: {c.isupper()}\") \n",
    "    print(f\"islower?: {c.islower()}\") \n",
    "    print(f\"Title-case?: {c.istitle()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "080319d8-1a86-49d3-89bd-ccb6096dece9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_complex_script(s):\n",
    "    complex_keywords = ['CJK', 'ARABIC', 'HEBREW', 'DEVANAGARI', 'BENGALI', 'THAI', 'KANNADA', 'TAMIL', 'TELUGU', 'KHMER', 'MYANMAR', 'SINHALA', 'SYRIAC']\n",
    "    for c in s:\n",
    "        name = unicodedata.name(c, '')\n",
    "        if any(kw in name for kw in complex_keywords):\n",
    "            return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "a58505d7-8881-4dcf-bf1a-b1f157c5ed15",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_easy_script(s):\n",
    "    for c in s:\n",
    "        code = ord(c)\n",
    "        if not (\n",
    "            (0x0000 <= code <= 0x007F) or  # Basic Latin\n",
    "            (0x0080 <= code <= 0x00FF) or  # Latin-1 Supplement\n",
    "            (0x0100 <= code <= 0x024F) or  # Latin Extended A/B\n",
    "            (0x0370 <= code <= 0x03FF) or  # Greek and Coptic\n",
    "            (0x0400 <= code <= 0x04FF) or  # Cyrillic\n",
    "            (0x0500 <= code <= 0x052F) or  # Cyrillic Supplement\n",
    "            (0x0530 <= code <= 0x058F)     # Armenian\n",
    "        ):\n",
    "            return False  \n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "553bc206-d629-440d-81b6-0903625e3db4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_row_major(img):\n",
    "    bw = img.convert('1')  \n",
    "    pixels = bw.load()\n",
    "    width, height = bw.size\n",
    "    bits = [0] * (width * height)\n",
    "    for y in range(height):\n",
    "        for x in range(width):\n",
    "            bits[y * width + x] = 0 if pixels[x, y] else 1\n",
    "    return bits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "1d4c8061-1fef-4976-b695-5bb730c1d365",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_polar_snake(img):\n",
    "    bw = img.convert('1')\n",
    "    pixels = bw.load()\n",
    "    width, height = bw.size\n",
    "    cx, cy = width / 2, height / 2\n",
    "    coords = []\n",
    "\n",
    "    for y in range(height):\n",
    "        for x in range(width):\n",
    "            dx = x - cx\n",
    "            dy = y - cy\n",
    "            r = math.hypot(dx, dy)\n",
    "            theta = math.atan2(dy, dx)\n",
    "            coords.append((r, theta, x, y))\n",
    "\n",
    "    coords.sort()  \n",
    "    bitlist = []\n",
    "    for r, theta, x, y in coords:\n",
    "        value = 0 if pixels[x, y] else 1\n",
    "        bitlist.append(value)\n",
    "    return bitlist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "903751a3-37ec-4c6b-9c55-e8982264b2b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def augment_image(img, chars):\n",
    "    is_complex = not is_easy_script(chars)\n",
    "\n",
    "    random.seed(42)\n",
    "    np.random.seed(42)\n",
    "    \n",
    "    max_rotation = 0 if is_complex else 7\n",
    "    max_shift = 0 if is_complex else 3\n",
    "    noise_level = 0.001 if is_complex else 0.03\n",
    "    apply_blur = False if is_complex else True\n",
    "\n",
    "    angle = random.uniform(-max_rotation, max_rotation)\n",
    "    img = img.rotate(angle, expand=0, fillcolor=255)\n",
    "\n",
    "    dx = random.randint(-max_shift, max_shift)\n",
    "    dy = random.randint(-max_shift, max_shift)\n",
    "    shifted = Image.new(\"L\", img.size, color=255)\n",
    "    shifted.paste(img, (dx, dy))\n",
    "    img = shifted\n",
    "\n",
    "    np_img = np.array(img)\n",
    "\n",
    "    noise_mask = np.random.rand(*np_img.shape) < noise_level\n",
    "    np_img[noise_mask] = 0  \n",
    "\n",
    "    img = Image.fromarray(np_img)\n",
    "    if apply_blur:\n",
    "        img = img.filter(ImageFilter.GaussianBlur(radius=0.5))\n",
    "\n",
    "    bw = img.point(lambda p: 0 if p < 128 else 255, mode='1') \n",
    "    return bw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5fc61ccb-b581-4b0a-8918-9804e00a7e54",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "64 64\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCABAAEABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe4I2ht4onmkndECtLIFDOQPvHaAMnrwAPQCs+yjhjFkdJEb2dwgnluwRN56iNUQmQvuZ2GwhzvysZBIJU1YmvF0zSxc6ncR5jRRLJFEwDucDCJlmyzEBUBYkkAZPW5VdF8m8ZUinZZ8yvKZNyIwCKFALZXI5AUbflYnBPzDw3H2xZY7rEJwJIXjDDAD8qRghiWTJJYYTAAJLUb7htR2BNtqkWWZlB8xyeApDZG0KcgrzvXB4YUWM1xPZxyXdr9luORJEJA4BBIyrDqpxkEgHBGQpyBXtjb286qyzzXh2Wkl29qQ8pVDICzKgXb8zHIwgZiowx21cgkaa3ileGSB3QM0UhUshI+6dpIyOnBI9CakrzuP43eAi8wl12NUD4iK2tyxZdo5YeUNp3bhgZ4AOecD0Ss8TpexQ3X9mTyeVKjW5ljVHAdQDKquQybVkcMGCvw4CnI3FvL595Jcwad5beb9mmnnXynkjQMQyDBLKHZgA20csy5BUvoVXmS4m+0Qq/2eNogI7iJgZFc7gTtZSox8pBO7JJyMDmQTwtcPbrLGZ0RXeMMNyqxIUkdQCVYA99p9K8r+NGv6t4c8EW0mmalP5gu10+7F1ZxOl0rwliWDx7W6DlML8zgjIwsfh/4Y/DnUvDWlahf6BHbz3dla3Eh+1XKRb5wAqozSYJLcBck8rn7wzH8OPG3iLVPin4m8L6rqH26xsPP+zySwxpIvlTiMcxqoOQ3OR1AxjnPrFvf2d3LJFbXcE0kX+sSOQMU+Zk5A6fMjr9UYdQar6Pp0umWbxTXf2mSSV53KwJEis53MEVRwpYs3zFm+Y5ZjzWhVc3Dy2c0tpFvmXesaThoQzqSMElSQpI+8AcjkZBGZJxM1vKtvJHHOUIjeRC6q2OCVBBIz2yM+orwfSL+++I/xX17wj4oMep+H9PuLqaG2ZvIMJjl8tGVo1DOQH24ZgMMTkkAH2zT7T+xdDsLCGDzVtYobYJANoCjam4B3JCgc4LM2B/Eevj/AMSrWz+G+kt4m8I3HkeIrjUBZ6jfM4meTzI2mcPG2Y0ZiI3+VVxkYABxXqHhC6ll8EeGZTb72uNPtzIYUSNIswhs7RgBcgKAo43DgDJGxa3sV55/lJOvkytC3nQPFlh1K7wNy88MuVPYmrFFFfKFn/wmX/C5PFv/AAg//IT+13nm/wCp/wBT9o5/1vy/e2dOf1rr/wDjIf8Az/Z9cR44/wCFkf8ACG6d/wAJR5H/AAj3mxfYvs/2Ty93lt5ezyP4dm7H8OPwr6P8GTXEfgPwckFr50cun2yTv5gXyU+zZD4P3vmCrgf389Aa2LpNRi06C2sZ/MuztjN5cwrIq4GS8iK0ec4x8mMMwONoOLkBma3ia4jjjnKAyJG5dVbHIDEAkZ74GfQVXthfSXEkt2I4ER5Eiihm8xZUJXbI+UUq4wRtBIG48txiNbW8h06OyiuP78X2kOfMhjw2xh5nmeZIPkBLnDHcx/unwz4ZSNF+0F40ZIZJiXvF2oVBAN5GC3zEDABye+AcAnAPuepNLZ6TqlyJb6VhE8saWsaPMmI/uwqVwzZBIDbss2OmAPK/2iJln+HNi6CQAawqfPGyHKxzKeGAOMjg9CMEZBBr0DwNBC3gXwtcNFGZ00e3RJCo3KrRRlgD1AJVSR32j0roBBCtw9wsUYndFR5Ao3MqklQT1IBZiB23H1qm5ska6uRPPK1pKZZ44ZpJSjiIDb5ak5+QqwjA5LBgNxBq5OJmt5Vt5I45yhEbyIXVWxwSoIJGe2Rn1FR3wt3s5IruDz7ebEMkRhMocOQuGUA5XnnIwBknABNeP23gDxPY/Hu816zjnttCu/N8y9hlgQ/vID/yzH3sS4I3IclQzbjkn1yztobO4uo4LWRBK/2iS4Zg3nSMSCMklsqFUcjAXYq8LheT+KnhWXxP4E1O10zTILrV5fI8gkIr/JKDw7YxhXlxz/Gw/iOZPhj4cu9C+HmiabrVjHFqFk8z7GKSGJmkkwVZSQCUcjIPRiPWusVLweXungOJXMmISN0Z3bVHzcMMpluQdrfKNw2yTxtNbyxJNJA7oVWWMKWQkfeG4EZHXkEeoNEwmZAIJI0fepJdCw27huGARyVyAexIODjBIJGmt4pXhkgd0DNFIVLISPunaSMjpwSPQmq97LDZvHez+YsUSSCWYzhIoI9u9nkBYAgbAM4JG7sCxq5UcEbQ28UTzSTuiBWlkChnIH3jtAGT14AHoBVO1iSygnkt9OnWSe7ZpVLq0jkvt80sX5UKAQM5CKqhQQEGhVORmubg2slpdxxI4cXCyqisUKMPuvvwSSMEYOxw3BG7/9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEAAAABAAQAAAACCEkxzAAABPElEQVR4AV1Ry03DQBB9O1jCJ+QDBQSJMnIwEoWkBW5ckPeWNlwCBUQkdJAjEiAvFWRvGMV4eDM2KGLk1b598+bjGWinboIWSKA19j6oQo+Ex08NajQQBT345ZqMBqMEcXYygOLDnxgS6Tf7DafYKwyAMrG+d4pXK9NsPcp11FjCnUdNjOJOK+tOtW/WDJVMBduBVA6sDaRsWJAXF3g0UPHs3UU+8qiu9Z6FhRWiEaJhsBsSUIxgX7KzgKKF1KQTVnzRWh4HcQJfoWRGZ4CHGQxncWYGA+faF8XkmqOA2Gcy9iyrN+kbdITXUj7jigDSLu0CihVGHyvzCGcB9sw51sCHNZ/yBgtBWKbLwgTY0kON4ob3JvB3zG7NhWww6Gj43y58G+S51ISR81HfS/ietKeLixzOE15OVznt/Qc+QZUYwSHPqwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=64x64>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCABAAEABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe4I2ht4onmkndECtLIFDOQPvHaAMnrwAPQCs+yjhjFkdJEb2dwgnluwRN56iNUQmQvuZ2GwhzvysZBIJU1YmvF0zSxc6ncR5jRRLJFEwDucDCJlmyzEBUBYkkAZPW5VdF8m8ZUinZZ8yvKZNyIwCKFALZXI5AUbflYnBPzDw3H2xZY7rEJwJIXjDDAD8qRghiWTJJYYTAAJLUb7htR2BNtqkWWZlB8xyeApDZG0KcgrzvXB4YUWM1xPZxyXdr9luORJEJA4BBIyrDqpxkEgHBGQpyBXtjb286qyzzXh2Wkl29qQ8pVDICzKgXb8zHIwgZiowx21cgkaa3ileGSB3QM0UhUshI+6dpIyOnBI9CakrzuP43eAi8wl12NUD4iK2tyxZdo5YeUNp3bhgZ4AOecD0Ss8TpexQ3X9mTyeVKjW5ljVHAdQDKquQybVkcMGCvw4CnI3FvL595Jcwad5beb9mmnnXynkjQMQyDBLKHZgA20csy5BUvoVXmS4m+0Qq/2eNogI7iJgZFc7gTtZSox8pBO7JJyMDmQTwtcPbrLGZ0RXeMMNyqxIUkdQCVYA99p9K8r+NGv6t4c8EW0mmalP5gu10+7F1ZxOl0rwliWDx7W6DlML8zgjIwsfh/4Y/DnUvDWlahf6BHbz3dla3Eh+1XKRb5wAqozSYJLcBck8rn7wzH8OPG3iLVPin4m8L6rqH26xsPP+zySwxpIvlTiMcxqoOQ3OR1AxjnPrFvf2d3LJFbXcE0kX+sSOQMU+Zk5A6fMjr9UYdQar6Pp0umWbxTXf2mSSV53KwJEis53MEVRwpYs3zFm+Y5ZjzWhVc3Dy2c0tpFvmXesaThoQzqSMElSQpI+8AcjkZBGZJxM1vKtvJHHOUIjeRC6q2OCVBBIz2yM+orwfSL+++I/xX17wj4oMep+H9PuLqaG2ZvIMJjl8tGVo1DOQH24ZgMMTkkAH2zT7T+xdDsLCGDzVtYobYJANoCjam4B3JCgc4LM2B/Eevj/AMSrWz+G+kt4m8I3HkeIrjUBZ6jfM4meTzI2mcPG2Y0ZiI3+VVxkYABxXqHhC6ll8EeGZTb72uNPtzIYUSNIswhs7RgBcgKAo43DgDJGxa3sV55/lJOvkytC3nQPFlh1K7wNy88MuVPYmrFFFfKFn/wmX/C5PFv/AAg//IT+13nm/wCp/wBT9o5/1vy/e2dOf1rr/wDjIf8Az/Z9cR44/wCFkf8ACG6d/wAJR5H/AAj3mxfYvs/2Ty93lt5ezyP4dm7H8OPwr6P8GTXEfgPwckFr50cun2yTv5gXyU+zZD4P3vmCrgf389Aa2LpNRi06C2sZ/MuztjN5cwrIq4GS8iK0ec4x8mMMwONoOLkBma3ia4jjjnKAyJG5dVbHIDEAkZ74GfQVXthfSXEkt2I4ER5Eiihm8xZUJXbI+UUq4wRtBIG48txiNbW8h06OyiuP78X2kOfMhjw2xh5nmeZIPkBLnDHcx/unwz4ZSNF+0F40ZIZJiXvF2oVBAN5GC3zEDABye+AcAnAPuepNLZ6TqlyJb6VhE8saWsaPMmI/uwqVwzZBIDbss2OmAPK/2iJln+HNi6CQAawqfPGyHKxzKeGAOMjg9CMEZBBr0DwNBC3gXwtcNFGZ00e3RJCo3KrRRlgD1AJVSR32j0roBBCtw9wsUYndFR5Ao3MqklQT1IBZiB23H1qm5ska6uRPPK1pKZZ44ZpJSjiIDb5ak5+QqwjA5LBgNxBq5OJmt5Vt5I45yhEbyIXVWxwSoIJGe2Rn1FR3wt3s5IruDz7ebEMkRhMocOQuGUA5XnnIwBknABNeP23gDxPY/Hu816zjnttCu/N8y9hlgQ/vID/yzH3sS4I3IclQzbjkn1yztobO4uo4LWRBK/2iS4Zg3nSMSCMklsqFUcjAXYq8LheT+KnhWXxP4E1O10zTILrV5fI8gkIr/JKDw7YxhXlxz/Gw/iOZPhj4cu9C+HmiabrVjHFqFk8z7GKSGJmkkwVZSQCUcjIPRiPWusVLweXungOJXMmISN0Z3bVHzcMMpluQdrfKNw2yTxtNbyxJNJA7oVWWMKWQkfeG4EZHXkEeoNEwmZAIJI0fepJdCw27huGARyVyAexIODjBIJGmt4pXhkgd0DNFIVLISPunaSMjpwSPQmq97LDZvHez+YsUSSCWYzhIoI9u9nkBYAgbAM4JG7sCxq5UcEbQ28UTzSTuiBWlkChnIH3jtAGT14AHoBVO1iSygnkt9OnWSe7ZpVLq0jkvt80sX5UKAQM5CKqhQQEGhVORmubg2slpdxxI4cXCyqisUKMPuvvwSSMEYOxw3BG7/9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEAAAABAAQAAAACCEkxzAAABPElEQVR4AV1Ry03DQBB9O1jCJ+QDBQSJMnIwEoWkBW5ckPeWNlwCBUQkdJAjEiAvFWRvGMV4eDM2KGLk1b598+bjGWinboIWSKA19j6oQo+Ex08NajQQBT345ZqMBqMEcXYygOLDnxgS6Tf7DafYKwyAMrG+d4pXK9NsPcp11FjCnUdNjOJOK+tOtW/WDJVMBduBVA6sDaRsWJAXF3g0UPHs3UU+8qiu9Z6FhRWiEaJhsBsSUIxgX7KzgKKF1KQTVnzRWh4HcQJfoWRGZ4CHGQxncWYGA+faF8XkmqOA2Gcy9iyrN+kbdITXUj7jigDSLu0CihVGHyvzCGcB9sw51sCHNZ/yBgtBWKbLwgTY0kON4ob3JvB3zG7NhWww6Gj43y58G+S51ISR81HfS/ietKeLixzOE15OVznt/Qc+QZUYwSHPqwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=64x64>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "64 128\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCABAAIABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APa9PvoVENozRwhUjgSKe8EtwJvLLtFJy2XEYVs72LAseg3NoRwQwvM8UUaPM++VlUAu20LlvU7VUZPYAdqJxM1vKtvJHHOUIjeRC6q2OCVBBIz2yM+oqP7dbifyXk8uQy+SglUp5j7PMwmcb/lyflz91u6nFiqcTrO8F/YvHcQXKJulFyxTytrMrxqMqSSw5GMg5JO1RRHfrJcAIY5bd3MMcsLNJ+9Uv5ivtXCBdmMlvvEqQDjdYgjaG3iieaSd0QK0sgUM5A+8doAyevAA9AKkrL0+yVrLSJVlkijtrcbLeCFrWIkoACYj8yhRkCNjhc8gsqlbgt3gs4be1l2+VsUNOWmJQEbgSW3FioI3Enk5O7obFV0s0ivGuIj5XmZMyIigTOQgDscbiwVAo56HkHC4jmmgZw7i7zbXCp8kcoBd1CjhRh0xJyeVUgk4KEqRxrLqk1yJrvMKfZ/JYMkQJw5ZRgByQVG7LAbSBtO8GxDG0SFXmkmJdm3OFBALEhflAGADgd8AZJOSac729xZ31wbyea0MTxOloSxQoXEmwxDzPMzlSASQUGAGznQrk5/FfhCwEui6p4k0aSNnNn9muLlZGVRHhknLuxJyr5Z8ZyFOW5aS38ZeD7aWRYvFPhyO1b51iju4VIkZmaRid+DuLA9Ac7iSd3GponiLSvENv5um39pcOiI00UNzHM0BYcK/lswB4I4JBwcE1oQGZreJriOOOcoDIkbl1VscgMQCRnvgZ9BWfokWkQpfJo9xHKn22ZrlUuTMI7hm3SLyx2HcclBgAsTgZrUoqvM9xD9omVPtEaxAx28SgSM43EjczBTn5QAduCDk4PHJ+NviXpXgJ7VdYsNSYXTyLC1ukbBwioS3LggZkxzg5VuMYJ4+b9o3wqyAQWOso+9SS9rEw27huGBMOSuQD2JBwcYPaeBfiPo/xA+3/wBk219D9h8vzPtaIud+7GNrN/cPXHaukje5guAt3NHKkzlIRDauNhy7fO25gBsCDJ2jcD/fVRYghW2t4oEMhSNAimSRnYgDHLMSWPuSSe9SVTttRW7uJIoba7CRvJHJLLA0Sh1K8DfguDuJDKCp2n5umZLy6+yRBhbz3EjbhHFCmS7BS2MnCrnaQCxVckDOSK+WPAXw40fxB8Q/EHhTVrm+P9m+d5d1aOkW7ypRGcoyv97cDweMEc549Du/gB4OsrK/u57rxAkVmjPlrq1USqqBiVLKAo6rlyvKknAwTT1zTbf4HaH/AMJB4OvP7Q+3XYsLmPUZDMgK+Yfl8pkUMrIyncGIyQCvzBvYNH1G417SdE1aNPs1vd2iXcsYlDHLxgrGcp8y/OSWBU5ReCGIrQtont9tsq5tYokWOR52kkYjIIbcMnAC/MWJYk5xjJN6afp3mXl5mO3izNdXBVMhR8zuQAo6EnAAHoBViq9jNcT2ccl3a/ZbjkSRCQOAQSMqw6qcZBIBwRkKcgeAfDwW998WPFGheI4P+Eht7TfDb3GrQm+njEVyIkAJB2qfNJbAAGMnABr1+88DeD4ohcnwzpSR2+6WRIdKhkMqhT8u0Rlj1BAX5iVA5GQfN/jLpn/CGeDdM1Lw9N/Yl812ttdHRV+xR3OY2bcyocnBT5QWO0O3XOa9Y8PX1xrnhnQtWmk8ma5tIbqaOFRscvFkr82SFBbIwQflHJGQbmpacup2628lzdwxbwzi1naFpAAcKXXDqM4PylSdoBJBIJHdTTXU0US2jpDceVKVuCXRfKD/ADLt4fcy/KT90hs87auVXvrb7ZZyWxWB45cJKk8XmI8ZI3qVyM5XcOeBnJBHB+ePAuoaXp/x18cHV9RgsLWf7fD5s119nyWuV4V8ghsAkYIIxkdK9rj8aeCYXmeLxL4fR5n3yst/CC7bQuW+bk7VUZPYAdq8z+MV5YeNfBtlH4ZEGqau13DLcWunvHdXMcSxy/fEJb5VaXGclQX4Pzc+meC7KZPBnh6HUrGOKeysrYQiQEyRsLdVYkMoMbgtImBnjvyQOkqOeFbm3lgcyBJEKMY5GRgCMcMpBU+4II7VHfL5lnJEYp5FlxEwgk8t1ViFLBtyldoJbIO7jjJwKkgmW5t4p0EgSRA6iSNkYAjPKsAVPsQCO9fPnw3vrew/aA8aS3UnlQ/8TAtKynZGFuFYs7dEUBT8zEDoM5IB9/t7NLWWQwny4X58hEVUVyzM7jAzuYtk5J6ZwCSTHqWk6brNutvqmn2l9Arh1juoVlUNgjIDAjOCRn3NR2M8Vtpkai0+zQxSm1igghchVWQxphdgIXAU5A2gchio3GwtlEvl4ef93K8y5nc5Zt2Qcn5l+c4U5UYXAG1cRyXxNwba3hkedXAbzY5I49oKFyJNhUkLJkD+IgjI2sVE0+G0RBp0FpauqRQ5EAwIUbiMBSMAKzheyls4PINiOFYnmdTITK+9t0jMAdoX5QThRhRwMDOT1JJ+XNI8EWfj74yeNNJvLqe18uW+uIZYQDiQXAVdwI+ZfnJIBBOOorv7b9nXwncbbldU1wWssSNHG+yORScklt0eRkFflKgqQc5zgYnijw9pvwUsn1fQra7vri/dtMki1lFntZrdkDuf3YXB3Kq4cgkB8KQN1e3+GtSm1nwrpGqXCxrPe2UNxIsYIUM6BiBkk4yfU1oTwQ3VvLb3EUc0EqFJI5FDK6kYIIPBBHGKjSzRNRmvQf3ksUcTDYo4QuR8wG4/fPBJA7AEtmNIL6O3u0F7HJK7u1tJLBnygRlVcKV3hWzjG0lcAkkFyXt61s8cMcUbTzpILfzpljR5VXcIyeWyQGOVVsBGJ6AHwv4Wf8nC+Nf+37/0rSvoCq9xY29zLHLLH++i4SVGKOo3KxUMMHaSiZXOGxggjirFV/tDyT+XBFlY5fLnaUNHtGzcCmVxJyVHBA5bnKlasVXvkuJLOSO1fy5nwgkDBTGCQGdcqwLKMkAqQSADgHNR6irBLeaKwjvJ4riMorMqmIM2x5FJ6FY3c4HJGV714X8LP+ThfGv/AG/f+laV75BI01vFK8MkDugZopCpZCR907SRkdOCR6E15H+0d/yTzT/+wrH/AOipa7zwRCsvgDwo7GQGLTLV12yMoJ8gL8wBwwwx4ORnB6gEal1Y2S6jBqs0c7XUW2GN42kbaGO3GxeNpLAtkY+RGb/VqVsPK73iwxNt8vEk2+BiHRg4AV8hQ24An72AOQNymrFRwzw3KF4JY5UDshZGDAMrFWHHcMCCOxBFeB/Cz/k4Xxr/ANv3/pWle8P5UF4sh88yXGIQBvdBtDsCQMqn8WWOM/KCSdorx/4ya1qmneCJdQ0261zS5pvEAizJP5RKJC8Z8rY2RCxj3gNjJJbGCK9I8D3/APaHgjRJZLv7TdDT7b7S7Sb3EjQo53nruIYNzydwPetiX7HaSteS+RDJL5cDTPhS/wAxEaFu/wAzkAer8cmrFV7yb7NELmS5gt7WHdJcvMOBGFOfmyAmDgljkYBGOcixXzh4stvGHhb436r4h8K+Gr6eFtv+psZmt598K793l43fOSx5++MnkVf/AOFp/F//AKEL/wAo93/8XUltqnjb4nX9t4W8ZeEZNP0W5cvLexabNFJbsiMylJJSyqSQEJI5Dkd69Y0DSbTRPD9voA1O7ksAj2Vr9sd4rpmBkLBZMqSAo+TYowse4EjBHSVGZ4VuEt2ljE7ozpGWG5lUgMQOpALKCe24etRteIfMFuPtMkUqRSxwupaMttPzZIxhXDkddvQEkAyTQrOgRzIAHV/kkZDlWDDlSDjI5HQjIOQSK4vxf4KuNb0zU7zSI7HTvE0+xLe/TCPEEkxvE6xiUM8WFIyQPujjJbzSf4ZfF22t5Z38dyFI0LsI9UvXYgDPCqhLH2AJPatfw98LvHy63D/wlXimPU9FKSLcWr3styGJRgjeXNGY2KuVYbgQCoODivYH0qxe3tLcW0ccVm6PbLF+78koMLs242jblSBwVJU5BINiGRpULPDJCQ7LtcqSQGIDfKSMEDI74IyAcgE0bSoFSaSEh1bcgUkgMCV+YEYIGD3wTgg4Ijs7xL6BpYxhVlkiPzq3KOyHlSR1U8ZyOhAIIEkcbI8zNNJIHfcqsFxGNoG1cAHGQTzk5Y84wATCZkAgkjR96kl0LDbuG4YBHJXIB7Eg4OMEmEzIBBJGj71JLoWG3cNwwCOSuQD2JBwcYMlU4ZL6dyzwx2saXDLtc+Y8sQUgN8pAQluR975QMhWYhC8N2txatDHI8CvmUROgZskKAQwwUAZnJDKw8sABtxFF/Jbb7O2uoY5RcXAWNXKYDorSq2GIJIMeRtBIIBwACRcoquZvtdnM1hcwGT540lI81EkUlTuAIztYEFcg8EZBqvf6f9r8PXOmzj7f51o9u4uJPK+0ZQqd7Rr8u7uVXjPA7VcnMy28rW8cck4QmNJHKKzY4BYAkDPfBx6Gq9srQ3EkEFhHb2ivIzPuVd7sVfcqrnIZnk3Fip3L0IbdUn2nbqP2V2gXfF5kQ8394+Dh/kx91d0fzAnl8EDjNiv/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAIAAAABAAQAAAAD6rULSAAACK0lEQVR4AU1TMY7UQBAsz1l3m+0EBARI7Ek8gBcgS/AQnkC42c4FQHpPuJCMDyDWiA8QggDZSAQESDYS0i7y2E1Vzx6i5Onpru7p7unZhXVWsDNXwd1s4Jp2XAcDVWFnB+M37EksJ2r2HYO2yaU0C9cg6ihZQLJjiLKYfe0Mn2Xt9rQKlHRvxu+EYLhqgL6cR0KpcuvmziPd4PZgs712gubOKRfeutvLB6xJYVEPjhFrquEdnlcRN6VKzY1BHmrWlyMiLjzr2kbLKssINm+2nsZpEiGP8NK2pAOwxWo7Ql3z+xKOhg6rCDQQHoRXMy5dbRZm46yesnLLVSFUtPsAw7eHGTLIYxNQ2SbWXr25SG/AKnKiohyBJyQW2Y4oGRTzb4ItFplIG5TABqEQJEPKwFt8ZPrE2wovXIZn9PKKo6odqYfrXiqHRpxzBWykYmwplDAsOB9+JcT7NASWKeGXftfsUaWvP/LXTqS70lcSPSkguZN98faMcIUdyMULB1fyb7c+pQj+YIj6jmSU8EFyP2Wg7zQfz+ARbDdJEW64NLHE8YzU84ZC99EW+Qh10y+lunNNjfoe/WbvLT/isOaMs9lfn349TRjB38JS53quzn7qUBQfgjcQRbQSt1X0Lo8tkyxl6VR+oKFHrzHZodNOsKyDBflv3JtVfn1l4xGvXibst78Sy4gFIau4Byg5n+a7XrUyO44ILVX05zpTVT/Y7X9J5TpF+Exbmkf8BWiunYAWQRbWAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=128x64>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCABAAEABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APd7W5+0eejNB50MrRyJDL5mz+Jd3AwxRkYrjjd1IwSSzfZpWlubmCO1by44w42kSMxXlicHcWRQuAc9zuABdRO/kSRLvkilVgpnaNcH5WJ2g7sKzEKRgsF6HDCPTb9dRt2mQxsm87JImZo5EIDI6OVAcFGUkrkAkrk7SasTQrOgRzIAHV/kkZDlWDDlSDjI5HQjIOQSKkqOaFZ0COZAA6v8kjIcqwYcqQcZHI6EZByCRUcVvLHKrtezyKPMyjhMNuYMucKD8gG0c9D82481X057eWV5oLye886JJVnJJhaJmdk2FQIzgEjK/MVCbiflJ5//AITHwPNrH26DxF4cS8h/0ae4lmi8x4tu8RpIWHy7mVsjcuQwxuyV6ixv7PU7OO8sLuC7tZM7JoJBIjYJBww4OCCPwqPSYbG20axg0sxnT47eNLUxyb1MQUBMNk7htxzk5qS8e4iiEtunm+Xud4VUF5gFOEQllVWLbeWOOCOM7h5/4p+Nfhvwj4ju9Dv7LVZLq12b3gijKHciuMEyA9GHauk0LxOvijRNK8Q6YZIdLuXdJIJ7NpJ3O8xLgo5CAMCSSGG3k7cE10lU5tRWPSxfx213OHRWjgSBhK5bAVdjYKnJGd20LyWKgEj588FfCDwxr3wztvFOqXmqpMYp5ZkguIY0xG7jgyLheEHLMB6kCvR/hy6Wd5rfg3S7zfo2h/Y5bG8Qq8s6XAad9zYKMpJKgqo+U8HOGr0A3P2azmub9oLaOLe7uZcosak4ZmIGPlAJ7DkZIGTIZGFwkQhkKMjMZQV2qQRhTznJySMAj5TkjjPk/wANfDmg6z8KdO1nWNJtLy/kee4ubySwS5uJ9tw5YEsjM5YDbwCxB4wcGtjwhG0PxD8caNZzSWem2L6c1ra24VYoA0ZeRUQgqoc53YAJyTkHBHcG2mguLi6hkknkneJfLmmKxxRggHYApGcF25GWJALBQu2xPBDdW8tvcRRzQSoUkjkUMrqRggg8EEcYryf4PeJvDOlfDXSYr/xDptreBJUkguNQVCgE8rL+7ZsKSHznAJG3OQBjU8FWFnd/EDxzqNtaQTaNfS2E9pcxxhre4kRWZ3Rh8rMsvJIyQ/XmvRJ4VubeWBzIEkQoxjkZGAIxwykFT7ggjtQIVW4ecGTe6KhBkYrhSSMLnAPzHJAyeM5wMeZ/BJrO8+HGhCOXbfaf9oEi+WA5jkmkOMsufLYqDlCAWixn5WWvQEgt7XXGeDT4EmvYjJc3SREPIY9ioGYLhuHbG5gRj5QRuK2LOyisYjHE87KdvM07ynhQo5ck9FGfU5JySSZIYIbZCkEUcSF2cqihQWZizHjuWJJPckmvnT4ffBHR/F3hDS9fvNUvoftO/wA2CEJ/DJKnysQcZxEeQcbX67ht9L+HEEOheIPEng+0ijNnoSWiW91Io+0SrMJJiJHGAwVnfaABgMepJNdxPpltPf298Y41uIXDeYIkLOAkihSxUkAea5G0g8kZwzAxyvcadZs0l5BPtijihN2RCZZySo3yKNo3sUACpwc4ByFHH/BL/kkOhf8Abx/6USV6BUcZmLzCWONUD4iKuWLLtHLDA2nduGBngA55wI5bNLiVjcHzoT5ZWCRFZEdGLBxxndnaeScbFIAOSeH+CX/JIdC/7eP/AEokqPwxCs/xb+IiOZAA+lP8kjIcrCWHKkHGRyOhGQcgkV2n2cWH7u1lnEl3d+YWnM1yoJ+ZwMt+6UqrBeQisRgEnabhnhW4S3aWMTujOkZYbmVSAxA6kAsoJ7bh61538HPKX4Q+HJpfP/dyzbRDvOWa4lQblT7y/Pk7gVGNxxtyLHgnWbzVviF4z8576O1WLTJbeyuyQbXzLcsy7CSEbP3gO4rvIYIbZCkEUcSF2cqihQWZizHjuWJJPckmo4Zs3lzbvcwPIu2RYUGHjjYYG7k5yySYbAHGMZUk+AReJfiP4J1bXNI8P+Dp7nSBqt1LaeZplw6RxtIcLFsIUR8ZAAx8xPeuv8Ctqet3l34l8UW0/h/Ur7OnXNqtvNaQ3MICCOTcSCtxvmCI5YkqCqrkFl9Yorxvxv4A8bXN7/aXhrXY9Gt2t2nv7e1uZozJcl3d2CwRDzTtKoGK72CDOT16j4d+Dda0LS71fF2pR6zfz3EbpI8huAiR4aP55ED5DlmAJIU4K4JYnuIJGmt4pXhkgd0DNFIVLISPunaSMjpwSPQmo7C8TUNOtr2IYjuIklUb1fAYAj5kJU9eqkg9iRViq9w9wZY4rdNu752mdQyKAy5QjcG3MpbBAIGMnsrR3E405HnmlklSW4iREZo0Ee9kjAUnbkbjuwSWJYhc/KtXKy72Fdb0a9jQ2l/p99ZFIohIyLMHVs5mUn5GVlAKrkcn5sgC55O7UfPe2g/dxbIp85kG45dcY+VfkjPBOSOQNoJkhMzITPHGj72ACOWG3cdpyQOSuCR2JIycZP8A/9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEAAAABAAQAAAACCEkxzAAABRklEQVR4AVVSMU7DQBCcW6VwhwsKKjA1BSmoicVL8pOcEA+I+EDyDDqHF5AnmIYCIdkRRSLh3DK75wixPp9Huzuze7cOimwCDEh8wuh5gXbqhkaPjaFTCExGazxpE9YOqk/70NNfZEBWNuiPcn3/p0Nfaya3QGBRWV7XkWRAMVONM7USQLPsCRJXIpg9M0UqC234JlFFfTDfIgAFG5dHjUpFiIb7AMYmoX/QXTGdC0qEcsA6K0+YQw3a4QSKEbSkeRu3qrJB/0UmpEZ5bukkYngypDdv2rFbAQNnvBEXZPFqVB5LsH2Ib+xbBGmNYVtLD5mbmpR+wpzszJEevR+bioPJQbIHxfyS1Vcfqu2VXRRLbnrO4v0OqEtIZbIxtwqsEJnjZ4ro9OhTge9pPw5Fg5doTYoh5nSncetui4Xunfb3A1A9WYlfvj6o5a3k9k0AAAAASUVORK5CYII=",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=64x64>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "images=[]\n",
    "for text in ['A', 'BC']:#, '꧄']\n",
    "    img_width = glyph_size[0] * len(text)\n",
    "    img_height = glyph_size[1]\n",
    "    img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "    draw = ImageDraw.Draw(img)\n",
    "\n",
    "    for i, char in enumerate(text):\n",
    "        try:\n",
    "            bbox = font.getbbox(char)\n",
    "            w = bbox[2] - bbox[0]\n",
    "            h = bbox[3] - bbox[1]\n",
    "            x = i * glyph_size[0] + (glyph_size[0] - w) // 2\n",
    "            y = (glyph_size[1] - h) // 2 - bbox[1]\n",
    "            draw.text((x, y), char, fill=0, font=font)\n",
    "        except Exception as e:\n",
    "            print(f\"Error '{char}': {e}\")\n",
    "            pass\n",
    "\n",
    "    \n",
    "    print(img_height,img_width)\n",
    "    \n",
    "    img_aug = augment_image(img, text)\n",
    "    images.append(img_aug)\n",
    "    display(img_aug)\n",
    "    snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "\n",
    "    img_aug = img_aug.resize((64, 64), Image.BILINEAR) #LANCZOS)\n",
    "    display(img_aug)    \n",
    "    reduced = image_to_bitlist_polar_snake(img_aug)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "55b6b23f-8107-4f2b-99ca-a00f0907952a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "symbol: 1\n",
      "ord: U+0031\n",
      "name: DIGIT ONE\n",
      "category: Nd\n",
      "Bidirectional: EN\n",
      "Combining: 0\n",
      "Decomposition: \n",
      "Mirrored: 0\n",
      "Decimal value: 1\n",
      "isprintable: True\n",
      "isspace: False\n",
      "isalpha: False\n",
      "isupper?: False\n",
      "islower?: False\n",
      "Title-case?: False\n"
     ]
    }
   ],
   "source": [
    "describe_char('1') #'꧄')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "45c172c1-2606-465c-ad2d-85eacd01a0f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(os.path.join(output_dir, \"embeddings_abs.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for token, idx in sorted(vocab.items(), key=lambda x: x[1]):\n",
    "        text = token\n",
    "        img_width = glyph_size[0] * len(text)\n",
    "        img_height = glyph_size[1]\n",
    "        img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "        draw = ImageDraw.Draw(img)\n",
    "\n",
    "        for i, char in enumerate(text):\n",
    "            try:\n",
    "                bbox = font.getbbox(char)\n",
    "                w = bbox[2] - bbox[0]\n",
    "                h = bbox[3] - bbox[1]\n",
    "                x = i * glyph_size[0] + (glyph_size[0] - w) // 2\n",
    "                y = (glyph_size[1] - h) // 2 - bbox[1]\n",
    "                draw.text((x, y), char, fill=0, font=font)\n",
    "            except Exception as e:\n",
    "                print(f\"Error '{char}': {e}\")\n",
    "                pass\n",
    "                \n",
    "        img_aug = augment_image(img, text)\n",
    "        \n",
    "        if img_aug.size != (64, 64):\n",
    "            img_aug = img_aug.resize((64, 64), Image.BILINEAR)\n",
    "            \n",
    "        snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "        \n",
    "        f.write(f\"{idx}\\n\")   \n",
    "        f.write(f\"{str(snake_bits)}\\n\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "3377785a-4fb3-43d5-a16e-de610dcc2838",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_binary_file_np(filename):\n",
    "    data = []\n",
    "    with open(filename, 'r') as f:\n",
    "        while True:\n",
    "            index_line = f.readline()\n",
    "            if not index_line:\n",
    "                break\n",
    "            list_line = f.readline()\n",
    "            bits = np.fromstring(list_line.replace('[','').replace(']','').strip(), dtype=int, sep=',')\n",
    "            if bits.size != 4096:\n",
    "                print(index_line)\n",
    "                print(list_line)\n",
    "                raise ValueError(\"Invalid bit list length\")\n",
    "            data.append(bits)\n",
    "    return np.array(data, dtype=np.uint8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "c67c5539-6edc-4cfa-8516-2d26bfca839f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_embeddings = load_binary_file_np(os.path.join(output_dir, \"embeddings_abs.txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "902ee687-1805-4348-82c2-71bb1d7e9ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "6053fbc0-ef14-4268-81a3-908cd2a0dd5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_pca(data, n_components=4096):\n",
    "    pca = PCA(n_components=4096)\n",
    "    transformed = pca.fit_transform(data)\n",
    "    return transformed, pca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "012b9fb5-813c-4e75-b06f-b02d1ef1769c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_embeddings_pca, _pca = apply_pca(data_embeddings.astype(np.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "0d60cfca-b0e1-49eb-b1f0-2faecc29e20e",
   "metadata": {},
   "outputs": [],
   "source": [
    "normalized_embeddings = normalize(data_embeddings_pca, norm='l2', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "4b3da680-0025-48b6-88ba-af5899ad5540",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "2612512c-3d9b-4cfe-8687-89bb86110a63",
   "metadata": {},
   "outputs": [],
   "source": [
    "tensor_normalized_embeddings_weights = torch.tensor(normalized_embeddings, dtype=torch.float32)\n",
    "torch.save(tensor_normalized_embeddings_weights, os.path.join(output_dir, \"normalized_embeddings_weights.pt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7dbd352-7152-41e7-aef9-345cc8c3a19f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "356e06bb-d60f-4f79-8791-6df41815b2d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "7928ec7f-a2db-4b06-a313-0981b494d483",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(output_dir) \n",
    "tokenizer.is_fast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "89d2ccbe-cbc4-499a-a72e-f93e3a2c6a17",
   "metadata": {},
   "outputs": [],
   "source": [
    "ids = tokenizer.encode('hello the test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "5ede5794-25fb-4d36-886e-904fea1797e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[70960, 32, 113757, 32, 99060]\n"
     ]
    }
   ],
   "source": [
    "print(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "c0e2cdf1-65b3-43e9-82db-66488a305197",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello the test\n"
     ]
    }
   ],
   "source": [
    "print(tokenizer.decode(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9de6442-d59e-4f23-bba4-5529068dff45",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3_11",
   "language": "python",
   "name": "python3_11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
