{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c078adf3-b73f-4d4a-a90a-c39743f7c93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "from datasets import load_dataset\n",
    "import unicodedata\n",
    "import tiktoken\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from datasets import get_dataset_config_names, load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8355cc14-553e-454e-bf34-377536fa2e24",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dc11ddc0-c349-4ed9-9c17-c698cb859407",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_list_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.writelines(f\"{item}\\n\" for item in data_list)\n",
    "\n",
    "def load_list_from_file(filename):\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        return [line.strip() for line in f]\n",
    "\n",
    "def save_results_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.write(str(data_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7f2eea52-1de8-4e7f-9ddb-723eea61961f",
   "metadata": {},
   "outputs": [],
   "source": [
    "######### TOKENIZER #########"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1ecb3167-54e8-4cc9-92cf-a74a0f49d370",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import ast\n",
    "\n",
    "from tokenizers import Tokenizer\n",
    "from tokenizers.models import BPE\n",
    "from tokenizers.pre_tokenizers import Whitespace\n",
    "from collections import defaultdict\n",
    "from tokenizers.pre_tokenizers import PreTokenizer\n",
    "from transformers import PreTrainedTokenizerFast\n",
    "\n",
    "from tokenizers import (\n",
    "    decoders,\n",
    "    models,\n",
    "    normalizers,\n",
    "    pre_tokenizers,\n",
    "    processors,\n",
    "    trainers,\n",
    "    Tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d4baa3a5-b167-4c00-bf4d-aed6beea0457",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_file = \"_n-gramms-2-3.txt\"\n",
    "with open(input_file, \"r\", encoding=\"utf-8\") as f:\n",
    "    raw_text = f.read()\n",
    "    ngram_data = ast.literal_eval(raw_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "92e896bd-5c5b-4151-a895-cf9face5f558",
   "metadata": {},
   "outputs": [],
   "source": [
    "output_dir = \"_bvv241-2-3\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "top_cnt = 338 #338 is optimal fo 2-3-gramms fitting in unused unicod ranges. 1000 for 2-3-4-5   171 for 2-3-4  800 for 2 (bigramms)\n",
    "\n",
    "UNICODE_LIMIT =   0xD800 # from 0 до D7FF monograms\n",
    "SURROGATE_START = 0xD800\n",
    "SURROGATE_END =   0xE000\n",
    "\n",
    "SPECIAL_TOKENS_START = 0xE000\n",
    "SPECIAL_TOKENS_LIMIT = SPECIAL_TOKENS_START + 256  # 0xE100 = 57600\n",
    "NGRAM_START = SPECIAL_TOKENS_LIMIT  # 0xE100 = 57600\n",
    "NGRAM_LIMIT = 0xF900\n",
    "\n",
    "VOCAB_SIZE_TARGET = 65536\n",
    "\n",
    "NGRAM_START_EXT = 0x10000\n",
    "NGRAM_LIMIT_EXT = 0x1FFFF\n",
    "\n",
    "VOCAB_SIZE_TARGET = 65536"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e038761-39e4-4276-bd47-8348833abe45",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9214200d-c270-44a9-8228-11dc6a3f86f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab = {}\n",
    "\n",
    "for cp in range(0x0000, UNICODE_LIMIT): \n",
    "    try:\n",
    "        ch = chr(cp)\n",
    "    except ValueError:\n",
    "        continue\n",
    "    if ch not in vocab:\n",
    "        vocab[ch] = cp\n",
    "\n",
    "#for cp in range(SURROGATE_START, SURROGATE_END):\n",
    "#    vocab[f\"<surrogate_{cp}>\"] = cp\n",
    "\n",
    "special_tokens = [\n",
    "    \"<pad>\", \"<s>\", \"</s>\", \"<unk>\", \"<think>\", \"</think>\", \"<emotions>\", \"</emotions>\", \"<tool_call>\", \"</tool_call>\", \"<tool_response>\", \"</tool_response>\", \"[INST]\", \"[/INST]\", \"[EOT]\", \"[USER]\", \"[ASSISTANT]\", \"[FIM_PREFIX]\", \"[FIM_MIDDLE]\", \"[FIM_SUFFIX]\", \"[FIM_PAD]\", \"[REPO_NAME]\", \"[FILE_SEP]\"\n",
    "]\n",
    "for i, tok in enumerate(special_tokens):\n",
    "    vocab[tok] = SPECIAL_TOKENS_START + i\n",
    "\n",
    "for cp in range(SPECIAL_TOKENS_START + len(special_tokens), SPECIAL_TOKENS_LIMIT):\n",
    "    vocab[f\"<special_token_{cp}>\"] = cp\n",
    "\n",
    "for cp in range(NGRAM_LIMIT, 0xFFFF): \n",
    "    try:\n",
    "        ch = chr(cp)\n",
    "    except ValueError:\n",
    "        continue\n",
    "    if ch not in vocab:\n",
    "        vocab[ch] = cp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b3181781-2661-49bc-9f91-8e7a62ece80d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "57343"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "26fe34c7-d113-4e3a-9f03-be0ecab94bee",
   "metadata": {},
   "outputs": [],
   "source": [
    "languages =     [\"en\", \"ru\", \"fr\", \"de\", \"zh\", \"he\", \"it\", \"es\", \"ar\", \"pt\", \"ko\", \"hu\", \"sa\", \"la\", \"ja\", \"el\",     \"sv\", \"nl\", \"pl\", \"vi\", \"fa\", \"no\", \"tr\", \"fi\", \"cs\", \"hy\", \"da\", \"bn\", \"az\", \"ka\", \"hi\", \"id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "692ec456-a7ef-4510-a883-1518c0df60a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "ngram_set = set()\n",
    "for entry in ngram_data:\n",
    "    if entry[\"lang\"] not in languages:\n",
    "        continue\n",
    "    top_ngrams = entry[\"top_ngrams\"]\n",
    "    for n in sorted(top_ngrams.keys(), reverse=True):  # 5 → 2\n",
    "        top_list = top_ngrams[n][:top_cnt]\n",
    "        for gram, _ in top_list:\n",
    "            ngram_set.add(gram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c52cca4d-88c1-44ee-9ebc-9572ea14b755",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8177"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c90e32c4-9077-443d-966a-51e19b68f75c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SURROGATE_END next_id= 55296 57344 57344\n"
     ]
    }
   ],
   "source": [
    "next_id = SURROGATE_START\n",
    "added_ng = []\n",
    "for ng in sorted(ngram_set, key=lambda x: (-len(x), x)): \n",
    "    vocab[ng] = next_id\n",
    "    added_ng.append(ng)\n",
    "    next_id += 1\n",
    "    if next_id >= SURROGATE_END:\n",
    "        print('SURROGATE_END next_id=',SURROGATE_START, SURROGATE_END, next_id)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3d45c718-3fb4-4ffc-806e-6977ce2690ab",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59391"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "919a9a10-0e94-4d4e-8cf3-dcf2e7a9368d",
   "metadata": {},
   "outputs": [],
   "source": [
    "next_id = NGRAM_START\n",
    "for ng in sorted(ngram_set, key=lambda x: (-len(x), x)): \n",
    "    if ng not in added_ng:\n",
    "        vocab[ng] = next_id\n",
    "        added_ng.append(ng)\n",
    "        next_id += 1\n",
    "        if next_id >= NGRAM_LIMIT:\n",
    "            print('NGRAM_LIMIT next_id=',NGRAM_START, NGRAM_LIMIT, next_id)\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8811ffed-2eab-4cdc-b740-305488df6731",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15\n"
     ]
    }
   ],
   "source": [
    "print(NGRAM_LIMIT - next_id) #unused in this case (for 2-3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9f81e9bc-ca7e-4181-afb7-88a8271e700c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65520"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84896bed-f747-4f06-8040-2d24a5b8e34a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "21425c91-64db-407f-820e-3c4614e62e86",
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- dummy-tokens for unused positions\n",
    "used_ids = set(vocab.values())\n",
    "for fill_id in range(0, VOCAB_SIZE_TARGET):\n",
    "    if fill_id not in used_ids:\n",
    "        vocab[f\"<unused_{fill_id}>\"] = fill_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "e551523c-ac5c-4dff-b4e2-349fc59e330c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65536"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "3caa6318-b4d2-4031-8372-38dd45e7681a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_merges_from_ngrams(vocab, ngram_set):\n",
    "    merges = []\n",
    "    for piece in sorted(ngram_set, key=lambda x: (-len(x), x)):\n",
    "        if piece not in vocab:\n",
    "            continue\n",
    "        i = 1\n",
    "        while i <= len(piece):\n",
    "            left = piece[:i]\n",
    "            right = piece[i:]\n",
    "            if left in vocab and right in vocab:\n",
    "                merges.append((left, right))\n",
    "                #break\n",
    "            i += 1\n",
    "    return merges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "b6706826-c733-412e-b344-831063070112",
   "metadata": {},
   "outputs": [],
   "source": [
    "merges = generate_merges_from_ngrams(vocab, ngram_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cebe3051-62d5-415a-ad57-452ceac4a295",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12224"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(merges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "803f0a9e-2ca6-4d3f-a1ad-e9c1dffb7a94",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "tokenizer = Tokenizer(BPE(vocab=vocab, merges=merges, unk_token=\"<unk>\"))\n",
    "tokenizer.pre_tokenizer = None             \n",
    "tokenizer.decoder = decoders.Sequence([])  \n",
    "tokenizer.post_processor = None            \n",
    "tokenizer.add_special_tokens(special_tokens)\n",
    "tokenizer.save(os.path.join(output_dir, \"tokenizer.json\"))\n",
    "\n",
    "with open(os.path.join(output_dir, \"merges.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for m in merges:\n",
    "        f.write(f\"{m[0]} {m[1]}\\n\")\n",
    "\n",
    "with open(os.path.join(output_dir, \"vocab.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for token, idx in sorted(vocab.items(), key=lambda x: x[1]):\n",
    "        f.write(f\"{idx:>6} | {repr(token):<12} | len={len(token)}\\n\")\n",
    "\n",
    "with open(os.path.join(output_dir, \"vocab.json\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump({k: v for k, v in sorted(vocab.items(), key=lambda x: x[1])},\n",
    "              f, indent=2, ensure_ascii=False)\n",
    "\n",
    "with open(os.path.join(output_dir, \"tokenizer_config.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"model_type\": \"gpt2\",\n",
    "        \"bos_token\": \"<s>\",\n",
    "        \"eos_token\": \"</s>\",\n",
    "        \"unk_token\": \"<unk>\",\n",
    "        \"pad_token\": \"<pad>\",\n",
    "        \"vocab_size\": 65536\n",
    "    }, f, indent=2)\n",
    "\n",
    "with open(os.path.join(output_dir, \"special_tokens_map.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"unk_token\": \"<unk>\",\n",
    "        \"pad_token\": \"<pad>\",\n",
    "        \"bos_token\": \"<s>\",\n",
    "        \"eos_token\": \"</s>\"\n",
    "    }, f, indent=2)\n",
    "\n",
    "with open(os.path.join(output_dir, \"special_tokens_map.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"model_type\": \"PreTrainedTokenizerFast\",\n",
    "          \"model_type\": \"gpt2\",\n",
    "          \"bos_token\": \"<s>\",\n",
    "          \"eos_token\": \"</s>\",\n",
    "          \"unk_token\": \"<unk>\",\n",
    "          \"pad_token\": \"<pad>\",\n",
    "          \"vocab_size\": 65536\n",
    "    }, f, indent=2)\n",
    "\n",
    "with open(os.path.join(output_dir, \"config.json\"), \"w\") as f:\n",
    "    json.dump({\n",
    "        \"model_type\": \"gpt2\",\n",
    "        \"bos_token\": \"<s>\",\n",
    "        \"eos_token\": \"</s>\",\n",
    "        \"unk_token\": \"<unk>\",\n",
    "        \"pad_token\": \"<pad>\",\n",
    "        \"vocab_size\": 65536\n",
    "    }, f, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6405817d-8f7b-45a8-9205-98421903ef61",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1c82d08e-3f22-463f-b10e-55a3cacff309",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image, ImageDraw, ImageFont, ImageFilter\n",
    "from IPython.display import display\n",
    "import unicodedata\n",
    "import math\n",
    "import random\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "267bf5a6-5e8a-4de1-937d-ab1e43bcfd1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#https://en.wikipedia.org/wiki/GNU_Unifont"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e95afa52-b339-4132-ab35-0a6cab0a74da",
   "metadata": {},
   "source": [
    "'''\n",
    "GNU Unifont is a free Unicode bitmap font created by Roman Czyborra. \n",
    "The main Unifont covers all of the Basic Multilingual Plane (BMP)\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "dc4472c4-1647-4b77-807a-bf9cf412c1cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "#we use unifont-14.0.01.ttf for embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f5d53d1-5a87-4e5e-ac3b-f7aa08059c33",
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget -q https://ftp.gnu.org/gnu/unifont/unifont-14.0.01/unifont-14.0.01.ttf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2209e8ee-2663-47ab-8528-e20b37883905",
   "metadata": {},
   "outputs": [],
   "source": [
    "font_size = 32                     \n",
    "glyph_size = (32, 32)              \n",
    "font = ImageFont.truetype('unifont-14.0.01.ttf', font_size)\n",
    "random.seed(42)\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "2a4eaa5b-934c-4b01-8c93-5995f219c30b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def describe_char(c):\n",
    "    print(f\"symbol: {c}\")\n",
    "    print(f\"ord: U+{ord(c):04X}\")\n",
    "    print(f\"name: {unicodedata.name(c, 'UNKNOWN')}\")\n",
    "    print(f\"category: {unicodedata.category(c)}\")\n",
    "    print(f\"Bidirectional: {unicodedata.bidirectional(c)}\")\n",
    "    print(f\"Combining: {unicodedata.combining(c)}\")\n",
    "    print(f\"Decomposition: {unicodedata.decomposition(c)}\")\n",
    "    print(f\"Mirrored: {unicodedata.mirrored(c)}\")\n",
    "    print(f\"Decimal value: {unicodedata.decimal(c, '—')}\")\n",
    "    print(f\"isprintable: {c.isprintable()}\")\n",
    "    print(f\"isspace: {c.isspace()}\")\n",
    "    print(f\"isalpha: {c.isalpha()}\")\n",
    "    print(f\"isupper?: {c.isupper()}\") \n",
    "    print(f\"islower?: {c.islower()}\") \n",
    "    print(f\"Title-case?: {c.istitle()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "080319d8-1a86-49d3-89bd-ccb6096dece9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_complex_script(s):\n",
    "    complex_keywords = ['CJK', 'ARABIC', 'HEBREW', 'DEVANAGARI', 'BENGALI', 'THAI', 'KANNADA', 'TAMIL', 'TELUGU', 'KHMER', 'MYANMAR', 'SINHALA', 'SYRIAC']\n",
    "    for c in s:\n",
    "        name = unicodedata.name(c, '')\n",
    "        if any(kw in name for kw in complex_keywords):\n",
    "            return True\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "a58505d7-8881-4dcf-bf1a-b1f157c5ed15",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_easy_script(s):\n",
    "    for c in s:\n",
    "        code = ord(c)\n",
    "        if not (\n",
    "            (0x0000 <= code <= 0x007F) or  # Basic Latin\n",
    "            (0x0080 <= code <= 0x00FF) or  # Latin-1 Supplement\n",
    "            (0x0100 <= code <= 0x024F) or  # Latin Extended A/B\n",
    "            (0x0370 <= code <= 0x03FF) or  # Greek and Coptic\n",
    "            (0x0400 <= code <= 0x04FF) or  # Cyrillic\n",
    "            (0x0500 <= code <= 0x052F) or  # Cyrillic Supplement\n",
    "            (0x0530 <= code <= 0x058F)     # Armenian\n",
    "        ):\n",
    "            return False  \n",
    "    return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "553bc206-d629-440d-81b6-0903625e3db4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_row_major(img):\n",
    "    bw = img.convert('1')  \n",
    "    pixels = bw.load()\n",
    "    width, height = bw.size\n",
    "    bits = [0] * (width * height)\n",
    "    for y in range(height):\n",
    "        for x in range(width):\n",
    "            bits[y * width + x] = 0 if pixels[x, y] else 1\n",
    "    return bits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "1d4c8061-1fef-4976-b695-5bb730c1d365",
   "metadata": {},
   "outputs": [],
   "source": [
    "def image_to_bitlist_polar_snake(img):\n",
    "    bw = img.convert('1')\n",
    "    pixels = bw.load()\n",
    "    width, height = bw.size\n",
    "    cx, cy = width / 2, height / 2\n",
    "    coords = []\n",
    "\n",
    "    for y in range(height):\n",
    "        for x in range(width):\n",
    "            dx = x - cx\n",
    "            dy = y - cy\n",
    "            r = math.hypot(dx, dy)\n",
    "            theta = math.atan2(dy, dx)\n",
    "            coords.append((r, theta, x, y))\n",
    "\n",
    "    coords.sort() \n",
    "    bitlist = []\n",
    "    for r, theta, x, y in coords:\n",
    "        value = 0 if pixels[x, y] else 1\n",
    "        bitlist.append(value)\n",
    "    return bitlist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "903751a3-37ec-4c6b-9c55-e8982264b2b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def augment_image(img, chars):\n",
    "    is_complex = not is_easy_script(chars)\n",
    "\n",
    "    random.seed(42)\n",
    "    np.random.seed(42)\n",
    "    \n",
    "    max_rotation = 0 if is_complex else 4\n",
    "    max_shift = 0 if is_complex else 2\n",
    "    noise_level = 0.001 if is_complex else 0.01\n",
    "    apply_blur = False if is_complex else True\n",
    "\n",
    "    angle = random.uniform(-max_rotation, max_rotation)\n",
    "    img = img.rotate(angle, expand=0, fillcolor=255)\n",
    "\n",
    "    dx = random.randint(-max_shift, max_shift)\n",
    "    dy = random.randint(-max_shift, max_shift)\n",
    "    shifted = Image.new(\"L\", img.size, color=255)\n",
    "    shifted.paste(img, (dx, dy))\n",
    "    img = shifted\n",
    "\n",
    "    np_img = np.array(img)\n",
    "\n",
    "    noise_mask = np.random.rand(*np_img.shape) < noise_level\n",
    "    np_img[noise_mask] = 0  \n",
    "\n",
    "    img = Image.fromarray(np_img)\n",
    "    if apply_blur:\n",
    "        img = img.filter(ImageFilter.GaussianBlur(radius=0.5))\n",
    "\n",
    "    bw = img.point(lambda p: 0 if p < 128 else 255, mode='1') \n",
    "    return bw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "506f8caa-fffa-4a41-ae2a-e21c4633eace",
   "metadata": {},
   "outputs": [],
   "source": [
    "images=[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "5fc61ccb-b581-4b0a-8918-9804e00a7e54",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 32\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe4DM1vE1xHHHOUBkSNy6q2OQGIBIz3wM+grn9f8ZaB4It7CLxFq8kTzoVilkgZ2mKBdzHykwD8wPQDngV0EEy3NvFOgkCSIHUSRsjAEZ5VgCp9iAR3qSivP/hl/wAT3+1PHv8AqP8AhJPK/wBB+99n+z74f9Zxv3Y3fdGOnPWj4Zf8SL+1PAX+v/4Rvyv9O+79o+0b5v8AV87Nudv3jnrx0r0CivD/AIW/C3wb4j+HGk6tq2jfaL6fzvMl+1TJu2zOo4VwBwAOBXQfDfRNO8OfEfx9pOk2/wBnsYP7O8uLez7d0LseWJJ5JPJr1Cis/RNE07w5o8Gk6Tb/AGexg3eXFvZ9u5ix5Yknkk8mo4PD2mWut3msQQSR3968b3MizyASmNGjTcu7aQFY8YxnB6gEalf/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAASElEQVR4AWP8z8DAwATEQKKBgYERzAUyoCyGj0D2/w///zMx3HeAqQPKgnUw/Lc/D5SAcaFiDPwQMWyyaIqB3D8wi1BMQagDAEIhFM9rGfUKAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe4DM1vE1xHHHOUBkSNy6q2OQGIBIz3wM+grn9f8ZaB4It7CLxFq8kTzoVilkgZ2mKBdzHykwD8wPQDngV0EEy3NvFOgkCSIHUSRsjAEZ5VgCp9iAR3qSivP/hl/wAT3+1PHv8AqP8AhJPK/wBB+99n+z74f9Zxv3Y3fdGOnPWj4Zf8SL+1PAX+v/4Rvyv9O+79o+0b5v8AV87Nudv3jnrx0r0CivD/AIW/C3wb4j+HGk6tq2jfaL6fzvMl+1TJu2zOo4VwBwAOBXQfDfRNO8OfEfx9pOk2/wBnsYP7O8uLez7d0LseWJJ5JPJr1Cis/RNE07w5o8Gk6Tb/AGexg3eXFvZ9u5ix5Yknkk8mo4PD2mWut3msQQSR3968b3MizyASmNGjTcu7aQFY8YxnB6gEalf/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAASElEQVR4AWP8z8DAwATEQKKBgYERzAUyoCyGj0D2/w///zMx3HeAqQPKgnUw/Lc/D5SAcaFiDPwQMWyyaIqB3D8wi1BMQagDAEIhFM9rGfUKAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32 64\n"
     ]
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgAEABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APe4I2ht4onmkndECtLIFDOQPvHaAMnrwAPQCvO0+Mvw/t4rWG38TYhhwr+dZ3UrugUgDeVzuztJZt2cHucjtPDmtWniHw/Z6pZXkd5BMhH2iOF4VkZSVYhH+ZRuU8En6nrWpRXH638UvBvhzWJ9J1bWfs99Bt8yL7LM+3coYcqhB4IPBroNE1vTvEejwatpNx9osZ93ly7GTdtYqeGAI5BHIrQorw/4W/FLwb4c+HGk6Tq2s/Z76DzvMi+yzPt3TOw5VCDwQeDXYeAv+Jp4v8W+K7L97oms/Y/sF193zvJjaOT5Dhlw4I+YDPUZFegUV5/8Ev8AkkOhf9vH/pRJW5pvhqbSfGeu69FcRzprT2wlhYFDbrDEyZU87yWK8HbgE8nGD0lRgzfaHVo4xAEUo4cli2TuBXGAANuDk5yeBjnz/wCCM8LfCnRbdZYzOiTO8YYblVriYKSOoBKsAe+0+lHgqNofit8QYnmkndE0tWlkChnItz8x2gDJ68AD0Ar0SivO/gjPC3wp0W3WWMzokzvGGG5Va4mCkjqASrAHvtPpXSaR4n/tTxf4j0D7H5X9jfZv3/m7vO86Mv8AdwNuMY6nPtXQVHDMs6F0EgAdk+eNkOVYqeGAOMjg9CMEZBBrg/8AhSXw8/6F7/yduP8A45XYaJomneHNHg0nSbf7PYwbvLi3s+3cxY8sSTySeTVyaRokDJDJMS6rtQqCAWALfMQMAHJ74BwCcAyVxev/AAx8H+INbbVNU0CS8vLtws063UiBQqYDMBIoxhFX5QTkjjGSOk0TRNO8OaPBpOk2/wBnsYN3lxb2fbuYseWJJ5JPJrQr/9k=",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAEAAAAAgAQAAAACK5kMpAAAAdUlEQVR4AXWP3Q2AIAyED+IADuIOZTMZwZFwAh3Fd421LX8+aJMmX8vdAY6RyyMWmjkX+MzouqaJXQDWAF8WANPG2m3zC1N1DeICiYvfLlos822PdVPgsHmHHw1QAxX0TNuSE/Xkr1+ITkvvunEJJB3kXU38AKJBLhfRnF+IAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=64x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/wAALCAAgACABAREA/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/9oACAEBAAA/APW/FvjXQ/B6Wg1jVI7B7pyYi9rLMHVGXzBhBwdrYBPQkHDYIrcsL631PTra/s5PMtbqJJoX2kbkYAqcHkZBHWs/xJ4p0bwjp0d/rl59ktZJRCr+U8mXIJAwgJ6KfyrQsL631PTra/s5PMtbqJJoX2kbkYAqcHkZBHWuP8X+KdG8I+MtCv8AXLz7JayaffQq/lPJlzJakDCAnop/KtjwX8/hpLleYby7u72Bv78M1xJLG2O2UdTg8jOCAcii8/5KHo3/AGCr/wD9G2lXNC02bSrW6tZWjdGvbi4idScss0rS4YY4IaRl4JyFB4ztGXdXPl/EPQkvGggmm0/Uo4UEufMxLbMuMgZYopYqBxhuSBmrHg3/AJAdz/2FdS/9LZqj1CeGH4jaAkssaPNpl+kSswBdvMtWwvqdqscDsCe1amjan/a9jJc+T5Wy7ubbbu3Z8mZ4t2cDrszjtnHPWsvX/B/hvx1b2E+t6bJdJChe3EjzQMgcKTlQVIPyrwwyMdua3LCxt9M062sLOPy7W1iSGFNxO1FACjJ5OAB1rL8VeGtI8U6XFZazpkmoW6XCSLCkxjKtym/IZeFV2JGegOATgVqWFjb6Zp1tYWcfl2trEkMKbidqKAFGTycADrX/2Q==",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgAQAAAABbAUdZAAAAVUlEQVR4AU1OQQ6AMAjrSMy+safwM77GkzxqMq04h3qANG1pKQQgMXOdgcrLYbXNHrUq9efLi3ZzTXbPgKqJ0gIJ0SX42dHRIYcvPvK+toyPH4zkEC4SHhaTBRkQ3wAAAABJRU5ErkJggg==",
      "text/plain": [
       "<PIL.Image.Image image mode=1 size=32x32>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for text in ['A', 'BC']:#, '꧄']\n",
    "    img_width = glyph_size[0] * len(text)\n",
    "    img_height = glyph_size[1]\n",
    "    img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "    draw = ImageDraw.Draw(img)\n",
    "\n",
    "    for i, char in enumerate(text):\n",
    "        try:\n",
    "            bbox = font.getbbox(char)\n",
    "            w = bbox[2] - bbox[0]\n",
    "            h = bbox[3] - bbox[1]\n",
    "            x = i * glyph_size[0] + (glyph_size[0] - w) // 2\n",
    "            y = (glyph_size[1] - h) // 2 - bbox[1]\n",
    "            draw.text((x, y), char, fill=0, font=font)\n",
    "        except Exception as e:\n",
    "            print(f\"Ошибка при отображении символа '{char}': {e}\")\n",
    "            pass\n",
    "\n",
    "    \n",
    "    print(img_height,img_width)\n",
    "    \n",
    "    img_aug = augment_image(img, text)\n",
    "    images.append(img_aug)\n",
    "    display(img_aug)\n",
    "    snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "\n",
    "    img_aug = img_aug.resize((32, 32), Image.BILINEAR) #LANCZOS)\n",
    "    display(img_aug)    \n",
    "    reduced = image_to_bitlist_polar_snake(img_aug)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "55b6b23f-8107-4f2b-99ca-a00f0907952a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "symbol: 1\n",
      "ord: U+0031\n",
      "name: DIGIT ONE\n",
      "category: Nd\n",
      "Bidirectional: EN\n",
      "Combining: 0\n",
      "Decomposition: \n",
      "Mirrored: 0\n",
      "Decimal value: 1\n",
      "isprintable: True\n",
      "isspace: False\n",
      "isalpha: False\n",
      "isupper?: False\n",
      "islower?: False\n",
      "Title-case?: False\n"
     ]
    }
   ],
   "source": [
    "describe_char('1') #'꧄')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "c1d96ab7-a714-472a-b719-611956d96cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(os.path.join(output_dir, \"embeddings.txt\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    for token, idx in sorted(vocab.items(), key=lambda x: x[1]):\n",
    "        text = token\n",
    "        img_width = glyph_size[0] * len(text)\n",
    "        img_height = glyph_size[1]\n",
    "        img = Image.new(\"L\", (img_width, img_height), color=255)\n",
    "        draw = ImageDraw.Draw(img)\n",
    "\n",
    "        for i, char in enumerate(text):\n",
    "            try:\n",
    "                bbox = font.getbbox(char)\n",
    "                w = bbox[2] - bbox[0]\n",
    "                h = bbox[3] - bbox[1]\n",
    "                x = i * glyph_size[0] + (glyph_size[0] - w) // 2\n",
    "                y = (glyph_size[1] - h) // 2 - bbox[1]\n",
    "                draw.text((x, y), char, fill=0, font=font)\n",
    "            except Exception as e:\n",
    "                print(f\"Error: '{char}': {e}\")\n",
    "                pass\n",
    "                \n",
    "        img_aug = augment_image(img, text)\n",
    "        \n",
    "        if img_aug.size != (32, 32):\n",
    "            img_aug = img_aug.resize((32, 32), Image.BILINEAR)\n",
    "            \n",
    "        snake_bits = image_to_bitlist_polar_snake(img_aug)\n",
    "        \n",
    "        f.write(f\"{idx}\\n\")   \n",
    "        f.write(f\"{str(snake_bits)}\\n\")    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "3377785a-4fb3-43d5-a16e-de610dcc2838",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_binary_file_np(filename):\n",
    "    data = []\n",
    "    with open(filename, 'r') as f:\n",
    "        while True:\n",
    "            index_line = f.readline()\n",
    "            if not index_line:\n",
    "                break\n",
    "            list_line = f.readline()\n",
    "            bits = np.fromstring(list_line.replace('[','').replace(']','').strip(), dtype=int, sep=',')\n",
    "            if bits.size != 1024:\n",
    "                print(index_line)\n",
    "                print(list_line)\n",
    "                raise ValueError(\"Invalid bit list length\")\n",
    "            data.append(bits)\n",
    "    return np.array(data, dtype=np.uint8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c67c5539-6edc-4cfa-8516-2d26bfca839f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the index/bit-list records from embeddings.txt in output_dir into a uint8 array, one row per record.\n",
    "data_embeddings = load_binary_file_np(os.path.join(output_dir, \"embeddings.txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "902ee687-1805-4348-82c2-71bb1d7e9ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "6053fbc0-ef14-4268-81a3-908cd2a0dd5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_pca(data, n_components=1024):\n",
    "    \"\"\"Fit a PCA on ``data`` and project ``data`` onto its components.\n",
    "\n",
    "    Args:\n",
    "        data: Array-like input passed to ``PCA.fit_transform``.\n",
    "        n_components: Value forwarded to ``PCA(n_components=...)``.\n",
    "            Defaults to 1024.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(transformed, pca)``: the projected data and the fitted\n",
    "        ``PCA`` instance.\n",
    "    \"\"\"\n",
    "    # Forward the caller's value; it was previously ignored in favour of a\n",
    "    # hard-coded 1024.\n",
    "    pca = PCA(n_components=n_components)\n",
    "    transformed = pca.fit_transform(data)\n",
    "    return transformed, pca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "012b9fb5-813c-4e75-b06f-b02d1ef1769c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast the loaded array to float32 before fitting; apply_pca returns (projected data, fitted PCA).\n",
    "data_embeddings_pca, _pca = apply_pca(data_embeddings.astype(np.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "0d60cfca-b0e1-49eb-b1f0-2faecc29e20e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rescale every row (axis=1) of the PCA output to unit L2 norm.\n",
    "normalized_embeddings = normalize(data_embeddings_pca, norm='l2', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "2612512c-3d9b-4cfe-8687-89bb86110a63",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "1f68ff20-261e-4ca5-a2a1-4ed164547852",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the normalized matrix to a float32 tensor and save it as normalized_embeddings_weights.pt in output_dir.\n",
    "tensor_normalized_embeddings_weights = torch.tensor(normalized_embeddings, dtype=torch.float32)\n",
    "torch.save(tensor_normalized_embeddings_weights, os.path.join(output_dir, \"normalized_embeddings_weights.pt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7dbd352-7152-41e7-aef9-345cc8c3a19f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "356e06bb-d60f-4f79-8791-6df41815b2d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "7928ec7f-a2db-4b06-a313-0981b494d483",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the tokenizer from the files in output_dir; the trailing expression displays its is_fast flag.\n",
    "tokenizer = AutoTokenizer.from_pretrained(output_dir) \n",
    "tokenizer.is_fast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "89d2ccbe-cbc4-499a-a72e-f93e3a2c6a17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode a short sample string; the resulting ids are printed and decoded back in the cells below.\n",
    "ids = tokenizer.encode('hello the test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "5ede5794-25fb-4d36-886e-904fea1797e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[104, 56035, 111, 32, 57811, 32, 116, 56117]\n"
     ]
    }
   ],
   "source": [
    "# Show the token ids produced for the sample string.\n",
    "print(ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "c0e2cdf1-65b3-43e9-82db-66488a305197",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello the test\n"
     ]
    }
   ],
   "source": [
    "# Decode the ids back to text to check that the sample string round-trips.\n",
    "print(tokenizer.decode(ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fa5746f-f4e6-4bcc-9c69-5ab1a32b5325",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3_11",
   "language": "python",
   "name": "python3_11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
