{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c078adf3-b73f-4d4a-a90a-c39743f7c93a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "from datasets import load_dataset\n",
    "import unicodedata\n",
    "import tiktoken\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from datasets import get_dataset_config_names, load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8355cc14-553e-454e-bf34-377536fa2e24",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "175664d7-6fb8-425c-9398-2e67edd5219e",
   "metadata": {},
   "outputs": [],
   "source": [
    "languages =     [\"en\", \"ru\", \"fr\", \"de\", \"zh\", \"he\", \"it\", \"es\", \"ar\", \"pt\", \"ko\", \"hu\", \"sa\", \"la\", \"ja\", \"el\",     \"sv\", \"nl\", \"pl\", \"vi\", \"fa\", \"no\", \"tr\", \"fi\", \"cs\", \"hy\", \"da\", \"bn\", \"az\", \"ka\", \"hi\", \"id\"]\n",
    "\n",
    "lang_name_map = {\n",
    "    \"en\": \"English\",\n",
    "    \"ru\": \"Russian\",\n",
    "    \"fr\": \"French\",\n",
    "    \"de\": \"German\",\n",
    "    \"zh\": \"Chinese\",\n",
    "    \"he\": \"Hebrew\",\n",
    "    \"it\": \"Italian\",\n",
    "    \"es\": \"Spanish\",\n",
    "    \"ar\": \"Arabic\",\n",
    "    \"pt\": \"Portuguese\",\n",
    "    \"ko\": \"Korean\",\n",
    "    \"hu\": \"Hungarian\",\n",
    "    \"sa\": \"Sanskrit\",\n",
    "    \"la\": \"Latin\",\n",
    "    \"ja\": \"Japanese\",\n",
    "    \"el\": \"Greek\",\n",
    "\n",
    "    \"sv\": \"Swedish\",\n",
    "    \"nl\": \"Dutch\",\n",
    "    \"pl\": \"Polish\",\n",
    "    \"vi\": \"Vietnamese\",\n",
    "    \"fa\": \"Persian\",\n",
    "    \"no\": \"Norwegian\",\n",
    "    \"tr\": \"Turkish\",\n",
    "    \"fi\": \"Finnish\",\n",
    "    \"cs\": \"Czech\",\n",
    "    \"hy\": \"Armenian\",\n",
    "    \"da\": \"Danish\",\n",
    "    \"bn\": \"Bangla\",\n",
    "    \"az\": \"Azerbaijani\",\n",
    "    \"ka\": \"Georgian\",\n",
    "    \"hi\": \"Hindi\",\n",
    "    \"id\": \"Indonesian\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2d83e8a9-28d0-4b4c-a323-8464e839a2b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "wikipedia_dataset_id = \"wikimedia/wikipedia\"\n",
    "wikisource_dataset_id = \"wikimedia/wikisource\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "dc11ddc0-c349-4ed9-9c17-c698cb859407",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_list_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.writelines(f\"{item}\\n\" for item in data_list)\n",
    "\n",
    "def load_list_from_file(filename):\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        return [line.strip() for line in f]\n",
    "\n",
    "def save_results_to_file(data_list, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        f.write(str(data_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "454ba720-3d52-4370-ad9e-1969c935408b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "44aff98c-74ad-4ec0-9658-79bc85d03e6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "import re\n",
    "import itertools\n",
    "\n",
    "top_n = 1000  # how many n-gramms to extract\n",
    "ngram_range = [2,3,4,5] # -> 'n-gramms-2-3-4-5.txt'\n",
    "\n",
    "# ranges\n",
    "CJK_UNICODE_RANGES = [\n",
    "    ('\\u4e00', '\\u9fff'),  # CJK Unified Ideographs\n",
    "    ('\\u3040', '\\u309f'),  # Hiragana\n",
    "    ('\\u30a0', '\\u30ff'),  # Katakana\n",
    "    ('\\uac00', '\\ud7af'),  # Hangul Syllables\n",
    "]\n",
    "\n",
    "def is_cjk(char):\n",
    "    return any(start <= char <= end for start, end in CJK_UNICODE_RANGES)\n",
    "\n",
    "def preprocess(text, lang):\n",
    "    # filtering\n",
    "    text = text.replace(\"’\", \"'\").replace(\"‘\", \"'\").replace(\"`\", \"'\")\n",
    "    # numbers - to remove\n",
    "    text = re.sub(r'[0-9]', '', text)\n",
    "    # remove tabs and special symbols\n",
    "    text = re.sub(r'[\\t\\r\\f\\v]', ' ', text)\n",
    "    return text\n",
    "\n",
    "def extract_ngrams(text, lang, n):\n",
    "    tokens = []\n",
    "\n",
    "    if any(is_cjk(c) for c in text):\n",
    "        text = preprocess(text, lang)\n",
    "        # CJK - by symbols\n",
    "        chars = [c for c in text if not c.isspace()]\n",
    "        tokens = [''.join(chars[i:i+n]) for i in range(len(chars)-n+1)]\n",
    "    else:\n",
    "        # words\n",
    "        text = preprocess(text, lang)\n",
    "        # splitting\n",
    "        words = re.findall(r\"\\b[a-zA-Z\\u00C0-\\u017F\\u0400-\\u04FF']+\\b\", text)\n",
    "\n",
    "        for word in words:\n",
    "            # example: can't  →  ['can', \"'t\"] \n",
    "            if len(word) >= n:\n",
    "                chars = list(word)\n",
    "                ngrams = [''.join(chars[i:i+n]) for i in range(len(chars)-n+1)]\n",
    "                tokens.extend(ngrams)\n",
    "    return tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "473f77ee-0d90-4e1f-89d5-e9461ae89748",
   "metadata": {},
   "outputs": [],
   "source": [
    "top_pct = 0.01 # 1% of Wiki"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1856071f-59d7-4cf4-a774-237eab7d2a86",
   "metadata": {},
   "outputs": [],
   "source": [
    "configs_wikipedia = get_dataset_config_names(wikipedia_dataset_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ceb2b792-6ba8-4ba1-887f-525d047f1c23",
   "metadata": {},
   "outputs": [],
   "source": [
    "configs_wikisource = get_dataset_config_names(wikisource_dataset_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "49ac1692-3fe3-42d8-ad7d-af89fa2a2738",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a72305fd3c9b46b8838093a9f8790cf0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7f96b34088d74e378e9de6e5d34d315b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "en: top ngrams extracted.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3ca1ab5ce2004f7280030e261a27f6d3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6ee38e13b5d6415599df463ff8306141",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "14320713cc4c49578dbf921e233ff68a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "184c101405f34bdc815db6122f99b63d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ru: top ngrams extracted.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "95d56d36a80e4ac080d420f3cc936159",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "effd99820ef84109917efaa05dfbcc75",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fr: top ngrams extracted.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "96b170e8ee1a46589734cd678c4b3e63",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ac192f97880145c59d0d7845e47eebf4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "de: top ngrams extracted.\n",
      "zh: top ngrams extracted.\n",
      "he: top ngrams extracted.\n",
      "it: top ngrams extracted.\n",
      "es: top ngrams extracted.\n",
      "ar: top ngrams extracted.\n",
      "pt: top ngrams extracted.\n",
      "ko: top ngrams extracted.\n",
      "hu: top ngrams extracted.\n",
      "sa: top ngrams extracted.\n",
      "la: top ngrams extracted.\n",
      "ja: top ngrams extracted.\n",
      "el: top ngrams extracted.\n",
      "sv: top ngrams extracted.\n",
      "nl: top ngrams extracted.\n",
      "pl: top ngrams extracted.\n",
      "vi: top ngrams extracted.\n",
      "fa: top ngrams extracted.\n",
      "no: top ngrams extracted.\n",
      "tr: top ngrams extracted.\n",
      "fi: top ngrams extracted.\n",
      "cs: top ngrams extracted.\n",
      "hy: top ngrams extracted.\n",
      "da: top ngrams extracted.\n",
      "bn: top ngrams extracted.\n",
      "az: top ngrams extracted.\n",
      "ka: top ngrams extracted.\n",
      "hi: top ngrams extracted.\n",
      "id: top ngrams extracted.\n"
     ]
    }
   ],
   "source": [
    "results=[]\n",
    "\n",
    "for lang in languages:\n",
    "    try:\n",
    "        ngram_counters = {n: Counter() for n in ngram_range}\n",
    "\n",
    "        try:\n",
    "            configs = configs_wikipedia\n",
    "            latest_config = sorted([c for c in configs if c.endswith(f\".{lang}\")])[-1]\n",
    "            ds = load_dataset(wikipedia_dataset_id, latest_config, split=\"train\")\n",
    "            total_len = len(ds)\n",
    "            sample_size = int(total_len * top_pct)                  \n",
    "            ds_sub = ds.shuffle(seed=42).select(range(sample_size)) \n",
    "            texts = [x['text'] for x in ds_sub]\n",
    "        except Exception as e:\n",
    "            print(f\"Error in Wikipedia for {lang}: {e}\")\n",
    "            pass\n",
    "\n",
    "        if lang != 'ka': \n",
    "            try:\n",
    "                configs = configs_wikisource\n",
    "                latest_config = sorted([c for c in configs if c.endswith(f\".{lang}\")])[-1]\n",
    "                ds = load_dataset(wikisource_dataset_id, latest_config, split=\"train\")                \n",
    "                total_len = len(ds)\n",
    "                sample_size = int(total_len * top_pct)                  \n",
    "                ds_sub = ds.shuffle(seed=42).select(range(sample_size)) \n",
    "                texts = texts + [x['text'] for x in ds_sub]\n",
    "            except Exception as e:\n",
    "                print(f\"Error in Wikisource for {lang}: {e}\")\n",
    "                pass\n",
    "\n",
    "        for text in texts:\n",
    "            for n in ngram_range:\n",
    "                ngrams = extract_ngrams(text, lang, n)\n",
    "                ngram_counters[n].update(ngrams)\n",
    "        \n",
    "        top_ngrams_by_n = {n: ngram_counters[n].most_common(top_n) for n in ngram_range}\n",
    "        results.append({\n",
    "            'lang': lang,\n",
    "            'language_name': lang_name_map.get(lang, lang),\n",
    "            'top_ngrams': top_ngrams_by_n\n",
    "        })\n",
    "       \n",
    "        print(f\"{lang}: top ngrams extracted.\")\n",
    "        save_results_to_file(results, '_n-gramms-2-3-4-5.txt')\n",
    "    except Exception as e:\n",
    "        print(f\"Error for language '{lang}': {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0670eef-443e-4e3a-bba8-3333d2f8cf70",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3_11",
   "language": "python",
   "name": "python3_11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
