{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModel\n",
    "model_id = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir = '/workspace/CACHE/MODELS')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"./matches-sim-and-diff.txt\"\n",
    "with open(path, \"r\") as f:\n",
    "    eng_lines = f.readlines()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_text_to_data(text):\n",
    "\n",
    "    text = text.split(\"\\t\")\n",
    "    text1 = eval(text[0])\n",
    "    text1 = [text1[0], text1[1]]\n",
    "    text2 = eval(text[1])\n",
    "    text2 = [text2[0], text2[1]]\n",
    "    score = float(text[2])\n",
    "    score = text[2]\n",
    "\n",
    "    return [text1, text2, score]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "lines = eng_lines \n",
    "swap_map = [convert_text_to_data(line) for line in lines]\n",
    "old_vocab = tokenizer.get_vocab()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, (text1, text2, score) in enumerate(swap_map):\n",
    "    text1[1] = old_vocab[text1[0]]\n",
    "    text2[1] = old_vocab[text2[0]]\n",
    "    swap_map[i] = (text1, text2, score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "old_vocab = tokenizer.get_vocab()\n",
    "for token1, token2, score in swap_map:\n",
    "    tok1, idx1 = token1\n",
    "    tok2, idx2 = token2\n",
    "    old_vocab[tok1] = idx2\n",
    "    old_vocab[tok2] = idx1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json \n",
    "\n",
    "with open(\"/workspace/codes/AlienLM/tokenizers/original/8b-Instruct/tokenizer.json\", \"r\") as f:\n",
    "    tokenizer_json = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tok: Ġreproduced, tok_id: 2041\n"
     ]
    }
   ],
   "source": [
    "for tok, tok_id in old_vocab.items():\n",
    "    if tok_id == 2041:\n",
    "        print(f\"tok: {tok}, tok_id: {tok_id}\")\n",
    "    if tokenizer_json['model']['vocab'].get(tok) is not None:\n",
    "        tokenizer_json['model']['vocab'][tok] = tok_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/workspace/codes/AlienLMv2/alien_tokenizer/alien/full/tokenizer.json', 'w') as f:\n",
    "    json.dump(tokenizer_json, f, indent=4)\n",
    "new_tokenizer = AutoTokenizer.from_pretrained('/workspace/codes/AlienLMv2/alien_tokenizer/alien/full')\n",
    "original_tokenizer = AutoTokenizer.from_pretrained(model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original tokens: ['say', 'Ġmy', 'Ġname', '.', 'Ġthe', 'Ġdoom', \"'s\", 'Ġday', 'Ġis', 'Ġcoming']\n",
      "trained tokens: ['say', 'Ġmy', 'Ġname', '.', 'Ġthe', 'Ġdoom', \"'s\", 'Ġday', 'Ġis', 'Ġcoming']\n",
      "Original token ids: [37890, 856, 836, 13, 279, 59714, 596, 1938, 374, 5108]\n",
      "trained token ids: [112850, 1835, 97208, 11, 264, 8888, 2846, 79716, 42156, 45764]\n",
      "Decoded with original tokenizer:  sayesinde             nameLabel, a Thom'm Everyday                                                                           forthcoming\n"
     ]
    }
   ],
   "source": [
    "test_text = \"say my name. the doom's day is coming\"\n",
    "\n",
    "original_tokens = original_tokenizer.tokenize(test_text)\n",
    "randomized_tokens = new_tokenizer.tokenize(test_text)\n",
    "\n",
    "print(\"Original tokens:\", original_tokens)\n",
    "print(\"trained tokens:\", new_tokenizer.tokenize(test_text))\n",
    "\n",
    "print(\"Original token ids:\", original_tokenizer.convert_tokens_to_ids(original_tokens))\n",
    "print(\"trained token ids:\", new_tokenizer.convert_tokens_to_ids(new_tokenizer.tokenize(test_text)))\n",
    "\n",
    "print(\"Decoded with original tokenizer:\", original_tokenizer.decode(new_tokenizer.convert_tokens_to_ids(new_tokenizer.tokenize(test_text))))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
