{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cuda\n",
      "Tokenizer vocabulary size: 30001\n",
      "GPT2 embedding size: 50257\n",
      "Additional tokens: 0\n",
      "Cosine similarity between the sentences: 0.7998372912406921\n",
      "Similarity between 'The quick brown fox jumps over the lazy dog.' and 'A fast auburn canine leaps above the indolent hound.': 0.8345\n",
      "Similarity between 'The quick brown fox jumps over the lazy dog.' and 'Python is a popular programming language.': 0.7662\n",
      "Similarity between 'The quick brown fox jumps over the lazy dog.' and 'Java is widely used in enterprise software development.': 0.8026\n",
      "Similarity between 'A fast auburn canine leaps above the indolent hound.' and 'Python is a popular programming language.': 0.7927\n",
      "Similarity between 'A fast auburn canine leaps above the indolent hound.' and 'Java is widely used in enterprise software development.': 0.8257\n",
      "Similarity between 'Python is a popular programming language.' and 'Java is widely used in enterprise software development.': 0.8039\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import torch\n",
    "from transformers import GPT2TokenizerFast, GPT2Model\n",
    "\n",
    "# GPU 사용 설정 (가능한 경우)\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "print(f\"Using device: {device}\")\n",
    "\n",
    "# 커스텀 토크나이저 로드\n",
    "tokenizer_path = '.'  # tokenizer.json 파일이 있는 디렉토리 경로\n",
    "tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)\n",
    "\n",
    "# GPT2 모델 로드 (임베딩 레이어만 사용)\n",
    "gpt2_model = GPT2Model.from_pretrained('gpt2')\n",
    "embedding_layer = gpt2_model.wte.to(device)\n",
    "\n",
    "# 디버깅을 위한 정보 출력\n",
    "print(f\"Tokenizer vocabulary size: {len(tokenizer.get_vocab())}\")\n",
    "print(f\"GPT2 embedding size: {gpt2_model.wte.num_embeddings}\")\n",
    "\n",
    "# 새로운 임베딩 로드\n",
    "new_embeddings_path = 'new_embeddings.pth'  # new_embeddings.pth 파일 경로\n",
    "\n",
    "# 새로운 임베딩의 크기 계산\n",
    "new_vocab_size = len(tokenizer.get_vocab())\n",
    "gpt2_vocab_size = gpt2_model.wte.num_embeddings\n",
    "additional_tokens = max(0, new_vocab_size - gpt2_vocab_size)\n",
    "\n",
    "print(f\"Additional tokens: {additional_tokens}\")\n",
    "\n",
    "new_embeddings = torch.nn.Embedding(additional_tokens, gpt2_model.wte.embedding_dim)\n",
    "new_embeddings.load_state_dict(torch.load(new_embeddings_path))\n",
    "new_embeddings = new_embeddings.to(device)\n",
    "\n",
    "# 문장 임베딩 함수\n",
    "def get_sentence_embedding(sentence):\n",
    "    tokens = tokenizer.encode(sentence, truncation=True, max_length=512)\n",
    "    token_embeddings = torch.cat([embedding_layer.weight, new_embeddings.weight])[tokens]\n",
    "    return token_embeddings.mean(dim=0)\n",
    "\n",
    "# 코사인 유사도 계산 함수\n",
    "def cosine_similarity(emb1, emb2):\n",
    "    return torch.cosine_similarity(emb1, emb2, dim=0).item()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cosine similarity between the sentences: 0.667129397392273\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# 사용 예시\n",
    "sentence1 = \"2/3\"\n",
    "sentence2 = \"\\frac{2}{3}\"\n",
    "\n",
    "emb1 = get_sentence_embedding(sentence1)\n",
    "emb2 = get_sentence_embedding(sentence2)\n",
    "\n",
    "similarity = cosine_similarity(emb1, emb2)\n",
    "print(f\"Cosine similarity between the sentences: {similarity}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cuda\n",
      "Tokenizer vocabulary size: 30001\n",
      "GPT2 embedding size: 50257\n",
      "Additional tokens: 0\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import torch\n",
    "from transformers import GPT2TokenizerFast, GPT2Model\n",
    "\n",
    "# GPU 사용 설정 (가능한 경우)\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "print(f\"Using device: {device}\")\n",
    "\n",
    "# 커스텀 토크나이저 로드\n",
    "tokenizer_path = '.'  # tokenizer.json 파일이 있는 디렉토리 경로\n",
    "tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path)\n",
    "\n",
    "# GPT2 모델 로드 (임베딩 레이어와 positional 임베딩 사용)\n",
    "gpt2_model = GPT2Model.from_pretrained('gpt2')\n",
    "embedding_layer = gpt2_model.wte.to(device)\n",
    "positional_embedding = gpt2_model.wpe.to(device)\n",
    "\n",
    "# 디버깅을 위한 정보 출력\n",
    "print(f\"Tokenizer vocabulary size: {len(tokenizer.get_vocab())}\")\n",
    "print(f\"GPT2 embedding size: {gpt2_model.wte.num_embeddings}\")\n",
    "\n",
    "# 새로운 임베딩 로드\n",
    "new_embeddings_path = 'new_embeddings.pth'  # new_embeddings.pth 파일 경로\n",
    "\n",
    "# 새로운 임베딩의 크기 계산\n",
    "new_vocab_size = len(tokenizer.get_vocab())\n",
    "gpt2_vocab_size = gpt2_model.wte.num_embeddings\n",
    "additional_tokens = max(0, new_vocab_size - gpt2_vocab_size)\n",
    "\n",
    "print(f\"Additional tokens: {additional_tokens}\")\n",
    "\n",
    "new_embeddings = torch.nn.Embedding(additional_tokens, gpt2_model.wte.embedding_dim)\n",
    "new_embeddings.load_state_dict(torch.load(new_embeddings_path))\n",
    "new_embeddings = new_embeddings.to(device)\n",
    "\n",
    "# 문장 임베딩 함수 (positional embedding 포함)\n",
    "def get_sentence_embedding(sentence):\n",
    "    tokens = tokenizer.encode(sentence, truncation=True, max_length=512)\n",
    "    token_ids = torch.tensor(tokens).unsqueeze(0).to(device)  # [1, seq_len]\n",
    "    \n",
    "    # 토큰 임베딩\n",
    "    token_embeddings = torch.cat([embedding_layer.weight, new_embeddings.weight])[token_ids]  # [1, seq_len, embed_dim]\n",
    "    \n",
    "    # Positional 임베딩\n",
    "    position_ids = torch.arange(0, token_ids.size(1)).unsqueeze(0).to(device)  # [1, seq_len]\n",
    "    pos_embeddings = positional_embedding(position_ids)  # [1, seq_len, embed_dim]\n",
    "    \n",
    "    # 토큰 임베딩과 positional 임베딩 결합\n",
    "    combined_embeddings = token_embeddings + pos_embeddings  # [1, seq_len, embed_dim]\n",
    "    \n",
    "    # 평균 계산\n",
    "    sentence_embedding = combined_embeddings.mean(dim=1).squeeze(0)  # [embed_dim]\n",
    "    \n",
    "    return sentence_embedding\n",
    "\n",
    "# 코사인 유사도 계산 함수\n",
    "def cosine_similarity(emb1, emb2):\n",
    "    return torch.cosine_similarity(emb1, emb2, dim=0).item()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cosine similarity between the sentences: 0.9905670285224915\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# 사용 예시\n",
    "sentence1 = r\"F(x)={\\sqrt{\\frac{10x^{-8}}{-8}}}+C_{1} =-{\\frac{5}{4x^{8}}}+C_{1}\\quad{\\mathrm {if~}}x<0.\"\n",
    "sentence2 = r\"F(x)=\\sqrt{10}x^{-8/-8}+C_{1}=-\\frac{5} {4}x^{8}+C_{1}\\quad\\mathrm{if~}x<0.\"\n",
    "\n",
    "\n",
    "emb1 = get_sentence_embedding(sentence1)\n",
    "emb2 = get_sentence_embedding(sentence2)\n",
    "\n",
    "similarity = cosine_similarity(emb1, emb2)\n",
    "print(f\"Cosine similarity between the sentences: {similarity}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([768])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "emb2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using device: cuda\n",
      "Loaded new embeddings with shape: torch.Size([0, 768])\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "from transformers import GPT2TokenizerFast, GPT2Model\n",
    "import re\n",
    "\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "print(f\"Using device: {device}\")\n",
    "\n",
    "# 토크나이저 및 모델 로드\n",
    "tokenizer = GPT2TokenizerFast.from_pretrained('.')\n",
    "gpt2_model = GPT2Model.from_pretrained('gpt2')\n",
    "embedding_layer = gpt2_model.wte.to(device)\n",
    "positional_embedding = gpt2_model.wpe.to(device)\n",
    "\n",
    "# 새로운 임베딩 로드 (필요한 경우)\n",
    "\n",
    "new_embeddings_state = torch.load('new_embeddings.pth')\n",
    "new_vocab_size, embedding_dim = new_embeddings_state['weight'].shape\n",
    "new_embeddings = torch.nn.Embedding(new_vocab_size, embedding_dim).to(device)\n",
    "new_embeddings.load_state_dict(new_embeddings_state)\n",
    "print(f\"Loaded new embeddings with shape: {new_embeddings.weight.shape}\")\n",
    "\n",
    "def spacing(text):\n",
    "    return re.sub(r'(?<!\\s)\\\\', r' \\\\', text)\n",
    "\n",
    "def get_token_embeddings(sentence):\n",
    "    sentence = spacing(sentence)\n",
    "    tokens = tokenizer.encode(sentence, truncation=True, max_length=512)\n",
    "    print(f\"Tokenized text: {tokens}\")\n",
    "    decoded_tokens = [tokenizer.decode([token]) for token in tokens]\n",
    "    print(f\"Decoded tokens: {decoded_tokens}\")\n",
    "    \n",
    "    token_ids = torch.tensor(tokens).unsqueeze(0).to(device)\n",
    "    positions = torch.arange(0, token_ids.size(1)).unsqueeze(0).to(device)\n",
    "    \n",
    "    if new_embeddings is not None:\n",
    "        token_embeddings = torch.cat([embedding_layer.weight, new_embeddings.weight])[token_ids]\n",
    "    else:\n",
    "        token_embeddings = embedding_layer(token_ids)\n",
    "    \n",
    "    pos_embeddings = positional_embedding(positions)\n",
    "    \n",
    "    return list(zip(token_embeddings[0], pos_embeddings[0]))\n",
    "\n",
    "def cosine_distance(emb1, emb2):\n",
    "    return 1 - torch.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()\n",
    "\n",
    "def token_distance(token1, token2, w_emb=0.7, w_pos=0.3):\n",
    "    emb1, pos1 = token1\n",
    "    emb2, pos2 = token2\n",
    "    emb_dist = cosine_distance(emb1, emb2)\n",
    "    pos_dist = torch.abs(pos1 - pos2).float().mean().item()\n",
    "    return w_emb * emb_dist + w_pos * pos_dist\n",
    "\n",
    "def n_gram_similarity(ref_tokens, pred_tokens, n, max_d=2.0):\n",
    "    ref_ngrams = [ref_tokens[i:i+n] for i in range(len(ref_tokens)-n+1)]\n",
    "    pred_ngrams = [pred_tokens[i:i+n] for i in range(len(pred_tokens)-n+1)]\n",
    "    \n",
    "    L_n = min(len(ref_ngrams), len(pred_ngrams))\n",
    "    if L_n == 0:\n",
    "        return 0\n",
    "    \n",
    "    total_distance = sum(\n",
    "        sum(token_distance(ref_token, pred_token) \n",
    "            for ref_token, pred_token in zip(ref_ngram, pred_ngram))\n",
    "        for ref_ngram, pred_ngram in zip(ref_ngrams[:L_n], pred_ngrams[:L_n])\n",
    "    )\n",
    "    \n",
    "    return 1 - (total_distance / (L_n * n * max_d))\n",
    "\n",
    "def texbleu(reference, prediction, max_n=4, weights=None):\n",
    "    if weights is None:\n",
    "        weights = [1/max_n] * max_n\n",
    "    \n",
    "    ref_tokens = get_token_embeddings(reference)\n",
    "    pred_tokens = get_token_embeddings(prediction)\n",
    "    \n",
    "    n_gram_scores = [n_gram_similarity(ref_tokens, pred_tokens, n) \n",
    "                     for n in range(1, max_n+1)]\n",
    "    \n",
    "    return np.exp(sum(w * np.log(max(s, 1e-10)) \n",
    "                      for w, s in zip(weights, n_gram_scores)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentence1 = r\"F(x)={\\sqrt{\\frac{10x^{-8}}{-8}}}+C_{1} =-{\\frac{5}{4x^{8}}}+C_{1}\\quad{\\mathrm {if~}}x<0.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'F(x)={ \\\\sqrt{ \\\\frac{10x^{-8}}{-8}}}+C_{1} =-{ \\\\frac{5}{4x^{8}}}+C_{1} \\\\quad{ \\\\mathrm {if~}}x<0.'"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spacing(sentence1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tokenized text: [65, 35, 115, 8680, 118, 87, 8364, 118, 87, 7911, 118, 8006, 115, 8193, 51, 7842, 8511, 51, 8461, 38, 8870, 118, 44, 120, 10796, 118, 87, 7911, 118, 48, 7869, 22121, 7862, 51, 8461, 38, 8870, 118, 44, 120, 87, 8318, 118, 87, 8195, 118, 8069, 121, 7842, 115, 55, 43, 41]\n",
      "Decoded tokens: ['F', '(', 'x', ')=', '{', '\\\\', 'sqrt', '{', '\\\\', 'frac', '{', '10', 'x', '^{-', '8', '}}', '{-', '8', '}}}', '+', 'C_', '{', '1', '}', '=-', '{', '\\\\', 'frac', '{', '5', '}{', '4x', '^{', '8', '}}}', '+', 'C_', '{', '1', '}', '\\\\', 'quad', '{', '\\\\', 'mathrm', '{', 'if', '~', '}}', 'x', '<', '0', '.']\n",
      "Tokenized text: [65, 35, 115, 8680, 87, 8364, 118, 8006, 120, 115, 8193, 51, 24915, 51, 9109, 8870, 118, 44, 16802, 87, 7911, 118, 48, 120, 118, 47, 120, 115, 7862, 51, 9109, 8870, 118, 44, 120, 87, 8318, 87, 8195, 118, 8069, 22545, 115, 55, 43, 41]\n",
      "Decoded tokens: ['F', '(', 'x', ')=', '\\\\', 'sqrt', '{', '10', '}', 'x', '^{-', '8', '/-', '8', '}+', 'C_', '{', '1', '}=-', '\\\\', 'frac', '{', '5', '}', '{', '4', '}', 'x', '^{', '8', '}+', 'C_', '{', '1', '}', '\\\\', 'quad', '\\\\', 'mathrm', '{', 'if', '~}', 'x', '<', '0', '.']\n",
      "Score between reference and prediction1: 0.7761\n"
     ]
    }
   ],
   "source": [
    "\n",
    "score1 = bleu_like_score(sentence1, sentence2)\n",
    "\n",
    "print(f\"Score between reference and prediction1: {score1:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
