{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "114306db-240d-4551-bbc0-312ba048834d",
   "metadata": {},
   "source": [
    "# Prepare sentences and pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e28d767a-8e22-437c-84fc-dc0fa545156d",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Vectorizer\n",
      "Loading Data\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from gensim.models import FastText\n",
    "from scipy.stats import spearmanr\n",
    "import numpy as np\n",
    "import os\n",
    "import random\n",
    "random.seed(42)\n",
    "from time import time\n",
    "import pickle\n",
    "from collections import defaultdict\n",
    "from Tools import Tools\n",
    "from scipy.stats import spearmanr\n",
    "from contextlib import redirect_stdout\n",
    "from DirectoriesUtil import Dicrectories\n",
    "\n",
    "target_word_weight=defaultdict(list)\n",
    "target_similarity=defaultdict(list)\n",
    "\n",
    "def preprocess_text(text):\n",
    "    return text\n",
    "\n",
    "# Benchmark selection: files_start_name is the path prefix datasets/<name>/<name>\n",
    "# that is handed to the Tools dataset helpers later in the notebook.\n",
    "dataset_name = \"rg-65\"\n",
    "dataset_dir = os.path.join(\"datasets\", dataset_name)\n",
    "files_start_name = os.path.join(dataset_dir, dataset_name)\n",
    "\n",
    "print(\"Loading Vectorizer\")\n",
    "vectorizer_X = Tools.read_pickle_data(\"vectorizer_X.pickle\")\n",
    "# Ask the vectorizer for its feature names once and derive the count from that same array.\n",
    "feature_names = vectorizer_X.get_feature_names_out()\n",
    "number_of_features = feature_names.shape[0]\n",
    "print(\"Loading Data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "827fcd4f-933e-49c8-90fb-e274133c61fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = Tools.read_pickle_data(\"X.pickle\")\n",
    "# Turn every row into the list of feature names found at that row's stored column indices.\n",
    "sentences = [\n",
    "    [feature_names[col] for col in X_train[row].indices]\n",
    "    for row in range(X_train.shape[0])\n",
    "]\n",
    "print(sentences[0])\n",
    "# Persist the result so it can be reloaded without rebuilding it from X.pickle.\n",
    "with open('big_sentences.pickle', 'wb') as file:\n",
    "    pickle.dump(sentences, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b99a7c66-cf00-47f5-a0de-19d2f89358c2",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['apparent', 'close', 'control', 'day', 'disease', 'flu', 'open', 'school']\n"
     ]
    }
   ],
   "source": [
    "# Reload the sentences stored in big_sentences.pickle.\n",
    "# NOTE(review): pickle.load can execute arbitrary code; only load pickle files you created yourself.\n",
    "with open('big_sentences.pickle', 'rb') as pickled_sentences:\n",
    "    sentences = pickle.load(pickled_sentences)\n",
    "first_sentence = sentences[0]\n",
    "print(first_sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "416fc1c0-6c44-4e3b-9f81-5dc6d0bca2b5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset prepared and saved to GloVe/text8\n"
     ]
    }
   ],
   "source": [
    "# Corpus file for GloVe: one sentence per line, tokens separated by single spaces.\n",
    "output_file_path = \"glove/text8\"\n",
    "with open(output_file_path, 'w', encoding='utf-8') as corpus_file:\n",
    "    corpus_file.writelines(' '.join(tokens) + '\\n' for tokens in sentences)\n",
    "print(f\"Dataset prepared and saved to {output_file_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cddad26-396c-480a-a298-9f6189926095",
   "metadata": {},
   "source": [
    "## Train\n",
    "\n",
    "#### [GloVe GitHub Repository](https://github.com/stanfordnlp/GloVe)\n",
    "```bash\n",
    "# Navigate to the GloVe directory and compile the code\n",
    "cd glove && make\n",
    "# Run the demo script\n",
    "./demo.sh\n",
    "```\n",
    "#### Output is written to `vectors.txt`"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bdd7a8bd-4740-479a-812a-dc7085433898",
   "metadata": {},
   "source": [
    "### Extract similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a668040-f453-435e-9a9e-796d491ffcb7",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset words count:  48\n",
      "\n",
      "Using GloVe embeddings to calc similarity\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import numpy as np\n",
    "import os\n",
    "from time import time\n",
    "from collections import defaultdict\n",
    "from Tools import Tools\n",
    "from contextlib import redirect_stdout\n",
    "from DirectoriesUtil import Dicrectories\n",
    "from scipy.stats import spearmanr, kendalltau\n",
    "import pandas as pd\n",
    "\n",
    "# Dataset pairs; the scoring cell iterates them as (x, y) and looks x up in target_similarity.\n",
    "pair_list = Tools.get_dataset_pairs(files_start_name)\n",
    "\n",
    "# Target words whose embeddings are compared further down in this cell.\n",
    "dataset_targets = Tools.get_dataset_targets(files_start_name, vectorizer_X, pair_list)\n",
    "output_active, target_words = dataset_targets\n",
    "\n",
    "# Text-format vectors file read by load_glove_embeddings (one token and its vector per line).\n",
    "glove_file_path = 'glove/vectors.txt'\n",
    "\n",
    "# Load GloVe embeddings\n",
    "def load_glove_embeddings(glove_file_path):\n",
    "    embeddings_index = {}\n",
    "    with open(glove_file_path, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            values = line.split()\n",
    "            word = values[0]\n",
    "            coefs = np.asarray(values[1:], dtype='float32')\n",
    "            embeddings_index[word] = coefs\n",
    "    return embeddings_index\n",
    "\n",
    "# Load the GloVe embeddings\n",
    "embeddings_index = load_glove_embeddings(glove_file_path)\n",
    "if not embeddings_index:\n",
    "    # Fail with a clear message instead of a bare StopIteration from next() below.\n",
    "    raise ValueError(f\"No embeddings could be read from {glove_file_path}\")\n",
    "# Dimensionality is taken from the first stored vector.\n",
    "vector_size = len(next(iter(embeddings_index.values())))\n",
    "# Print words missing from embeddings\n",
    "missing_words = [word for word in target_words if word not in embeddings_index]\n",
    "if missing_words:\n",
    "    print(\"Missing words in GloVe embeddings:\", missing_words)\n",
    "\n",
    "print(\"\\nUsing GloVe embeddings to calc similarity\")\n",
    "# Zero-initialised so a word without an embedding keeps a well-defined all-zero row;\n",
    "# np.empty would leave that row holding arbitrary uninitialised values.\n",
    "profile = np.zeros((len(target_words), vector_size))\n",
    "for i, word in enumerate(target_words):\n",
    "    if word in embeddings_index:\n",
    "        profile[i, :] = embeddings_index[word]\n",
    "    else:\n",
    "        print(f\"Word {word} not found in GloVe embeddings.\")\n",
    "# Calculate cosine similarity between word vectors\n",
    "similarity = cosine_similarity(profile)\n",
    "# Map every ordered pair of distinct target words to its similarity. Pairs are indexed\n",
    "# directly rather than via a sort, so the result does not depend on a word ranking\n",
    "# first against itself (which fails for tied or all-zero rows).\n",
    "target_similarity = {}\n",
    "for i, first_word in enumerate(target_words):\n",
    "    for j, second_word in enumerate(target_words):\n",
    "        if i != j:\n",
    "            target_similarity[(first_word, second_word)] = similarity[i, j]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "65176441-a752-468e-83bf-ce975ed5130d",
   "metadata": {},
   "source": [
    "### Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "bdb3dc44-c398-4d03-8ef0-9e73435ad7a7",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "result_filepath = Dicrectories.test(dataset_name, \"glove_progress\")\n",
    "# Everything printed inside this block is redirected into the result file.\n",
    "with open(result_filepath, 'w') as file, redirect_stdout(file):\n",
    "    # Pair each dataset score with the GloVe similarity stored for the same key;\n",
    "    # entries of pair_list that have no key in target_similarity are skipped.\n",
    "    calculated_score = []\n",
    "    extracted_list = []\n",
    "    original_score = []\n",
    "    word_pairs = []\n",
    "    for (x, y) in pair_list:\n",
    "        if x in target_similarity:\n",
    "            # Scaled by 10 -- presumably to match the dataset's rating range; TODO confirm.\n",
    "            word1_prof = target_similarity[x] * 10\n",
    "            extracted_list.append((x, word1_prof))\n",
    "            calculated_score.append(word1_prof)\n",
    "            original_score.append(y)\n",
    "            word_pairs.append(x)\n",
    "    spearman_TM = spearmanr(original_score, calculated_score)\n",
    "    spearman_TM = round(spearman_TM[0], 3)\n",
    "    print(f'Spearman Glove: {spearman_TM}')\n",
    "\n",
    "    total_list = [original_score, calculated_score]\n",
    "\n",
    "    # Stored under its own name: rebinding `similarity` here would overwrite the\n",
    "    # word-by-word similarity matrix computed in the \"Extract similarity\" cell.\n",
    "    score_similarity = cosine_similarity(total_list)\n",
    "    print(f'Cosine Glove \\n{score_similarity}')\n",
    "\n",
    "    TM_corr = np.corrcoef(original_score, calculated_score)\n",
    "    print(f'Pearson Glove \\n{TM_corr}')\n",
    "\n",
    "    kendal_TM, _ = kendalltau(original_score, calculated_score)\n",
    "    print(f'Kendal Glove: {kendal_TM}')\n",
    "\n",
    "    # Same Pearson correlation again, this time as a labelled pandas table.\n",
    "    data = pd.DataFrame([original_score, calculated_score])\n",
    "    data = data.transpose()\n",
    "    data.columns = ['Original', 'TM']\n",
    "    correlation = data.corr()\n",
    "    print(\"Pearson Corr \\n\", correlation)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
