{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "114306db-240d-4551-bbc0-312ba048834d",
   "metadata": {},
   "source": [
    "# FastText with OneBillion"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e5f807b-08dd-491a-9cf9-e28f3d3c9531",
   "metadata": {},
   "source": [
    "## Prepare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e28d767a-8e22-437c-84fc-dc0fa545156d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading Vectorizer\n",
      "Loading Data\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pickle\n",
    "from Tools import Tools\n",
    "from DirectoriesUtil import Dicrectories\n",
    "\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Return ``text`` unchanged (identity placeholder; no preprocessing is applied).\"\"\"\n",
    "    unchanged = text\n",
    "    return unchanged\n",
    "\n",
    "# Evaluation dataset: its files live under datasets/NAME/ and the Score cell\n",
    "# hands the path stem datasets/NAME/NAME to the Tools dataset helpers.\n",
    "dataset_name = \"wordsim353-sim\"\n",
    "dataset_dir = os.path.join(\"datasets\", dataset_name)\n",
    "files_start_name = os.path.join(dataset_dir, dataset_name)\n",
    "\n",
    "print(\"Loading Vectorizer\")\n",
    "vectorizer_X = Tools.read_pickle_data(\"vectorizer_X.pickle\")\n",
    "# Fetch the vocabulary once and reuse it for the feature count instead of\n",
    "# calling get_feature_names_out() a second time.\n",
    "feature_names = vectorizer_X.get_feature_names_out()\n",
    "number_of_features = feature_names.shape[0]\n",
    "print(\"Loading Data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "827fcd4f-933e-49c8-90fb-e274133c61fb",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['us', 'centers', 'disease', 'control', 'prevention', 'initially', 'advised', 'school', 'systems', 'close', 'outbreaks', 'occurred', 'reversed', 'saying', 'apparent', 'virus', 'meant', 'schools', 'day', 'care', 'stay', 'open', 'even', 'confirmed', 'cases', 'swine', 'flu']\n"
     ]
    }
   ],
   "source": [
    "# Run either this cell (build the sentences and cache them on disk) or the\n",
    "# next cell (reload the cached sentences) -- not both.\n",
    "X_train = Tools.read_pickle_data(\"X.pickle\")\n",
    "# NOTE(review): assumes X_train is a SciPy sparse document-term matrix whose\n",
    "# columns line up with feature_names -- confirm against the notebook that built it.\n",
    "X_csr = X_train.tocsr()\n",
    "# Slice the shared CSR index buffer row by row instead of materialising a new\n",
    "# one-row sparse matrix with X_train[i] on every iteration. Each slice holds the\n",
    "# column indices stored for that row, which map straight to vocabulary words.\n",
    "sentences = [\n",
    "    feature_names[X_csr.indices[row_start:row_stop]].tolist()\n",
    "    for row_start, row_stop in zip(X_csr.indptr[:-1], X_csr.indptr[1:])\n",
    "]\n",
    "print(sentences[0])\n",
    "# Cache the result so later sessions can skip the rebuild.\n",
    "with open('big_sentences.pickle', 'wb') as file:\n",
    "    pickle.dump(sentences, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b99a7c66-cf00-47f5-a0de-19d2f89358c2",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['apparent', 'close', 'control', 'day', 'disease', 'flu', 'open', 'school']\n"
     ]
    }
   ],
   "source": [
    "# Alternative to the previous cell: reload the sentences it cached on disk.\n",
    "with open('big_sentences.pickle', 'rb') as cache_file:\n",
    "    sentences = pickle.load(cache_file)\n",
    "first_sentence = sentences[0]\n",
    "print(first_sentence)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db642fbf-3bc6-4c23-8f76-f841e49567ab",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56261b46-6da6-49e4-996c-e35c6e7c6f74",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "65\n",
      "Dataset words count:  48\n",
      "\n",
      "Training FastText model...\n"
     ]
    }
   ],
   "source": [
    "from time import time\n",
    "\n",
    "from gensim.models import FastText\n",
    "\n",
    "# Hyper-parameters\n",
    "vector_size = 100  # Size of the word vectors\n",
    "window = 5         # Context window size\n",
    "min_count = 1      # Minimum word count to include in the model\n",
    "epochs = 25        # Number of passes over the sentences\n",
    "\n",
    "# Train FastText model\n",
    "print(\"\\nTraining FastText model...\")\n",
    "\n",
    "# Time vocabulary building plus training: the Score cell passes `epoch_time`\n",
    "# to Tools.print_training_time and nothing else in this notebook defines it.\n",
    "# NOTE(review): assumes print_training_time expects elapsed seconds -- confirm.\n",
    "training_start = time()\n",
    "model = FastText(vector_size=vector_size, window=window, min_count=min_count)  # Initialize\n",
    "model.build_vocab(sentences)  # Build vocabulary\n",
    "model.train(sentences, total_examples=len(sentences), epochs=epochs)  # Train the model\n",
    "epoch_time = time() - training_start\n",
    "\n",
    "# Save the model for future use\n",
    "model.save(\"fasttext_model\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d655d24a-b3a9-44e6-bcab-4820954733ca",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "from gensim.models import FastText\n",
    "\n",
    "# Alternative to the Train cell: reload a previously saved model from disk.\n",
    "model = FastText.load(\"fasttext_model\")\n",
    "\n",
    "# Restore the settings the Score cell reads, so scoring also works on a fresh\n",
    "# kernel where the Train cell was skipped.\n",
    "vector_size = model.wv.vector_size\n",
    "if \"epochs\" not in globals():\n",
    "    # NOTE(review): assumes the saved model's `epochs` attribute holds the epoch\n",
    "    # count used for training -- confirm against the gensim version in use.\n",
    "    epochs = model.epochs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7a5f9ee-9938-4071-b02f-44b0605089c1",
   "metadata": {},
   "source": [
    "## Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d95a1738-1258-4704-9eae-5a99af22cbd0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "203\n",
      "Dataset words count:  277\n"
     ]
    }
   ],
   "source": [
    "from scipy.stats import kendalltau\n",
    "from scipy.stats import spearmanr\n",
    "import pandas as pd\n",
    "from contextlib import redirect_stdout\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from collections import defaultdict\n",
    "from time import time\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "# Load the evaluation pairs and the target words to embed.\n",
    "pair_list = Tools.get_dataset_pairs(files_start_name)\n",
    "print(len(pair_list))\n",
    "output_active, target_words = Tools.get_dataset_targets(files_start_name, vectorizer_X, pair_list)\n",
    "\n",
    "# Read the dimensionality from the model itself so this cell also works when\n",
    "# the model was loaded from disk and the Train cell was skipped.\n",
    "embedding_dim = model.wv.vector_size\n",
    "\n",
    "result_filepath = Dicrectories.test(dataset_name, \"fasttext\")\n",
    "# Everything printed inside this block is written to result_filepath.\n",
    "with open(result_filepath, 'w') as file, redirect_stdout(file):\n",
    "    print(\"\\nEvaluating FastText Model Over %d Epochs:\" % epochs)\n",
    "    print(\"No of features: %d\" % number_of_features)\n",
    "    # epoch_time only exists when a timing step ran in this kernel session;\n",
    "    # guard it so scoring a reloaded model does not fail with a NameError.\n",
    "    if \"epoch_time\" in globals():\n",
    "        Tools.print_training_time(epoch_time)\n",
    "    else:\n",
    "        print(\"Training time: not available\")\n",
    "\n",
    "    # One embedding row per target word; a row stays all-zero when the model\n",
    "    # reports that it cannot embed the word.\n",
    "    profile = np.zeros((len(target_words), embedding_dim))\n",
    "    for i, word in enumerate(target_words):\n",
    "        if word in model.wv:\n",
    "            profile[i, :] = model.wv[word]\n",
    "\n",
    "    # Cosine similarity for every ordered pair of distinct target words.\n",
    "    # The lookup table is created here because nothing else defines it, and the\n",
    "    # pairs are enumerated by index (i != j) rather than by skipping the top-ranked\n",
    "    # neighbour, which is not the word itself for all-zero or tied vectors.\n",
    "    similarity = cosine_similarity(profile)\n",
    "    target_similarity = {}\n",
    "    for i, first_word in enumerate(target_words):\n",
    "        for j, second_word in enumerate(target_words):\n",
    "            if i != j:\n",
    "                target_similarity[(first_word, second_word)] = similarity[i, j]\n",
    "\n",
    "    # NOTE(review): pair_list items are unpacked as (pair_key, gold_score) and\n",
    "    # pair_key must match the (word, word) tuple keys built above -- confirm\n",
    "    # against Tools.get_dataset_pairs.\n",
    "    calculated_score = []\n",
    "    extracted_list = []\n",
    "    original_score = []\n",
    "    word_pairs = []\n",
    "    for pair_key, gold_score in pair_list:\n",
    "        if pair_key in target_similarity:\n",
    "            # Scale by 10 (presumably to match the dataset's rating scale -- TODO confirm).\n",
    "            scaled_similarity = target_similarity[pair_key] * 10\n",
    "            extracted_list.append((pair_key, scaled_similarity))\n",
    "            calculated_score.append(scaled_similarity)\n",
    "            original_score.append(gold_score)\n",
    "            word_pairs.append(pair_key)\n",
    "\n",
    "    # Rank correlation between the dataset scores and the model scores.\n",
    "    spearman_fasttext = spearmanr(original_score, calculated_score)\n",
    "    spearman_fasttext = round(spearman_fasttext[0], 3)\n",
    "    print(f'Spearman FastText: {spearman_fasttext}')\n",
    "\n",
    "    # Cosine similarity between the two score vectors; kept under its own name\n",
    "    # so the word-by-word `similarity` matrix above is not overwritten.\n",
    "    score_cosine = cosine_similarity([original_score, calculated_score])\n",
    "    print(f'Cosine FastText \\n{score_cosine}')\n",
    "\n",
    "    fasttext_corr = np.corrcoef(original_score, calculated_score)\n",
    "    print(f'Pearson FastText \\n{fasttext_corr}')\n",
    "\n",
    "    kendal_fasttext, _ = kendalltau(original_score, calculated_score)\n",
    "    print(f'Kendal FastText: {kendal_fasttext}')\n",
    "\n",
    "    # Same Pearson correlation again, reported as a labelled pandas table.\n",
    "    data = pd.DataFrame({'Original': original_score, 'FastText': calculated_score})\n",
    "    correlation = data.corr()\n",
    "    print(\"Pearson Corr \\n\", correlation)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
