{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4853c15c-b74d-496f-b304-9d32b4339489",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-04-06 13:08:06.860258: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2024-04-06 13:08:07.747430: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 10:40:35) [GCC 12.3.0]\n",
      "30301029\n"
     ]
    }
   ],
   "source": [
    "# prepare dataset\n",
    "import numpy as np\n",
    "import keras\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import chi2\n",
    "from keras.datasets import imdb\n",
    "from time import time\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from scipy.sparse import csc_matrix, csr_matrix, dok_array\n",
    "import sys\n",
    "import pickle\n",
    "\n",
    "print(sys.version)\n",
    "min_frequency = 1\n",
    "\n",
    "f = open(\"train_v2.txt\", encoding='utf-8')\n",
    "lines = f.read().split(\"\\n\")\n",
    "f.close()\n",
    "sentences = []\n",
    "for sentence in lines:\n",
    "    sentences.append(sentence)\n",
    "print(len(sentences))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "023b3e0f-17cb-43b7-9339-eb9d1db6dc25",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset:       mturk-287 it has    499 words and    287 pairs\n",
      "Dataset:  wordsim353-sim it has    277 words and    203 pairs\n",
      "Dataset:       simlex999 it has   1028 words and    999 pairs\n",
      "Dataset:           rg-65 it has     48 words and     65 pairs\n",
      "Dataset:       mturk-771 it has   1113 words and    771 pairs\n",
      "Dataset:             men it has    754 words and   3000 pairs\n",
      "Total vocabulary has 2697 words\n"
     ]
    }
   ],
   "source": [
    "# prepare vocabulary by extract from all datasets target words\n",
    "import os\n",
    "from DirectoriesUtil import Dicrectories\n",
    "from Tools import Tools\n",
    "\n",
    "words_list = []\n",
    "for dataset_name in os.listdir(Dicrectories.datasets):\n",
    "    dataset_path = os.path.join(Dicrectories.datasets, dataset_name)\n",
    "    if os.path.isdir(dataset_path) and \".ipynb_checkpoints\" not in dataset_name:\n",
    "        files_start_name = os.path.join(dataset_path, dataset_name)\n",
    "        words = Tools.get_dataset_words(files_start_name)\n",
    "        pairs = Tools.get_dataset_pairs(files_start_name)\n",
    "        words_list.extend(words)\n",
    "        print(\"Dataset: %15s it has %6d words and %6d pairs\" % (dataset_name , len(words), len(pairs)))\n",
    "vocabulary = set(words_list)\n",
    "print(\"Total vocabulary has %d words\" % len(vocabulary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34126040-061d-4ec1-b386-b56f8c261d84",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /home/ahmedkk/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40000\n"
     ]
    }
   ],
   "source": [
    "# vectorize first the vacabulary and then the rest until max_words\n",
    "import re\n",
    "import string\n",
    "from nltk.corpus import stopwords\n",
    "import nltk\n",
    "nltk.download('stopwords')\n",
    "\n",
    "NUM_WORDS=40000\n",
    "\n",
    "def preprocess_text(text):\n",
    "    # Convert text to lowercase\n",
    "    text = text.lower()\n",
    "    # Remove punctuation\n",
    "    text = text.translate(str.maketrans('', '', string.punctuation))\n",
    "    # Remove numbers\n",
    "    text = re.sub(r'\\d+', '', text)\n",
    "    # Tokenize text\n",
    "    words = text.split()\n",
    "    # Remove stopwords\n",
    "    english_stopwords = set(stopwords.words('english'))\n",
    "    words = [word for word in words if word not in english_stopwords]\n",
    "    # Join words back into a processed text\n",
    "    processed_text = ' '.join(words)\n",
    "    return processed_text\n",
    "    \n",
    "vectorizer_X = CountVectorizer(preprocessor=preprocess_text,max_features=NUM_WORDS, binary=True)\n",
    "X = vectorizer_X.fit_transform(sentences)\n",
    "\n",
    "# Check the current vocabulary size\n",
    "current_vocab_size = len(vectorizer_X.vocabulary_)\n",
    "print(current_vocab_size)\n",
    "print(\"vectorising completed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2dc58000-95d9-40d1-9b55-0d42be7afca3",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true,
     "source_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "51\n",
      "vectorising completed\n"
     ]
    }
   ],
   "source": [
    "# Check the remaining words \n",
    "remaining_words = [word for word in vocabulary if word not in vectorizer_X.vocabulary_]\n",
    "print(len(remaining_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "614ba661-6a5a-439f-bdf5-0acb581c65cc",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving X completed\n"
     ]
    }
   ],
   "source": [
    "# save\n",
    "f_vectorizer_X = open(\"vectorizer_X.pickle\", \"wb\")\n",
    "pickle.dump(vectorizer_X, f_vectorizer_X, protocol=4)\n",
    "f_vectorizer_X.close()\n",
    "\n",
    "f_X = open(\"X.pickle\", \"wb\")\n",
    "pickle.dump(X, f_X, protocol=4)\n",
    "f_X.close()\n",
    "\n",
    "print(\"Saving X completed\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
