{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "caa0de87-fdf9-4e6c-8613-066975cad0bc",
   "metadata": {},
   "source": [
    "# Download dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4be141fb-c90d-40c2-b7c6-1dd90761e3e4",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import numpy as np\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "from tensorflow.keras.datasets import imdb\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "# Load the IMDB dataset\n",
    "(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=20000)\n",
    "\n",
    "# Decode back to text\n",
    "word_index = imdb.get_word_index()\n",
    "index_word = {index: word for word, index in word_index.items()}\n",
    "\n",
    "def decode_review(encoded_review):\n",
    "    return ' '.join([index_word.get(i - 3, '?') for i in encoded_review])\n",
    "\n",
    "# Convert integer sequences back to text\n",
    "X_train_text = [decode_review(review) for review in X_train]\n",
    "X_test_text = [decode_review(review) for review in X_test]\n",
    "\n",
    "# Tokenize the training data (split words)\n",
    "tokenized_train = [review.split() for review in (X_train_text + X_test_text)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a91dbf25-db8d-4a6f-8205-84b23b9eb009",
   "metadata": {},
   "source": [
    "# Train GloVe from original stanford repo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "5cb908d2-d8df-482b-a725-b88bc692ae34",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset prepared and saved to glove/text8\n"
     ]
    }
   ],
   "source": [
    "output_file_path = \"glove/text8\"\n",
    "with open(output_file_path, 'w', encoding='utf-8') as f:\n",
    "    for sentence in tokenized_train:\n",
    "        f.write(' '.join(sentence) + '\\n')\n",
    "print(f\"Dataset prepared and saved to {output_file_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39b17b29-c440-490e-a0fe-f2b61de564b1",
   "metadata": {},
   "source": [
    "#### [GloVe GitHub Repository](https://github.com/stanfordnlp/GloVe)\n",
    "```bash\n",
    "# Navigate to the GloVe directory and compile the code\n",
    "cd glove && make\n",
    "# Run the demo script\n",
    "./demo.sh\n",
    "```\n",
    "#### output is vectors.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cf625f02-a29f-465e-9629-088960d99ace",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "glove_file_path = 'glove/vectors.txt'\n",
    "\n",
    "# Load GloVe embeddings\n",
    "def load_glove_embeddings(glove_file_path):\n",
    "    embeddings_index = {}\n",
    "    with open(glove_file_path, 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            values = line.split()\n",
    "            word = values[0]\n",
    "            coefs = np.asarray(values[1:], dtype='float32')\n",
    "            embeddings_index[word] = coefs\n",
    "    return embeddings_index\n",
    "\n",
    "# Load the GloVe embeddings\n",
    "glove_vectors = load_glove_embeddings(glove_file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2a11a77-9136-4d76-ad11-acef1e51e809",
   "metadata": {},
   "source": [
    "# Or GloVe from cooccurrence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72dc0640-effc-4a24-b73c-c13f0f928a3e",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "from collections import Counter, defaultdict\n",
    "import math\n",
    "# Define the context window size\n",
    "window_size = 4\n",
    "\n",
    "# Build the co-occurrence matrix\n",
    "def build_cooccurrence_matrix(corpus, vocab_size, window_size=4):\n",
    "    cooccurrence_matrix = defaultdict(lambda: defaultdict(int))\n",
    "    for sentence in corpus:\n",
    "        for i, word in enumerate(sentence):\n",
    "            for j in range(max(0, i - window_size), min(len(sentence), i + window_size + 1)):\n",
    "                if i != j:\n",
    "                    cooccurrence_matrix[word][sentence[j]] += 1\n",
    "    return cooccurrence_matrix\n",
    "\n",
    "# Create vocabulary and word to id mapping\n",
    "all_words = [word for sublist in tokenized_train for word in sublist]\n",
    "word_counts = Counter(all_words)\n",
    "vocab = {word: i for i, (word, count) in enumerate(word_counts.items())}\n",
    "vocab_size = len(vocab)\n",
    "\n",
    "# Build the co-occurrence matrix\n",
    "cooccurrence_matrix = build_cooccurrence_matrix(tokenized_train, vocab_size, window_size)\n",
    "\n",
    "\n",
    "# Define hyperparameters\n",
    "embedding_dim = 100\n",
    "learning_rate = 0.05\n",
    "epochs = 50\n",
    "x_max = 100\n",
    "alpha = 0.75\n",
    "\n",
    "# Initialize word vectors and biases\n",
    "word_vectors = np.random.uniform(-0.5 / embedding_dim, 0.5 / embedding_dim, (vocab_size, embedding_dim))\n",
    "word_biases = np.zeros(vocab_size)\n",
    "\n",
    "# Training the GloVe model\n",
    "for epoch in range(epochs):\n",
    "    total_loss = 0\n",
    "    for word, context_dict in cooccurrence_matrix.items():\n",
    "        word_id = vocab[word]\n",
    "        for context_word, count in context_dict.items():\n",
    "            context_word_id = vocab[context_word]\n",
    "            weight = (count / x_max) ** alpha if count < x_max else 1\n",
    "            log_cooccurrence = math.log(1 + count)\n",
    "            word_vector = word_vectors[word_id]\n",
    "            context_vector = word_vectors[context_word_id]\n",
    "            bias_sum = word_biases[word_id] + word_biases[context_word_id]\n",
    "\n",
    "            # Compute the cost and gradients\n",
    "            cost = (np.dot(word_vector, context_vector) + bias_sum - log_cooccurrence)\n",
    "            total_loss += 0.5 * weight * cost**2\n",
    "\n",
    "            grad_main = weight * cost * context_vector\n",
    "            grad_context = weight * cost * word_vector\n",
    "            grad_bias_word = weight * cost\n",
    "            grad_bias_context = weight * cost\n",
    "\n",
    "            # Update word vectors and biases\n",
    "            word_vectors[word_id] -= learning_rate * grad_main\n",
    "            word_vectors[context_word_id] -= learning_rate * grad_context\n",
    "            word_biases[word_id] -= learning_rate * grad_bias_word\n",
    "            word_biases[context_word_id] -= learning_rate * grad_bias_context\n",
    "\n",
    "    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss}')\n",
    "\n",
    "# Save word vectors\n",
    "word_vectors_dict = {word: word_vectors[vocab[word]] for word in vocab}\n",
    "np.save('glove_word_vectors.npy', word_vectors_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a03f9c1-e0ba-4a92-95a1-4257c4c67dfe",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "# Assuming `word_vectors_dict` contains the GloVe embeddings (loaded from 'glove_word_vectors.npy')\n",
    "glove_vectors = np.load('glove_word_vectors.npy', allow_pickle=True).item()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60aa5259-4199-4c97-bff2-e49fbcb9f9bd",
   "metadata": {},
   "source": [
    "# Augment dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffdff3c2-a74d-42c9-9fba-70fc99dd126a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from numpy import dot\n",
    "from numpy.linalg import norm\n",
    "\n",
    "def cosine_similarity(vec1, vec2):\n",
    "    return dot(vec1, vec2) / (norm(vec1) * norm(vec2))\n",
    "\n",
    "def most_similar_glove(word, glove_vectors, topn=5):\n",
    "    if word not in glove_vectors:\n",
    "        return []\n",
    "    word_vector = glove_vectors[word]\n",
    "    similarities = {}\n",
    "    for other_word, other_vector in glove_vectors.items():\n",
    "        if other_word != word:\n",
    "            similarities[other_word] = cosine_similarity(word_vector, other_vector)\n",
    "    # Sort by similarity score\n",
    "    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)\n",
    "    # Most similar words (highest cosine similarity)\n",
    "    most_similar_words = sorted_similarities[:topn]\n",
    "    # Least similar words (lowest cosine similarity)\n",
    "    most_dissimilar_words = sorted_similarities[-topn:]\n",
    "    return most_similar_words, most_dissimilar_words\n",
    "\n",
    "import random\n",
    "\n",
    "def augment_document_glove(doc, label, glove_vectors, percent=5):\n",
    "    tokens = doc.split()\n",
    "    num_words_to_change = len(tokens) * (percent / 100)\n",
    "    words_changed = 0\n",
    "    indices_to_change = set(random.sample(range(len(tokens)), int(num_words_to_change)))\n",
    "  \n",
    "    new_tokens = []\n",
    "    for i, word in enumerate(tokens):\n",
    "        if i in indices_to_change and word in glove_vectors:\n",
    "            if label == 1:\n",
    "                similar_words, _ = most_similar_glove(word, glove_vectors, topn=5)\n",
    "                if similar_words:\n",
    "                    chosen_word = random.choice(similar_words)[0]  # Choose one similar word randomly\n",
    "                else:\n",
    "                    chosen_word = word\n",
    "            else:\n",
    "                _, dissimilar_words = most_similar_glove(word, glove_vectors, topn=5)\n",
    "                if dissimilar_words:\n",
    "                    chosen_word = random.choice(dissimilar_words)[0]\n",
    "                else:\n",
    "                    chosen_word = word\n",
    "\n",
    "\n",
    "            new_tokens.append(chosen_word)\n",
    "            words_changed += 1\n",
    "        else:\n",
    "            new_tokens.append(word)\n",
    "  \n",
    "        if words_changed >= num_words_to_change:\n",
    "            break\n",
    "  \n",
    "    return ' '.join(new_tokens)\n",
    "\n",
    "percent_to_change = 5\n",
    "X_train_augmented = []\n",
    "\n",
    "for r, doc in enumerate(tqdm(X_train_text)):\n",
    "    augmented_doc = augment_document_glove(doc, y_train[r], glove_vectors, percent_to_change)\n",
    "    X_train_augmented.append(augmented_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9af9c6e0-b248-4239-9771-356d7209ab62",
   "metadata": {},
   "source": [
    "# Classify dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a4bff02e-d669-4d08-adab-48426af1f7a5",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Use CountVectorizer to convert text to vectors\n",
    "vectorizer = CountVectorizer()\n",
    "X_train_vec = vectorizer.fit_transform(X_train_augmented)\n",
    "X_test_vec = vectorizer.transform(X_test_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7460ce85-15a2-4a56-b53f-a4a71cf2897d",
   "metadata": {},
   "source": [
    "### RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c456a7b-666b-4c2e-a1ea-1e33da666f95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the classifier on augmented data\n",
    "classifier = RandomForestClassifier(n_estimators=100, random_state=42)\n",
    "classifier.fit(X_train_vec, y_train)\n",
    "\n",
    "# Predict and evaluate on original test data\n",
    "y_pred = classifier.predict(X_test_vec)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Test Accuracy: {accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f73e435c-a864-4d1d-b105-283f39a8dfbc",
   "metadata": {},
   "source": [
    "### LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2727f318-f2c4-4c93-9616-585b4d956908",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "classifier = LogisticRegression(random_state=42)\n",
    "classifier.fit(X_train_vec, y_train)\n",
    "\n",
    "y_pred = classifier.predict(X_test_vec)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Test Accuracy (Logistic Regression): {accuracy:.4f}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "457413ed-dbb9-4a8f-bbe0-44cf299bc4d3",
   "metadata": {},
   "source": [
    "### Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c544a8d0-ce68-49cb-8c12-4c3e7b2e62bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "classifier = MultinomialNB()\n",
    "classifier.fit(X_train_vec, y_train)\n",
    "\n",
    "y_pred = classifier.predict(X_test_vec)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Test Accuracy (Naive Bayes): {accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7588d57-0c05-46ce-bef6-a97b99d8e61d",
   "metadata": {},
   "source": [
    "### SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "449b7dec-aeac-4419-b3c0-d121c8cee59f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "classifier = LinearSVC(random_state=42)\n",
    "classifier.fit(X_train_vec, y_train)\n",
    "\n",
    "y_pred = classifier.predict(X_test_vec)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Test Accuracy (SVM): {accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e90fede3-56c9-4ffa-ba88-3d4bc061ac0a",
   "metadata": {},
   "source": [
    "### MLP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b7ed7b3-6a3a-4a97-ad67-841f300ed52d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neural_network import MLPClassifier\n",
    "\n",
    "classifier = MLPClassifier(random_state=42)\n",
    "classifier.fit(X_train_vec, y_train)\n",
    "\n",
    "y_pred = classifier.predict(X_test_vec)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Test Accuracy (MLP): {accuracy:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "37cc8102-219a-4746-999d-8277296bc999",
   "metadata": {},
   "source": [
    "### TM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87999304-113e-4d01-be9c-390793513d95",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from tmu.models.classification.vanilla_classifier import TMClassifier\n",
    "import numpy as np\n",
    "import pickle\n",
    "\n",
    "# Convert labels to one-hot encoding for Tsetlin Machine\n",
    "X_train_tm = np.array(X_train_vec.toarray(), dtype=np.uint32)\n",
    "Y_train_tm = y_train.astype(np.uint32)\n",
    "\n",
    "X_test_tm = np.array(X_test_vec.toarray(), dtype=np.uint32)\n",
    "Y_test_tm = y_test.astype(np.uint32)\n",
    "\n",
    "num_clauses = 1000\n",
    "T = 8000\n",
    "s = 2.0\n",
    "device = \"CPU\"\n",
    "weighted_clauses = True\n",
    "epochs = 10\n",
    "clause_drop_p = 0.75\n",
    "\n",
    "print(\"started\")\n",
    "tm = TMClassifier(num_clauses, T, s, platform=device, weighted_clauses=weighted_clauses,clause_drop_p=clause_drop_p)\n",
    "for epoch in range(epochs):\n",
    "    tm.fit(X_train_tm, Y_train_tm)\n",
    "    result = 100 * (tm.predict(X_test_tm) == Y_test_tm).mean()\n",
    "    print(f\"Accuracy: {result:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89ea78c9-9f21-4ce9-a2ec-9c36b4c37e1b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
