{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "bffcd17f-63da-4bc7-a34a-bb67677a2b8c",
      "metadata": {
        "tags": [],
        "id": "bffcd17f-63da-4bc7-a34a-bb67677a2b8c"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "fac1ff80-bc4e-4b82-a2a8-c7b5bd566d7a",
      "metadata": {
        "tags": [],
        "id": "fac1ff80-bc4e-4b82-a2a8-c7b5bd566d7a"
      },
      "outputs": [],
      "source": [
        "# Labelled seed data from the first two iterations.\n",
        "iteration_1_data = pd.read_csv(\"Data/Low_and_Not_Low_Data.csv\")\n",
        "iteration_2_data = pd.read_csv(\"Data/Low_and_Not_Low_Data_Second_Itertation.csv\")\n",
        "\n",
        "# Stack both iterations with a fresh 0..n-1 index, then keep only the rows\n",
        "# whose Label is 1. The surviving rows keep their post-concat index labels.\n",
        "seed_data = (\n",
        "    pd.concat([iteration_1_data, iteration_2_data], ignore_index=True)\n",
        "    .loc[lambda frame: frame['Label'] == 1]\n",
        ")\n",
        "seed_data_texts = seed_data['Tokenized Text']\n",
        "\n",
        "# Collected texts to be compared against the seed data.\n",
        "collected_data = pd.read_csv('Data/high_confidence_texts_cleaned_07.csv')\n",
        "collected_data_texts = collected_data['Text']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8c92c258-3c0f-4e86-b82b-4de22b7fd7ac",
      "metadata": {
        "tags": [],
        "id": "8c92c258-3c0f-4e86-b82b-4de22b7fd7ac"
      },
      "outputs": [],
      "source": [
        "len(seed_data_texts)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1660754b-293f-4563-82d4-415935dc9eca",
      "metadata": {
        "tags": [],
        "id": "1660754b-293f-4563-82d4-415935dc9eca"
      },
      "outputs": [],
      "source": [
        "len(collected_data_texts)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "bf077d10-8b61-42f6-8c6a-7acb7370ac5a",
      "metadata": {
        "id": "bf077d10-8b61-42f6-8c6a-7acb7370ac5a"
      },
      "source": [
        "# **PCA of all data: seed data and data collected from the first filtration**"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "\n",
        "# Cap the vocabulary at 1000 terms so the dense matrices below stay small.\n",
        "vectorizer = TfidfVectorizer(max_features=1000)\n",
        "\n",
        "# The vocabulary and IDF weights are learned from the seed texts only;\n",
        "# the collected texts are then encoded with that same fitted vectorizer.\n",
        "seed_tfidf_sparse = vectorizer.fit_transform(seed_data_texts)\n",
        "collected_tfidf_sparse = vectorizer.transform(collected_data_texts)\n",
        "\n",
        "# Densify for the PCA step that follows.\n",
        "X_seed_data_texts = seed_tfidf_sparse.toarray()\n",
        "X_collected = collected_tfidf_sparse.toarray()"
      ],
      "metadata": {
        "id": "eNKxoiQPZ6lL"
      },
      "id": "eNKxoiQPZ6lL",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5c6dcf13-cdf3-4ce9-ba27-d8b5ff62e45a",
      "metadata": {
        "tags": [],
        "id": "5c6dcf13-cdf3-4ce9-ba27-d8b5ff62e45a"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from sklearn.decomposition import PCA\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# The TF-IDF matrices are already dense NumPy arrays; asarray keeps the\n",
        "# existing *_cpu names without making a second copy of each matrix.\n",
        "X_ground_truth_cpu = np.asarray(X_seed_data_texts)\n",
        "X_collected_cpu = np.asarray(X_collected)\n",
        "\n",
        "# Project onto two principal components. The components are fitted on the\n",
        "# seed (ground-truth) matrix only; the collected matrix is mapped into that\n",
        "# same space.\n",
        "# random_state is fixed because PCA's default svd_solver='auto' can select the\n",
        "# randomized solver, which would otherwise let the projection (and every\n",
        "# outlier mask and similarity count derived from it) change between runs.\n",
        "pca = PCA(n_components=2, random_state=42)\n",
        "X_ground_truth_pca = pca.fit_transform(X_ground_truth_cpu)\n",
        "X_collected_pca = pca.transform(X_collected_cpu)\n",
        "\n",
        "# Scatter both projections on shared axes; collected points are drawn first\n",
        "# so the ground-truth points sit on top.\n",
        "plt.figure(figsize=(10, 5))\n",
        "plt.scatter(X_collected_pca[:, 0], X_collected_pca[:, 1], color='red', label='Collected Data', alpha=0.5)\n",
        "plt.scatter(X_ground_truth_pca[:, 0], X_ground_truth_pca[:, 1], color='blue', label='Ground Truth', alpha=0.5)\n",
        "plt.title('PCA of Text Data')\n",
        "plt.xlabel('Principal component 1')\n",
        "plt.ylabel('Principal component 2')\n",
        "plt.legend()\n",
        "plt.savefig('PCA of Text Data.png', dpi=300, bbox_inches='tight')\n",
        "\n",
        "plt.show()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e1e99243-0524-401a-86df-f92b53487366",
      "metadata": {
        "id": "e1e99243-0524-401a-86df-f92b53487366"
      },
      "source": [
        "**Check for outliers in the collected data**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ac1c0188-456d-4602-94bc-d487eccdb9a7",
      "metadata": {
        "tags": [],
        "id": "ac1c0188-456d-4602-94bc-d487eccdb9a7"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "\n",
        "# IQR-based outlier filter on the 2-D PCA projection of the collected data.\n",
        "# Quartiles are computed per PCA component (axis=0).\n",
        "Q1 = np.percentile(X_collected_pca, 25, axis=0)\n",
        "Q3 = np.percentile(X_collected_pca, 75, axis=0)\n",
        "\n",
        "# Interquartile range of each component\n",
        "IQR = Q3 - Q1\n",
        "\n",
        "# Fences at 1.5 * IQR beyond the quartiles\n",
        "lower_bound = Q1 - 1.5 * IQR\n",
        "upper_bound = Q3 + 1.5 * IQR\n",
        "\n",
        "# A row is an outlier when either of its components lies outside the fences\n",
        "outliers_mask = np.any((X_collected_pca < lower_bound) | (X_collected_pca > upper_bound), axis=1)\n",
        "\n",
        "# Keep only the rows inside the fences\n",
        "X_collected_pca_no_outliers = X_collected_pca[~outliers_mask]\n",
        "\n",
        "# Report how many rows were dropped\n",
        "print(\"Original shape:\", X_collected_pca.shape)\n",
        "print(\"Shape after removing outliers:\", X_collected_pca_no_outliers.shape)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "collected_data_no_outliers = collected_data[~outliers_mask]\n"
      ],
      "metadata": {
        "id": "ZtscXeoHdBmy"
      },
      "id": "ZtscXeoHdBmy",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len(collected_data_no_outliers)"
      ],
      "metadata": {
        "id": "YNqKpQQUdDmS"
      },
      "id": "YNqKpQQUdDmS",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "collected_data_no_outliers.to_csv('Data/collected_data_no_outliers.csv', index=False)\n"
      ],
      "metadata": {
        "id": "AbzRg38wdMPw"
      },
      "id": "AbzRg38wdMPw",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "collected_data_no_outliers.head(1)"
      ],
      "metadata": {
        "id": "QVdw1zGqexz-"
      },
      "id": "QVdw1zGqexz-",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "body_data = collected_data_no_outliers[['Text']]\n",
        "\n",
        "# Save the 'Text' column as an Excel file\n",
        "body_data.to_excel('Data/Iteration_3_collected_data_no_outliers.xlsx', index=False) # from 5195 to 5034"
      ],
      "metadata": {
        "id": "Kqoz9TPZfsQ3"
      },
      "id": "Kqoz9TPZfsQ3",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Check for outliers in the Seed data**"
      ],
      "metadata": {
        "id": "6rPFCzF6agop"
      },
      "id": "6rPFCzF6agop"
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "\n",
        "# Per-component quartiles of the seed (ground-truth) PCA projection.\n",
        "Q1, Q3 = np.percentile(X_ground_truth_pca, [25, 75], axis=0)\n",
        "IQR = Q3 - Q1\n",
        "\n",
        "# Fences at 1.5 * IQR beyond the quartiles.\n",
        "lower_bound = Q1 - 1.5 * IQR\n",
        "upper_bound = Q3 + 1.5 * IQR\n",
        "\n",
        "# A row is flagged when either of its components falls outside the fences.\n",
        "# NOTE: this rebinds outliers_mask, which until now held the collected-data mask.\n",
        "below_fence = X_ground_truth_pca < lower_bound\n",
        "above_fence = X_ground_truth_pca > upper_bound\n",
        "outliers_mask = np.logical_or(below_fence, above_fence).any(axis=1)\n",
        "\n",
        "# Keep the rows that were not flagged.\n",
        "X_ground_truth_pca_no_outliers = X_ground_truth_pca[np.logical_not(outliers_mask)]\n",
        "\n",
        "print(\"Original shape:\", X_ground_truth_pca.shape)\n",
        "print(\"Shape after removing outliers:\", X_ground_truth_pca_no_outliers.shape)\n"
      ],
      "metadata": {
        "id": "RduzIeOTaf5V"
      },
      "id": "RduzIeOTaf5V",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Number of seed rows before outlier removal.\n",
        "len(seed_data)"
      ],
      "metadata": {
        "id": "ZGl603CAd6gp"
      },
      "id": "ZGl603CAd6gp",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "seed_data_no_outliers = seed_data[~outliers_mask]\n"
      ],
      "metadata": {
        "id": "Eoz-kMgDcUkd"
      },
      "id": "Eoz-kMgDcUkd",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "seed_data_no_outliers.to_csv('Data/iteration_3_seed_data_no_outliers.csv', index=False)\n"
      ],
      "metadata": {
        "id": "_fmyhMRDeVnC"
      },
      "id": "_fmyhMRDeVnC",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "seed_data_no_outliers.head(1)"
      ],
      "metadata": {
        "id": "Btkrvgi2fUnP"
      },
      "id": "Btkrvgi2fUnP",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "171 + 381"
      ],
      "metadata": {
        "id": "EozJlhIwesRC"
      },
      "id": "EozJlhIwesRC",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**Clustering after removing outliers**"
      ],
      "metadata": {
        "id": "iPYXWC73aJw7"
      },
      "id": "iPYXWC73aJw7"
    },
    {
      "cell_type": "code",
      "source": [
        "# Scatter both outlier-free PCA projections on shared axes.\n",
        "fig, ax = plt.subplots(figsize=(10, 5))\n",
        "ax.scatter(\n",
        "    X_collected_pca_no_outliers[:, 0], X_collected_pca_no_outliers[:, 1],\n",
        "    color='red', label='Collected Data', alpha=0.5,\n",
        ")\n",
        "ax.scatter(\n",
        "    X_ground_truth_pca_no_outliers[:, 0], X_ground_truth_pca_no_outliers[:, 1],\n",
        "    color='blue', label='Ground Truth', alpha=0.5,\n",
        ")\n",
        "ax.set_title('PCA of Text Data')\n",
        "ax.legend()\n",
        "\n",
        "# Write the figure to disk before showing it.\n",
        "fig.savefig('PCA of Text Data iteration 3 Without outliers.png', dpi=300, bbox_inches='tight')\n",
        "\n",
        "plt.show()\n"
      ],
      "metadata": {
        "id": "98GMTT_zaBAp"
      },
      "id": "98GMTT_zaBAp",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len(collected_data_no_outliers)"
      ],
      "metadata": {
        "id": "-8JMoUL8-qFK"
      },
      "id": "-8JMoUL8-qFK",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "\n",
        "# Step 1: scale every 2-D PCA point to unit length (row-wise L2 norm).\n",
        "# Despite the *_gpu suffix, these are plain NumPy arrays.\n",
        "collected_norms = np.linalg.norm(X_collected_pca_no_outliers, axis=1, keepdims=True)\n",
        "ground_truth_norms = np.linalg.norm(X_ground_truth_pca_no_outliers, axis=1, keepdims=True)\n",
        "X_combined_gpu = X_collected_pca_no_outliers / collected_norms\n",
        "X_ground_truth_gpu = X_ground_truth_pca_no_outliers / ground_truth_norms\n",
        "\n",
        "# Step 2: the dot product of unit vectors is their cosine similarity.\n",
        "# Rows index collected points, columns index ground-truth points.\n",
        "cos_sim = X_combined_gpu @ X_ground_truth_gpu.T\n",
        "print(cos_sim.shape)\n",
        "\n",
        "# Step 3: flag the pairs whose similarity is strictly above the threshold.\n",
        "threshold = 0.9\n",
        "similarity_mask = np.greater(cos_sim, threshold)\n",
        "print(similarity_mask.shape)\n",
        "\n",
        "# Step 4: integer form of the mask (1 = similar, 0 = not similar), so that a\n",
        "# row sum counts how many ground-truth points a collected point is similar to.\n",
        "binary_output = similarity_mask.astype(int)\n",
        "print(binary_output.shape)\n"
      ],
      "metadata": {
        "id": "ifIUxX7z-p7N"
      },
      "id": "ifIUxX7z-p7N",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "binary_output_summed = np.sum(binary_output, axis=1)"
      ],
      "metadata": {
        "id": "o37_30dJ_-ix"
      },
      "id": "o37_30dJ_-ix",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# max(binary_output_summed)"
      ],
      "metadata": {
        "id": "GP0mL3rCAN8f"
      },
      "id": "GP0mL3rCAN8f",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "np.sum(binary_output_summed >= 86)\n"
      ],
      "metadata": {
        "id": "ghQ34yc-AN5X"
      },
      "id": "ghQ34yc-AN5X",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "np.sum(binary_output_summed >= 85)"
      ],
      "metadata": {
        "id": "cvryGSYWAN2A"
      },
      "id": "cvryGSYWAN2A",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "np.sum(binary_output_summed >= 84)"
      ],
      "metadata": {
        "id": "FKK-n45iANus"
      },
      "id": "FKK-n45iANus",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Next, we computed the cosine similarity between the normalized vectors of the collected data and the outlier-free ground-truth data. A pair was counted as similar when its cosine similarity exceeded a threshold of 0.9, and each collected data point's similarity score is the number of ground-truth points it is similar to. This process revealed that 121 of the collected data points had a similarity score of at least 86, 260 had a score of at least 85, and 444 had a score of at least 84."
      ],
      "metadata": {
        "id": "yr1w3iVtlr8T"
      },
      "id": "yr1w3iVtlr8T"
    },
    {
      "cell_type": "code",
      "source": [
        "indexes = np.where(binary_output_summed >= 86)[0]"
      ],
      "metadata": {
        "id": "GAmREBNpnuTv"
      },
      "id": "GAmREBNpnuTv",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# `indexes` are row positions in the outlier-free arrays, so they must be applied\n",
        "# positionally to the outlier-free frame. Looking them up as labels on the\n",
        "# unfiltered collected_data_texts picks the wrong rows once any outlier was dropped.\n",
        "collected_data_tokenized = collected_data_no_outliers['Text'].iloc[indexes].tolist()"
      ],
      "metadata": {
        "id": "mRiwlUoGnuQn"
      },
      "id": "mRiwlUoGnuQn",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# `indexes` are row positions in the outlier-free arrays, so select positionally\n",
        "# from the outlier-free frame rather than by label from the unfiltered Series.\n",
        "# NOTE: this rebinds `collected_data` from the loaded DataFrame to a list of texts.\n",
        "collected_data = collected_data_no_outliers['Text'].iloc[indexes].tolist()"
      ],
      "metadata": {
        "id": "w0i6VvOcnuOI"
      },
      "id": "w0i6VvOcnuOI",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "len(collected_data)"
      ],
      "metadata": {
        "id": "k_AOeNxjnuJa"
      },
      "id": "k_AOeNxjnuJa",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "collected_data[0]"
      ],
      "metadata": {
        "id": "MFNPm9Seo6A8"
      },
      "id": "MFNPm9Seo6A8",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# One-column frame holding the selected texts, written out without the index.\n",
        "df = pd.DataFrame(collected_data, columns=['tokenized_text'])\n",
        "\n",
        "output_path = 'Data/iteration_3collected_data_86_no_outliers_tokenized.csv'\n",
        "df.to_csv(output_path, index=False)"
      ],
      "metadata": {
        "id": "r5jSrKjRnuGP"
      },
      "id": "r5jSrKjRnuGP",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "2khhJFkCbRpK"
      },
      "id": "2khhJFkCbRpK",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "myenv",
      "language": "python",
      "name": "myenv"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.14"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "collapsed_sections": [
        "TLVNfYyaa9du"
      ]
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 5
}