{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cf4a67a4-d540-4252-8983-dbf72a21fa8e",
      "metadata": {
        "tags": [],
        "id": "cf4a67a4-d540-4252-8983-dbf72a21fa8e"
      },
      "outputs": [],
      "source": [
        "import pandas as pd"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "746aa3d6-c5ad-46f8-912d-b7d6acac15c5",
      "metadata": {
        "tags": [],
        "id": "746aa3d6-c5ad-46f8-912d-b7d6acac15c5"
      },
      "outputs": [],
      "source": [
        "# Baseline dataset: the first-iteration seed file.\n",
        "original_data = pd.read_csv(\n",
        "    'Data/first_iteration_seed_analyzed_data_updated_background.csv'\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ca0caeec-068f-43ae-b978-4fdd68aeee81",
      "metadata": {
        "tags": [],
        "id": "ca0caeec-068f-43ae-b978-4fdd68aeee81"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "# CSV files of the later iterations, in the order they are numbered below\n",
        "file_paths = [\n",
        "    'Data/second_iteration_seed_analyzed_data_updated_background.csv',\n",
        "    'Data/third_iteration_seed_analyzed_data_updated_background.csv',\n",
        "    'Data/third_iteration_result_analyzed_data_updated_background.csv'\n",
        "]\n",
        "\n",
        "# Iteration number stamped on the rows of the file at the same position\n",
        "iterations = [1, 2, 3]\n",
        "\n",
        "# Read each file and tag its rows with the matching iteration number\n",
        "dataframes = [\n",
        "    pd.read_csv(path).assign(iteration=number)\n",
        "    for path, number in zip(file_paths, iterations)\n",
        "]\n",
        "\n",
        "# Stack everything into one frame with a fresh 0..n-1 index\n",
        "concatenated_df = pd.concat(dataframes, ignore_index=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "05238f46-f56b-4812-95db-37fd9af4e8e3",
      "metadata": {
        "tags": [],
        "id": "05238f46-f56b-4812-95db-37fd9af4e8e3"
      },
      "outputs": [],
      "source": [
        "import ast\n",
        "\n",
        "def safe_literal_eval(value):\n",
        "    if isinstance(value, str):\n",
        "        try:\n",
        "            # Try to evaluate the string to a Python literal (e.g., list)\n",
        "            return ast.literal_eval(value)\n",
        "        except (ValueError, SyntaxError):\n",
        "            # If it's an invalid string or has \"No struggles found\", return an empty list\n",
        "            if value == \"No struggles found\":\n",
        "                return []  # Empty list for \"No struggles found\"\n",
        "            else:\n",
        "                return []  # Empty list for any other invalid string\n",
        "    else:\n",
        "        return value  # If it's already a list or not a string, return it unchanged\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "84dbb3c6-7f3b-4084-8722-3ee1e88fed3d",
      "metadata": {
        "tags": [],
        "id": "84dbb3c6-7f3b-4084-8722-3ee1e88fed3d"
      },
      "outputs": [],
      "source": [
        "# import ast\n",
        "\n",
        "# original_data['Background'] = original_data['Background'].apply(lambda x: ast.literal_eval(x))\n",
        "# concatenated_df[['Background']] = concatenated_df['Background'].apply(lambda x: ast.literal_eval(x))\n",
        "# original_flattened_background = [item for sublist in original_data['Background'].tolist() for item in sublist]\n",
        "# # original_flattened_background_set = set(original_flattened_background)\n",
        "\n",
        "# concatenated_flattened_background = [item for sublist in concatenated_df['Background'].tolist() for item in sublist]\n",
        "# concatenated_flattened_background_set = set(concatenated_flattened_background)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a794b294-d37f-4c8f-a4b1-b41b932b4e6c",
      "metadata": {
        "tags": [],
        "id": "a794b294-d37f-4c8f-a4b1-b41b932b4e6c"
      },
      "outputs": [],
      "source": [
        "# Parse the stringified lists in 'Background' into Python objects.\n",
        "original_data['Background'] = original_data['Background'].apply(safe_literal_eval)\n",
        "\n",
        "# Flatten the per-row lists into one list. Missing cells (NaN) are dropped first,\n",
        "# as the 'Solutions' cell already does, because a float NaN is not iterable.\n",
        "flattened_background = [\n",
        "    item\n",
        "    for sublist in original_data['Background'].dropna().tolist()\n",
        "    for item in sublist\n",
        "]\n",
        "flattened_background_set = set(flattened_background)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7280dd32-f6ed-4a47-9c3e-3f98ff0befa3",
      "metadata": {
        "tags": [],
        "id": "7280dd32-f6ed-4a47-9c3e-3f98ff0befa3"
      },
      "outputs": [],
      "source": [
        "# Parse the stringified lists in 'Background' into Python objects.\n",
        "concatenated_df['Background'] = concatenated_df['Background'].apply(safe_literal_eval)\n",
        "\n",
        "# Flatten the per-row lists into one list. Missing cells (NaN) are dropped first,\n",
        "# as the 'Solutions' cell already does, because a float NaN is not iterable.\n",
        "collected_flattened_background = [\n",
        "    item\n",
        "    for sublist in concatenated_df['Background'].dropna().tolist()\n",
        "    for item in sublist\n",
        "]\n",
        "collected_flattened_background_set = set(collected_flattened_background)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "43e6e043-86a5-4ab1-94f4-6b689e5b7067",
      "metadata": {
        "tags": [],
        "id": "43e6e043-86a5-4ab1-94f4-6b689e5b7067"
      },
      "outputs": [],
      "source": [
        "# Distinct background entries: original data, then collected data\n",
        "print(f\"{len(flattened_background_set)} {len(collected_flattened_background_set)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "894c9575-6bd1-48a2-a600-64357bf22222",
      "metadata": {
        "tags": [],
        "id": "894c9575-6bd1-48a2-a600-64357bf22222"
      },
      "outputs": [],
      "source": [
        "# collected_flattened_background_set"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e1a0e770-008c-419e-827a-9ee5e8c73dd0",
      "metadata": {
        "tags": [],
        "id": "e1a0e770-008c-419e-827a-9ee5e8c73dd0"
      },
      "outputs": [],
      "source": [
        "# Parse the stringified lists in 'Struggles' for both frames.\n",
        "original_data['Struggles'] = original_data['Struggles'].apply(safe_literal_eval)\n",
        "concatenated_df['Struggles'] = concatenated_df['Struggles'].apply(safe_literal_eval)\n",
        "\n",
        "# Flatten the per-row lists. Missing cells (NaN) are dropped first, as the\n",
        "# 'Solutions' cell already does, because a float NaN is not iterable.\n",
        "flattened_struggles = [\n",
        "    item\n",
        "    for sublist in original_data['Struggles'].dropna().tolist()\n",
        "    for item in sublist\n",
        "]\n",
        "flattened_struggles_set = set(flattened_struggles)\n",
        "\n",
        "collected_flattened_struggles = [\n",
        "    item\n",
        "    for sublist in concatenated_df['Struggles'].dropna().tolist()\n",
        "    for item in sublist\n",
        "]\n",
        "collected_flattened_struggles_set = set(collected_flattened_struggles)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "610ebcdd-b6d3-4a14-ba8b-67896239abe4",
      "metadata": {
        "tags": [],
        "id": "610ebcdd-b6d3-4a14-ba8b-67896239abe4"
      },
      "outputs": [],
      "source": [
        "# Distinct struggle entries: original data, then collected data\n",
        "print(f\"{len(flattened_struggles_set)} {len(collected_flattened_struggles_set)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5ffa68b1-4c28-4843-aabf-1f8e3483f68d",
      "metadata": {
        "tags": [],
        "id": "5ffa68b1-4c28-4843-aabf-1f8e3483f68d"
      },
      "outputs": [],
      "source": [
        "# Original data: parse 'Solutions', skip missing rows, then flatten the per-row lists\n",
        "original_data['Solutions'] = original_data['Solutions'].apply(safe_literal_eval)\n",
        "flattened_solutions = []\n",
        "for solution_list in original_data['Solutions'].dropna().tolist():\n",
        "    flattened_solutions.extend(solution_list)\n",
        "flattened_solutions_set = set(flattened_solutions)\n",
        "\n",
        "# Collected data: same steps\n",
        "concatenated_df['Solutions'] = concatenated_df['Solutions'].apply(safe_literal_eval)\n",
        "collected_flattened_solutions = []\n",
        "for solution_list in concatenated_df['Solutions'].dropna().tolist():\n",
        "    collected_flattened_solutions.extend(solution_list)\n",
        "collected_flattened_solutions_set = set(collected_flattened_solutions)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0ad43396-915b-48d9-b00c-2967798ad1d5",
      "metadata": {
        "tags": [],
        "id": "0ad43396-915b-48d9-b00c-2967798ad1d5"
      },
      "outputs": [],
      "source": [
        "# Distinct solution entries: original data, then collected data\n",
        "print(f\"{len(flattened_solutions_set)} {len(collected_flattened_solutions_set)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "419c53dc-69c6-4cb6-a327-cddf1842b6d6",
      "metadata": {
        "tags": [],
        "id": "419c53dc-69c6-4cb6-a327-cddf1842b6d6"
      },
      "outputs": [],
      "source": [
        "# collected_flattened_solutions_set"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "372fcd11-4a13-402a-8f3f-aca494d6c1ad",
      "metadata": {
        "id": "372fcd11-4a13-402a-8f3f-aca494d6c1ad"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b24d6694-4bd3-4e7f-9f5d-bdd9dec5f4c7",
      "metadata": {
        "id": "b24d6694-4bd3-4e7f-9f5d-bdd9dec5f4c7"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "049bbaed-4545-4e67-886a-a348b1796231",
      "metadata": {
        "tags": [],
        "id": "049bbaed-4545-4e67-886a-a348b1796231"
      },
      "outputs": [],
      "source": [
        "# %pip (unlike !pip3) installs into the environment of the running kernel.\n",
        "%pip install --quiet sentence-transformers scikit-learn matplotlib\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6a80e8ac-fefd-44d7-8cfc-a29583d49229",
      "metadata": {
        "tags": [],
        "id": "6a80e8ac-fefd-44d7-8cfc-a29583d49229"
      },
      "outputs": [],
      "source": [
        "from sentence_transformers import SentenceTransformer\n",
        "from sklearn.cluster import KMeans\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from sklearn.decomposition import PCA\n",
        "import torch\n",
        "from sentence_transformers import SentenceTransformer\n",
        "\n",
        "# Check if a GPU is available and set the device accordingly\n",
        "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
        "# Load a pre-trained sentence transformer model\n",
        "model = SentenceTransformer('all-MiniLM-L6-v2', device=device)\n",
        "\n",
        "# Convert the sets to sorted lists. The iteration order of a set of strings changes\n",
        "# between kernel sessions (hash randomization), which would reshuffle the rows fed to\n",
        "# KMeans and with them the cluster ids; sorting keeps the row order stable across runs.\n",
        "# Assumes the entries are mutually comparable (e.g. all strings) - TODO confirm.\n",
        "flattened_list = sorted(flattened_background_set)\n",
        "collected_list = sorted(collected_flattened_background_set)\n",
        "\n",
        "# Generate sentence embeddings for both sets\n",
        "flattened_embeddings = model.encode(flattened_list)\n",
        "collected_embeddings = model.encode(collected_list)\n",
        "\n",
        "# Cluster each set with its own KMeans model\n",
        "num_clusters = 5  # Choose the number of clusters (can be adjusted)\n",
        "kmeans_flattened = KMeans(n_clusters=num_clusters, random_state=0)\n",
        "flattened_labels = kmeans_flattened.fit_predict(flattened_embeddings)\n",
        "\n",
        "kmeans_collected = KMeans(n_clusters=num_clusters, random_state=0)\n",
        "collected_labels = kmeans_collected.fit_predict(collected_embeddings)\n",
        "\n",
        "# Function to print cluster sentences\n",
        "def print_cluster_sentences(sentences, labels, cluster_id):\n",
        "    print(f\"Cluster {cluster_id} Sentences:\")\n",
        "    for i, label in enumerate(labels):\n",
        "        if label == cluster_id:\n",
        "            print(f\"- {sentences[i]}\")\n",
        "\n",
        "# Example: print the sentences of cluster 0, first for the flattened set, then for the collected set\n",
        "for sentence_pool, pool_labels in (\n",
        "    (flattened_list, flattened_labels),\n",
        "    (collected_list, collected_labels),\n",
        "):\n",
        "    print_cluster_sentences(sentence_pool, pool_labels, 0)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "afa5376a-64d7-48c3-a152-93715af9b491",
      "metadata": {
        "tags": [],
        "id": "afa5376a-64d7-48c3-a152-93715af9b491"
      },
      "outputs": [],
      "source": [
        "from sklearn.decomposition import PCA\n",
        "\n",
        "# Reduce embeddings to 2D for visualization using PCA.\n",
        "# The same PCA object is refit for the second set, so each panel has its own projection.\n",
        "pca = PCA(n_components=2)\n",
        "flattened_reduced = pca.fit_transform(flattened_embeddings)\n",
        "collected_reduced = pca.fit_transform(collected_embeddings)\n",
        "\n",
        "# Side-by-side scatter plots, points colored by cluster label\n",
        "fig, (ax_truth, ax_collected) = plt.subplots(1, 2, figsize=(10, 5))\n",
        "\n",
        "ax_truth.scatter(flattened_reduced[:, 0], flattened_reduced[:, 1], c=flattened_labels, cmap='rainbow')\n",
        "ax_truth.set_title(\"Ground Truth Background Set Clusters\")\n",
        "\n",
        "ax_collected.scatter(collected_reduced[:, 0], collected_reduced[:, 1], c=collected_labels, cmap='rainbow')\n",
        "ax_collected.set_title(\"Collected Background Set Clusters\")\n",
        "\n",
        "plt.show()\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.decomposition import PCA\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import os\n",
        "\n",
        "# Ensure the directory exists\n",
        "output_dir = 'Data'\n",
        "output_file = 'background_clusters.png'\n",
        "os.makedirs(output_dir, exist_ok=True)\n",
        "\n",
        "# Reduce embeddings to 2D for visualization using PCA.\n",
        "# Each fit_transform call refits `pca`, so the two panels use separate projections.\n",
        "pca = PCA(n_components=2)\n",
        "flattened_reduced = pca.fit_transform(flattened_embeddings)\n",
        "collected_reduced = pca.fit_transform(collected_embeddings)\n",
        "\n",
        "# Get unique labels for legend\n",
        "flattened_unique_labels = np.unique(flattened_labels)\n",
        "collected_unique_labels = np.unique(collected_labels)\n",
        "\n",
        "# One color per cluster id, looked up by the id itself rather than by loop position,\n",
        "# so an id keeps its color in both panels and a set with more (or different) ids\n",
        "# than the other cannot index past the end of the color table.\n",
        "all_labels = np.union1d(flattened_unique_labels, collected_unique_labels)\n",
        "colors = plt.cm.rainbow(np.linspace(0, 1, len(all_labels)))\n",
        "color_by_label = dict(zip(all_labels.tolist(), colors))\n",
        "\n",
        "# Create a figure and subplots\n",
        "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))\n",
        "\n",
        "# Draw both panels with the same routine: one scatter call per cluster id\n",
        "panels = [\n",
        "    (ax1, flattened_reduced, flattened_labels, flattened_unique_labels,\n",
        "     \"Ground Truth Background Set Clusters\"),\n",
        "    (ax2, collected_reduced, collected_labels, collected_unique_labels,\n",
        "     \"Collected Background Set Clusters\"),\n",
        "]\n",
        "for ax, reduced, point_labels, unique_labels, title in panels:\n",
        "    for label in unique_labels.tolist():\n",
        "        label_mask = point_labels == label\n",
        "        ax.scatter(reduced[label_mask, 0], reduced[label_mask, 1],\n",
        "                   color=color_by_label[label], label=f'Cluster {label}')\n",
        "    ax.set(title=title, xlabel=\"PCA component 1\", ylabel=\"PCA component 2\")\n",
        "\n",
        "# Add one legend outside the subplots, covering every cluster drawn in either panel\n",
        "legend_entries = {}\n",
        "for ax in (ax1, ax2):\n",
        "    for handle, text in zip(*ax.get_legend_handles_labels()):\n",
        "        legend_entries.setdefault(text, handle)\n",
        "handles, labels = list(legend_entries.values()), list(legend_entries.keys())\n",
        "fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=max(1, len(labels)))\n",
        "\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save the figure to the specified file with bbox_inches='tight' to include the legend\n",
        "output_path = os.path.join(output_dir, output_file)\n",
        "fig.savefig(output_path, bbox_inches='tight')\n",
        "\n",
        "# Display the plot\n",
        "plt.show()\n"
      ],
      "metadata": {
        "id": "Pwl0kWFXYVyY"
      },
      "id": "Pwl0kWFXYVyY",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.decomposition import PCA\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import os\n",
        "\n",
        "# Ensure the directory exists\n",
        "# NOTE(review): same output path as the previous plotting cell, so running both\n",
        "# leaves only this version of the image on disk - confirm that is intended.\n",
        "output_dir = 'Data'\n",
        "output_file = 'background_clusters.png'\n",
        "os.makedirs(output_dir, exist_ok=True)\n",
        "\n",
        "# Reduce embeddings to 2D for visualization using PCA.\n",
        "# Each fit_transform call refits `pca`, so the two panels use separate projections.\n",
        "pca = PCA(n_components=2)\n",
        "flattened_reduced = pca.fit_transform(flattened_embeddings)\n",
        "collected_reduced = pca.fit_transform(collected_embeddings)\n",
        "\n",
        "# Suggested cluster titles\n",
        "# NOTE(review): the same titles label both panels; if the two label arrays come from\n",
        "# separate clusterings, id k need not describe the same theme in both - confirm.\n",
        "cluster_titles = {\n",
        "    0: \"Work Struggles\",\n",
        "    1: \"Family Dynamics\",\n",
        "    2: \"Mental Health\",\n",
        "    3: \"Societal Challenges\",\n",
        "    4: \"Systemic Critiques\"\n",
        "}\n",
        "\n",
        "# Get unique labels for legend\n",
        "flattened_unique_labels = np.unique(flattened_labels)\n",
        "collected_unique_labels = np.unique(collected_labels)\n",
        "\n",
        "# One color per cluster id, looked up by the id itself rather than by loop position,\n",
        "# so an id keeps its color in both panels and a set with more (or different) ids\n",
        "# than the other cannot index past the end of the color table.\n",
        "all_labels = np.union1d(flattened_unique_labels, collected_unique_labels)\n",
        "colors = plt.cm.rainbow(np.linspace(0, 1, len(all_labels)))\n",
        "color_by_label = dict(zip(all_labels.tolist(), colors))\n",
        "\n",
        "# Create a figure and subplots\n",
        "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))\n",
        "\n",
        "# Draw both panels with the same routine: one scatter call per cluster id\n",
        "panels = [\n",
        "    (ax1, flattened_reduced, flattened_labels, flattened_unique_labels,\n",
        "     \"Ground Truth Background Set Clusters\"),\n",
        "    (ax2, collected_reduced, collected_labels, collected_unique_labels,\n",
        "     \"Collected Background Set Clusters\"),\n",
        "]\n",
        "for ax, reduced, point_labels, unique_labels, title in panels:\n",
        "    for label in unique_labels.tolist():\n",
        "        label_mask = point_labels == label\n",
        "        # Ids without a suggested title fall back to a generic name instead of raising KeyError.\n",
        "        legend_text = cluster_titles.get(label, f'Cluster {label}')\n",
        "        ax.scatter(reduced[label_mask, 0], reduced[label_mask, 1],\n",
        "                   color=color_by_label[label], label=legend_text)\n",
        "    ax.set(title=title, xlabel=\"PCA component 1\", ylabel=\"PCA component 2\")\n",
        "\n",
        "# Add one legend outside the subplots, covering every cluster drawn in either panel\n",
        "legend_entries = {}\n",
        "for ax in (ax1, ax2):\n",
        "    for handle, text in zip(*ax.get_legend_handles_labels()):\n",
        "        legend_entries.setdefault(text, handle)\n",
        "handles, labels = list(legend_entries.values()), list(legend_entries.keys())\n",
        "fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=max(1, len(labels)))\n",
        "\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save the figure to the specified file with bbox_inches='tight' to include the legend\n",
        "output_path = os.path.join(output_dir, output_file)\n",
        "fig.savefig(output_path, bbox_inches='tight')\n",
        "\n",
        "# Display the plot\n",
        "plt.show()\n"
      ],
      "metadata": {
        "id": "lxy7vjviG9xu"
      },
      "id": "lxy7vjviG9xu",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1d542c7-db15-4574-863b-eba4389e375d",
      "metadata": {
        "tags": [],
        "id": "c1d542c7-db15-4574-863b-eba4389e375d"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "\n",
        "# Pairwise cosine similarity: one row per flattened sentence, one column per collected sentence\n",
        "similarity_matrix = cosine_similarity(flattened_embeddings, collected_embeddings)\n",
        "\n",
        "# Column of the best match for every row, then report each pair\n",
        "best_match_indices = similarity_matrix.argmax(axis=1)\n",
        "for row, best in enumerate(best_match_indices):\n",
        "    best_score = similarity_matrix[row, best]\n",
        "    print(f\"Most similar sentence to '{flattened_list[row]}' in the collected set is '{collected_list[best]}' with a similarity score of {best_score:.2f}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "import numpy as np\n",
        "\n",
        "def get_representative_sentences(sentences, embeddings, labels, num_clusters):\n",
        "    \"\"\"Print, for each cluster, the sentence most similar to the cluster's mean embedding.\n",
        "\n",
        "    Args:\n",
        "        sentences: Sequence of sentences.\n",
        "        embeddings: Embeddings parallel to ``sentences`` (one row per sentence).\n",
        "        labels: Cluster id of every sentence, parallel to ``sentences``.\n",
        "        num_clusters: Cluster ids ``0 .. num_clusters - 1`` are reported.\n",
        "\n",
        "    Returns:\n",
        "        None. One line per cluster id is printed.\n",
        "    \"\"\"\n",
        "    embeddings = np.asarray(embeddings)\n",
        "    labels = np.asarray(labels)\n",
        "    for cluster_id in range(num_clusters):\n",
        "        # Positions of the sentences that belong to the current cluster\n",
        "        member_positions = np.flatnonzero(labels == cluster_id)\n",
        "        if member_positions.size == 0:\n",
        "            # An empty cluster has no mean embedding; without this guard the\n",
        "            # centroid is NaN and cosine_similarity raises.\n",
        "            print(f\"Cluster {cluster_id}: (no sentences)\")\n",
        "            continue\n",
        "\n",
        "        cluster_embeddings = embeddings[member_positions]\n",
        "\n",
        "        # Calculate the centroid of the cluster\n",
        "        centroid = np.mean(cluster_embeddings, axis=0)\n",
        "\n",
        "        # Cosine similarity between the centroid and each member embedding\n",
        "        similarities = cosine_similarity([centroid], cluster_embeddings)[0]\n",
        "\n",
        "        # The most representative sentence is the member closest to the centroid\n",
        "        most_representative_sentence = sentences[member_positions[np.argmax(similarities)]]\n",
        "\n",
        "        # Print the representative sentence to help name the cluster\n",
        "        print(f\"Cluster {cluster_id}: {most_representative_sentence}\")\n",
        "\n",
        "# Example of how to use it:\n",
        "get_representative_sentences(flattened_list, flattened_embeddings, flattened_labels, num_clusters)\n"
      ],
      "metadata": {
        "id": "5vJTIQrTLFjy"
      },
      "id": "5vJTIQrTLFjy",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Cluster 0: \"Job Loss and Financial Decline\"\n",
        "This cluster focuses on the impact of job loss, financial instability, and long-term struggles within a family due to economic downturns.\n",
        "\n",
        "Cluster 1: \"Social Disparities and Embarrassment\"\n",
        "This cluster highlights the emotional and social struggles of feeling different or left out due to financial limitations, particularly in relation to peers.\n",
        "\n",
        "Cluster 2: \"Basic Necessities: Food and Shelter Insecurity\"\n",
        "The primary theme here is the struggle to afford basic needs like food and housing, emphasizing extreme financial hardship.\n",
        "\n",
        "Cluster 3: \"Poverty and Personal Identity\"\n",
        "This cluster revolves around the individual's self-identification with poverty, where the person's sense of being poor is central.\n",
        "\n",
        "Cluster 4: \"Education and Financial Barriers\"\n",
        "The focus of this cluster is on the financial challenges of pursuing higher education, with emphasis on the need for loans and the lack of family resources for support."
      ],
      "metadata": {
        "id": "vd9NDJMFNKV8"
      },
      "id": "vd9NDJMFNKV8"
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Another way: detecting new clusters by comparing centroids"
      ],
      "metadata": {
        "id": "V5bM7YmROCCt"
      },
      "id": "V5bM7YmROCCt"
    },
    {
      "cell_type": "code",
      "source": [
        "from sentence_transformers import SentenceTransformer\n",
        "\n",
        "# Load the pre-trained Sentence-BERT model\n",
        "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
        "\n",
        "# Embed both sentence sets: the original one first, the collected one second\n",
        "flattened_embeddings, collected_embeddings = [\n",
        "    model.encode(list(sentence_set))\n",
        "    for sentence_set in (flattened_background_set, collected_flattened_background_set)\n",
        "]\n"
      ],
      "metadata": {
        "id": "xfvo_yaQLFXx"
      },
      "id": "xfvo_yaQLFXx",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.cluster import KMeans\n",
        "\n",
        "# Number of clusters to use (can adjust based on dataset size)\n",
        "num_clusters = 5\n",
        "\n",
        "# Fit one KMeans model per set; labels_ holds the cluster of every training row\n",
        "kmeans_flattened = KMeans(n_clusters=num_clusters, random_state=42).fit(flattened_embeddings)\n",
        "flattened_labels = kmeans_flattened.labels_\n",
        "\n",
        "kmeans_collected = KMeans(n_clusters=num_clusters, random_state=42).fit(collected_embeddings)\n",
        "collected_labels = kmeans_collected.labels_\n"
      ],
      "metadata": {
        "id": "c-6fk5ZnLFNx"
      },
      "id": "c-6fk5ZnLFNx",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d726807f-670c-4f2f-b74e-d0d725b7b85a",
      "metadata": {
        "id": "d726807f-670c-4f2f-b74e-d0d725b7b85a"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics.pairwise import cosine_similarity\n",
        "import numpy as np\n",
        "\n",
        "# Centroids of the two fitted KMeans models\n",
        "centroids_flattened = kmeans_flattened.cluster_centers_\n",
        "centroids_collected = kmeans_collected.cluster_centers_\n",
        "\n",
        "# Rows: flattened-set centroids, columns: collected-set centroids\n",
        "similarity_matrix = cosine_similarity(centroids_flattened, centroids_collected)\n",
        "\n",
        "# A collected cluster counts as \"new\" when even its best-matching flattened centroid\n",
        "# stays below the similarity threshold\n",
        "threshold = 0.7\n",
        "best_match_per_collected = similarity_matrix.max(axis=0)\n",
        "new_cluster_indices = np.flatnonzero(best_match_per_collected < threshold)\n",
        "\n",
        "# Print the indices of the new clusters in the collected set\n",
        "print(\"New clusters in the collected set:\", new_cluster_indices)\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Function to get sentences from specific clusters\n",
        "def get_sentences_from_cluster(sentences, labels, cluster_id):\n",
        "    return [sentences[i] for i in range(len(sentences)) if labels[i] == cluster_id]\n",
        "\n",
        "# Gather the sentences of every \"new\" cluster of the collected set\n",
        "collected_sentences = list(collected_flattened_background_set)\n",
        "new_sentences = [\n",
        "    sentence\n",
        "    for cluster_id in new_cluster_indices\n",
        "    for sentence in get_sentences_from_cluster(collected_sentences, collected_labels, cluster_id)\n",
        "]\n",
        "\n",
        "# Print the new sentences that represent new expressions\n",
        "for new_sentence in new_sentences:\n",
        "    print(new_sentence)\n"
      ],
      "metadata": {
        "id": "qAZu3297OLo7"
      },
      "id": "qAZu3297OLo7",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Embed the sentences gathered from the \"new\" clusters\n",
        "new_sentences_embeddings = model.encode(sentences=new_sentences)\n"
      ],
      "metadata": {
        "id": "lWMuZrRgOO8K"
      },
      "id": "lWMuZrRgOO8K",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.cluster import KMeans\n",
        "\n",
        "# KMeans cannot form more clusters than there are samples, so the requested 5\n",
        "# clusters are capped by the number of new sentences.\n",
        "num_clusters = min(5, len(new_sentences))\n",
        "kmeans_new = KMeans(n_clusters=num_clusters, random_state=42)\n",
        "new_sentence_labels = kmeans_new.fit_predict(new_sentences_embeddings)\n",
        "\n",
        "# Print the clustered sentences\n",
        "def print_clusters(sentences, labels, num_clusters):\n",
        "    for cluster_id in range(num_clusters):\n",
        "        print(f\"\\nCluster {cluster_id}:\")\n",
        "        for i, label in enumerate(labels):\n",
        "            if label == cluster_id:\n",
        "                print(f\"- {sentences[i]}\")\n",
        "\n",
        "# Show every KMeans cluster of the new sentences\n",
        "print_clusters(sentences=new_sentences, labels=new_sentence_labels, num_clusters=num_clusters)\n"
      ],
      "metadata": {
        "id": "p_hFyi-NO4d8"
      },
      "id": "p_hFyi-NO4d8",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from scipy.cluster.hierarchy import linkage, fcluster, dendrogram\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Perform hierarchical clustering (Ward's method)\n",
        "Z = linkage(new_sentences_embeddings, method='ward')\n",
        "\n",
        "# Assign clusters based on a distance threshold\n",
        "max_distance = 1.5  # You can adjust this threshold based on your data\n",
        "new_sentence_labels_hierarchical = fcluster(Z, max_distance, criterion='distance')\n",
        "\n",
        "# Optionally, plot the dendrogram to visualize clustering\n",
        "plt.figure(figsize=(10, 7))\n",
        "dendrogram(Z)\n",
        "plt.show()\n",
        "\n",
        "# fcluster numbers its flat clusters from 1, while print_clusters walks ids 0..n-1.\n",
        "# Shift the ids to 0-based for printing so the first group is not empty and the\n",
        "# last one is not left out.\n",
        "num_hierarchical_clusters = len(set(new_sentence_labels_hierarchical))\n",
        "print_clusters(new_sentences, new_sentence_labels_hierarchical - 1, num_hierarchical_clusters)\n"
      ],
      "metadata": {
        "id": "TI3mQiWaO8L8"
      },
      "id": "TI3mQiWaO8L8",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# new sentences"
      ],
      "metadata": {
        "id": "yA-XQEckRyKc"
      },
      "id": "yA-XQEckRyKc",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def print_grouped_sentences(sentences, labels, num_clusters, themes):\n",
        "    for cluster_id in range(num_clusters):\n",
        "        print(f\"\\nTheme: {themes[cluster_id]}\")\n",
        "        for i, label in enumerate(labels):\n",
        "            if label == cluster_id:\n",
        "                print(f\"- {sentences[i]}\")\n",
        "\n",
        "# Example themes for the clusters (can be adjusted based on the content)\n",
        "themes = [\n",
        "    \"Financial Struggles\",\n",
        "    \"Family Issues\",\n",
        "    \"Educational Barriers\",\n",
        "    \"Mental Health\",\n",
        "    \"Social Inequality\",\n",
        "]\n",
        "\n",
        "# Display the grouped sentences with themes\n",
        "print_grouped_sentences(new_sentences, new_sentence_labels, num_clusters, themes)\n"
      ],
      "metadata": {
        "id": "kL0TOHkIPBj5"
      },
      "id": "kL0TOHkIPBj5",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The proposed algorithm identifies and groups new sentences introduced in the collected dataset based on their semantic similarity. First, we leverage Sentence-BERT, a pre-trained transformer model, to generate sentence embeddings that capture the underlying meaning of each sentence. These embeddings are then used as input for clustering algorithms like KMeans or Hierarchical Clustering to group semantically similar sentences together. After clustering, we analyze the sentences within each cluster to identify common themes or expressions. Finally, each cluster is assigned a descriptive label (e.g., \"Financial Struggles\" or \"Emotional Challenges\") based on the content of its sentences. This process reveals how the collected dataset introduces new ways of expressing backgrounds, expanding the thematic representation beyond the original dataset."
      ],
      "metadata": {
        "id": "SRfaR3yhQTiw"
      },
      "id": "SRfaR3yhQTiw"
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Checking whether collected-data themes appear in the original data\n",
        "To check whether the themes identified in the collected data are present or absent in the original data, we apply the following approach:"
      ],
      "metadata": {
        "id": "fSMw-5LXR4Ty"
      },
      "id": "fSMw-5LXR4Ty"
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.cluster import KMeans\n",
        "\n",
        "# Number of themes (clusters) to extract; adjust based on your data.\n",
        "num_clusters = 5\n",
        "\n",
        "# Embed every collected (new) sentence with the Sentence-BERT encoder `model`.\n",
        "collected_embeddings = model.encode([*collected_flattened_background_set])\n",
        "\n",
        "# Fit KMeans on the collected embeddings only. The fixed random_state keeps the\n",
        "# centroid initialisation repeatable for a given input.\n",
        "kmeans_new = KMeans(n_clusters=num_clusters, random_state=42)\n",
        "\n",
        "# fit() returns the estimator itself, so the per-sentence cluster labels can be\n",
        "# read straight off the fitted model (one label per collected sentence).\n",
        "collected_labels = kmeans_new.fit(collected_embeddings).labels_"
      ],
      "metadata": {
        "id": "iAx7IdiSR4Fe"
      },
      "id": "iAx7IdiSR4Fe",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Embed the original sentences with the same encoder (`model`) that produced\n",
        "# the collected embeddings, so both sets share one vector space.\n",
        "flattened_embeddings = model.encode([*flattened_background_set])\n",
        "\n",
        "# Assign each original sentence to a cluster of the KMeans model that was\n",
        "# fitted on the collected data. predict() picks the closest centroid, so every\n",
        "# sentence receives a label even when it is far from all centroids.\n",
        "original_labels = kmeans_new.predict(flattened_embeddings)\n"
      ],
      "metadata": {
        "id": "QnpoXUvzPo1d"
      },
      "id": "QnpoXUvzPo1d",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "\n",
        "# Tally the sentences per cluster (theme) for each dataset. minlength pads the\n",
        "# result so a cluster that received no sentences still shows up with count 0.\n",
        "original_cluster_distribution = np.bincount(original_labels, minlength=num_clusters)\n",
        "collected_cluster_distribution = np.bincount(collected_labels, minlength=num_clusters)\n",
        "\n",
        "# Report both tallies side by side, one line per cluster.\n",
        "for cluster_idx in range(num_clusters):\n",
        "    n_original = original_cluster_distribution[cluster_idx]\n",
        "    n_collected = collected_cluster_distribution[cluster_idx]\n",
        "    print(f\"Cluster {cluster_idx}: {n_original} sentences from original data, {n_collected} from collected data\")\n"
      ],
      "metadata": {
        "id": "STkQTcNZTgDM"
      },
      "id": "STkQTcNZTgDM",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Collect the clusters (themes) to which no original sentence was assigned.\n",
        "# Only an exact count of zero qualifies; sparsely populated clusters are not\n",
        "# reported here.\n",
        "missing_themes = [\n",
        "    cluster_idx\n",
        "    for cluster_idx in range(num_clusters)\n",
        "    if original_cluster_distribution[cluster_idx] == 0\n",
        "]\n",
        "\n",
        "# Show which cluster indices are absent from the original data\n",
        "print(f\"Clusters (themes) not represented in the original data: {missing_themes}\")\n"
      ],
      "metadata": {
        "id": "w1B4m8-9TiV0"
      },
      "id": "w1B4m8-9TiV0",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def print_sentences_by_cluster(sentences, labels, cluster_id):\n",
        "    \"\"\"Print every sentence whose cluster label equals ``cluster_id``.\n",
        "\n",
        "    Args:\n",
        "        sentences: Indexable collection of sentences; ``sentences[i]`` is the\n",
        "            sentence that ``labels[i]`` refers to.\n",
        "        labels: Iterable of cluster labels, one per sentence.\n",
        "        cluster_id: Label of the cluster whose sentences are printed.\n",
        "    \"\"\"\n",
        "    print(f\"Sentences in Cluster {cluster_id}:\")\n",
        "    # Lazily walk the labels and keep only the positions assigned to this cluster.\n",
        "    matching_positions = (\n",
        "        position for position, assigned in enumerate(labels) if assigned == cluster_id\n",
        "    )\n",
        "    for position in matching_positions:\n",
        "        print(f\"- {sentences[position]}\")\n",
        "\n",
        "# Show Cluster 0 as it appears in the original and in the collected data.\n",
        "# The sets are re-listed here; this lines up with the labels only as long as the\n",
        "# sets have not been modified since their embeddings were computed.\n",
        "print(\"Original Data, Cluster 0:\")\n",
        "print_sentences_by_cluster([*flattened_background_set], original_labels, 0)\n",
        "\n",
        "print(\"\\nCollected Data, Cluster 0:\")\n",
        "print_sentences_by_cluster([*collected_flattened_background_set], collected_labels, 0)\n"
      ],
      "metadata": {
        "id": "FehBfYXsTnR3"
      },
      "id": "FehBfYXsTnR3",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Collected Data Cluster 0 shows an expansion of themes around work experience and internships, academic support systems, and mental health/emotional struggles, which are less represented in the original data.\n",
        "The original data focuses more on broad socioeconomic barriers and financial challenges, while the collected data dives deeper into the specific experiences of navigating those barriers within the academic environment, offering more granularity in describing coping mechanisms and work-life balance challenges.\n",
        "\n",
        "---\n",
        "\n",
        "**Potential Insights:**\n",
        "\n",
        "- The collected data adds more depth and nuance to the themes present in the original dataset, particularly with regard to specific experiences like internships, rural education, and mental health struggles.\n",
        "- The collected data introduces new challenges related to the academic system, such as issues with faculty, navigating educational bureaucracy, and balancing work and school.\n",
        "- The original data offers a broader but less detailed view of socioeconomic challenges, while the collected data enriches this with more personal and emotionally charged accounts of the same struggles.\n",
        "\n",
        "---\n",
        "\n",
        "The collected dataset significantly expands the representation of background challenges by providing more specific experiences and personal narratives. This demonstrates an enrichment in how students describe their financial, academic, and emotional struggles, showing a more detailed picture of how socioeconomic factors impact educational journeys."
      ],
      "metadata": {
        "id": "TTv9ocfMUtRl"
      },
      "id": "TTv9ocfMUtRl"
    },
    {
      "cell_type": "code",
      "source": [
        "# Cluster inspected in this cell; change this one value to view another cluster.\n",
        "inspected_cluster = 1\n",
        "\n",
        "# Print the sentences assigned to that cluster in the original and collected sets\n",
        "print(f\"Original Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(flattened_background_set), original_labels, inspected_cluster)\n",
        "\n",
        "print(f\"\\nCollected Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(collected_flattened_background_set), collected_labels, inspected_cluster)"
      ],
      "metadata": {
        "id": "nPoQEMTsTy6u"
      },
      "id": "nPoQEMTsTy6u",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**More Diverse and Specific Family Backgrounds, Struggles with Independence and Support, Impact of Broader Social and Economic Systems:**\n",
        "\n",
        "The collected data contains mentions of how systemic issues contribute to individuals' struggles (e.g., \"it's not as if we did not know that we were not investing enough in the education of our poorest\"). This expansion shows a broader awareness of societal-level factors affecting the individuals, which is less present in the original data.\n",
        "\n",
        "**Mental Health and Emotional Toll:**\n",
        "\n",
        "There is a deeper reflection on the emotional and psychological impacts of poverty in the collected data. Sentences like \"it's really quite difficult keeping up a cheerful composure when you know your future is pretty much guaranteed to be spent in poverty\" show a more explicit discussion of the emotional toll, which is less represented in the original data.\n",
        "\n",
        "**Conclusion:**\n",
        "\n",
        "- **Common Ground:** Both datasets heavily feature themes of poverty, family struggles, and lack of financial support, which are central to the backgrounds of low socioeconomic status (SES) individuals.\n",
        "- **New Expansions:** The collected data expands on these themes by adding more specific family stories, including struggles with independence, more cultural backgrounds, and a deeper discussion of the emotional and psychological toll of poverty.\n",
        "- **Representation of Systemic Factors:** The collected data also brings more awareness to systemic and societal issues that exacerbate these challenges, expanding the narrative beyond individual financial hardship.\n",
        "\n",
        "These insights suggest that while the core themes remain consistent across both datasets, the collected data introduces more detailed and diverse representations of family dynamics, personal independence, and societal influences, giving a fuller picture of the experiences of individuals from low SES backgrounds."
      ],
      "metadata": {
        "id": "whcYGsP_Vk4q"
      },
      "id": "whcYGsP_Vk4q"
    },
    {
      "cell_type": "code",
      "source": [
        "# Cluster inspected in this cell; change this one value to view another cluster.\n",
        "inspected_cluster = 2\n",
        "\n",
        "# Print the sentences assigned to that cluster in the original and collected sets\n",
        "print(f\"Original Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(flattened_background_set), original_labels, inspected_cluster)\n",
        "\n",
        "print(f\"\\nCollected Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(collected_flattened_background_set), collected_labels, inspected_cluster)"
      ],
      "metadata": {
        "id": "oZ7ICSMlU-l7"
      },
      "id": "oZ7ICSMlU-l7",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**New or Expanded Themes in the Collected Data:**\n",
        "\n",
        "**Emotional Struggles and Mental Health:**\n",
        "\n",
        "The collected data introduces more detailed reflections on emotional struggles and mental health challenges. For example, one sentence states, \"while wallowing in self-pity,\" indicating deeper expressions of frustration and sadness.\n",
        "Another example of emotional toll in the collected data is: \"i left college it felt like someone knocked out the tent pole in the middle of a circus, and everything is dark...\" which expands on the mental health challenges and feelings of despair linked to financial struggles.\n",
        "\n",
        "**Social Commentary and Systemic Criticism:**\n",
        "\n",
        "The collected data introduces a broader social commentary, touching on systemic issues such as gentrification, disenfranchisement, and income inequality. For instance, the statement \"our bad on the whole 'disenfranchisement' thing that we did to you?\" reflects frustration with societal structures that perpetuate poverty.\n",
        "There's a clearer focus on the socioeconomic system in the collected data, such as in the sentence, \"it is not as if we did not know that we were not investing enough in the education of our poorest.\"\n",
        "\n",
        "**Coping with Poverty Through Creativity and Community:**\n",
        "\n",
        "In the collected data, there are examples of individuals coping with their difficult circumstances through creative problem-solving or relying on community support. For example, \"i found out maybe two weeks ago, i think he passed mid-feb. i was told this by the other streetwise guy\" shows how communities of marginalized people, even in poverty, support one another.\n",
        "The sentence \"my girlfriend and I live just outside of town in a home that uses a wood stove for the only heat source\" reflects a certain resilience in coping with financial hardship by making do with minimal resources.\n",
        "\n",
        "**Expressions of Frustration and Anger:**\n",
        "\n",
        "The collected data brings a more personal tone of frustration and anger regarding the financial and housing struggles, seen in phrases like \"you guys are in a shitty situation\" or \"you fucking with me, guy?\" These expressions of anger are not as prominent in the original data, which tends to focus more on the practical aspects of financial hardship.\n",
        "\n",
        "**Lack of Social Mobility:**\n",
        "\n",
        "The collected data shows a greater emphasis on the lack of social mobility and hopelessness associated with poverty. For example, \"it's really quite difficult keeping up a cheerful composure when you know your future is pretty much guaranteed to be spent in poverty\" highlights a sense of being trapped in a cycle of poverty, a theme less explicitly discussed in the original data.\n",
        "\n",
        "**Conclusion:**\n",
        "\n",
        "- **Common Themes:** Both datasets emphasize housing insecurity, homelessness, and financial struggles, but the collected data expands on these issues with more personal and emotional reflections, as well as a broader social critique of systemic issues.\n",
        "- **New Expansions:** The collected data introduces deeper emotional struggles, anger, and frustration regarding the hardships faced, while also highlighting how individuals and communities attempt to cope with or resist these challenges. The collected data also introduces more social commentary on the system that perpetuates poverty and disenfranchisement.\n",
        "\n",
        "This comparison suggests that while both datasets focus on material struggles, the collected data provides a richer narrative that includes emotional, social, and systemic perspectives, offering a more comprehensive view of the experience of poverty and financial hardship."
      ],
      "metadata": {
        "id": "3RuAi9VvWCy_"
      },
      "id": "3RuAi9VvWCy_"
    },
    {
      "cell_type": "code",
      "source": [
        "# Cluster inspected in this cell; change this one value to view another cluster.\n",
        "inspected_cluster = 3\n",
        "\n",
        "# Print the sentences assigned to that cluster in the original and collected sets\n",
        "print(f\"Original Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(flattened_background_set), original_labels, inspected_cluster)\n",
        "\n",
        "print(f\"\\nCollected Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(collected_flattened_background_set), collected_labels, inspected_cluster)"
      ],
      "metadata": {
        "id": "V7PxnZYHVC0T"
      },
      "id": "V7PxnZYHVC0T",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Cluster inspected in this cell; change this one value to view another cluster.\n",
        "inspected_cluster = 4\n",
        "\n",
        "# Print the sentences assigned to that cluster in the original and collected sets\n",
        "print(f\"Original Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(flattened_background_set), original_labels, inspected_cluster)\n",
        "\n",
        "print(f\"\\nCollected Data, Cluster {inspected_cluster}:\")\n",
        "print_sentences_by_cluster(list(collected_flattened_background_set), collected_labels, inspected_cluster)"
      ],
      "metadata": {
        "id": "cO6mdBrsVFZI"
      },
      "id": "cO6mdBrsVFZI",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "**New or Expanded Themes in the Collected Data:**\n",
        "\n",
        "**Increased Focus on Systemic Issues:**\n",
        "\n",
        "The collected data reflects more detailed critiques of systemic issues affecting students, such as the cost of education, the student debt crisis, and the unrealistic promises of higher education as a golden ticket to success:\n",
        "\n",
        "- \"I wish our generation were not forced into over-educated servitude of the retiring baby boomers.\"\n",
        "- \"I'm going to be blunt here... people are promised that college is the golden ticket to getting a good job – and that's just not true.\"\n",
        "\n",
        "**Criticism of Student Employment Practices:**\n",
        "\n",
        "The collected data introduces new themes around student employment practices, such as being underpaid or exploited in jobs meant to support their education:\n",
        "\n",
        "- \"Working with the 'regulars' was awful... their contempt for students really was poisonous.\"\n",
        "- \"Students are the managers, so it's a big favoritism game.\"\n",
        "\n",
        "**Career Concerns Post-Graduation:**\n",
        "\n",
        "The collected data also introduces more concerns about job prospects post-graduation and how debt influences decisions about which fields to enter:\n",
        "\n",
        "- \"I'm going to be blunt here... fear that you'll end up in a job post-graduation that didn't even require a college degree.\"\n",
        "- \"Man, I wish they would take that energy and focus it on making UT a school wherein people without a high income could attend without going 100k into debt.\"\n",
        "\n",
        "**Conclusion:**\n",
        "\n",
        "- **Shared Struggles:** Both datasets emphasize the hardships of balancing work and education, financial constraints, and the non-linear paths that many students are forced to take due to external pressures. These are fundamental struggles faced by students from low socioeconomic backgrounds.\n",
        "- **Expanded Representation in Collected Data:** The collected data adds a richer, more detailed critique of the systemic issues facing students, particularly the burden of student debt and how employment and career prospects are influenced by economic barriers.\n",
        "\n",
        "By comparing both datasets, it is evident that while the core struggles remain consistent, the collected data expands the narrative by incorporating broader societal issues and adding emotional depth to the financial and work-related challenges faced by students."
      ],
      "metadata": {
        "id": "2tsuWnQJWbSs"
      },
      "id": "2tsuWnQJWbSs"
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "qgMmIFOcVKgQ"
      },
      "id": "qgMmIFOcVKgQ",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "myenv",
      "language": "python",
      "name": "myenv"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.14"
    },
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 5
}