{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "09875178-b468-4656-af14-201fcb71189c",
      "metadata": {
        "id": "09875178-b468-4656-af14-201fcb71189c"
      },
      "source": [
        "# get all texts that are low SES"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7bb03ccb-b1e5-4583-b8ce-6fbd97605333",
      "metadata": {
        "tags": [],
        "id": "7bb03ccb-b1e5-4583-b8ce-6fbd97605333"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import nltk\n",
        "import os"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ed9b6f6a-264b-421f-9d2c-ebe640e008df",
      "metadata": {
        "id": "ed9b6f6a-264b-421f-9d2c-ebe640e008df"
      },
      "source": [
        "**getting 64-original-text**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "20b4da1f-8b25-49e0-a4c6-46ba11fed38d",
      "metadata": {
        "tags": [],
        "id": "20b4da1f-8b25-49e0-a4c6-46ba11fed38d"
      },
      "outputs": [],
      "source": [
        "\n",
        "# Preprocessing text data\n",
        "nltk.download('stopwords')\n",
        "nltk.download('punkt')  # Ensure 'punkt' is downloaded\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.tokenize import word_tokenize\n",
        "stop_words = set(stopwords.words('english'))\n",
        "\n",
        "def preprocess_text(text):\n",
        "    tokens = word_tokenize(text.lower())  # Tokenization and lowercasing\n",
        "    tokens = [word for word in tokens if word.isalpha()]  # Remove non-alphabetic tokens\n",
        "    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords\n",
        "    return ' '.join(tokens)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3a333368-12b2-4f09-ab26-c9da08e76c61",
      "metadata": {
        "tags": [],
        "id": "3a333368-12b2-4f09-ab26-c9da08e76c61"
      },
      "outputs": [],
      "source": [
        "\n",
        "path = \"Data/GroundTruthFilter/\"\n",
        "files = [file for file in os.listdir(path) if not file.startswith('.')]\n",
        "\n",
        "ground_truth_texts = []\n",
        "\n",
        "for file_name in files:\n",
        "    with open(path+file_name, 'r') as f:\n",
        "        output = f.read()\n",
        "        ground_truth_texts.append(output)\n",
        "\n",
        "# ground_truth_texts= [preprocess_text(item) for item in ground_truth_texts]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6c469a7a-34dc-418c-8258-ee969d1f0e5a",
      "metadata": {
        "tags": [],
        "id": "6c469a7a-34dc-418c-8258-ee969d1f0e5a"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1788f854-4740-4630-81a7-31f3aea3d013",
      "metadata": {
        "tags": [],
        "id": "1788f854-4740-4630-81a7-31f3aea3d013"
      },
      "outputs": [],
      "source": [
        "df = pd.read_excel('Data/final_collected_data_from_tsne_35_no_outliers.xlsx')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "84999270-710f-4d4d-a315-b676015df811",
      "metadata": {
        "tags": [],
        "id": "84999270-710f-4d4d-a315-b676015df811"
      },
      "outputs": [],
      "source": [
        "iteration_1_low_ses_text = df[df['Low SES Check'] ==1]['text']\n",
        "iteration_1_low_ses_tokenized_text = df[df['Low SES Check'] ==1]['tokenized_text']\n",
        "iteration_1_not_low_ses_tokenized_text = df[df['Low SES Check'] ==0]['tokenized_text']\n",
        "iteration_1_not_low_ses_text = df[df['Low SES Check'] ==0]['text']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "53f03b01-c81f-490a-a03d-a355ae72f360",
      "metadata": {
        "tags": [],
        "id": "53f03b01-c81f-490a-a03d-a355ae72f360"
      },
      "outputs": [],
      "source": [
        "iteration_2_checked_data_all_fields = pd.read_csv('Data/iteration_2_checked_data_all_fields.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a736d630-d739-48fa-b936-72bc30aabe04",
      "metadata": {
        "tags": [],
        "id": "a736d630-d739-48fa-b936-72bc30aabe04"
      },
      "outputs": [],
      "source": [
        "iteration_2_SES_texts = iteration_2_checked_data_all_fields['tokenized_body']\n",
        "iteration_2_SES_texts_not_tokenized = iteration_2_checked_data_all_fields['body']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "47742773-f714-4125-a2fe-6f7961a1f2a5",
      "metadata": {
        "tags": [],
        "id": "47742773-f714-4125-a2fe-6f7961a1f2a5"
      },
      "outputs": [],
      "source": [
        "len(ground_truth_texts), len(iteration_1_low_ses_tokenized_text), len(iteration_2_SES_texts)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9819d439-c7fe-4f97-b64e-73a9a31865f7",
      "metadata": {
        "tags": [],
        "id": "9819d439-c7fe-4f97-b64e-73a9a31865f7"
      },
      "outputs": [],
      "source": [
        "len(ground_truth_texts),len(iteration_1_low_ses_text),len(iteration_2_SES_texts_not_tokenized)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c99107c1-b2e6-495b-a7f6-5b738b9da2ae",
      "metadata": {
        "tags": [],
        "id": "c99107c1-b2e6-495b-a7f6-5b738b9da2ae"
      },
      "outputs": [],
      "source": [
        "64+110+167"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "2daee4ff-77de-4210-b21e-3245b0278372",
      "metadata": {
        "id": "2daee4ff-77de-4210-b21e-3245b0278372"
      },
      "source": [
        "# **Prompt Engineering**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7bb9e3f1-57f0-48cf-8923-2d92c632eb6f",
      "metadata": {
        "tags": [],
        "id": "7bb9e3f1-57f0-48cf-8923-2d92c632eb6f"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "import torch\n",
        "from transformers import LlamaTokenizer, LlamaForCausalLM, pipeline\n",
        "\n",
        "tokenizer = LlamaTokenizer.from_pretrained(\"meta-llama/Llama-2-7b-chat-hf\")\n",
        "model = LlamaForCausalLM.from_pretrained(\"meta-llama/Llama-2-7b-chat-hf\", torch_dtype=torch.float16)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f48534f6-757d-4970-aad4-e864426fc04c",
      "metadata": {
        "tags": [],
        "id": "f48534f6-757d-4970-aad4-e864426fc04c"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import pipeline\n",
        "import json\n",
        "\n",
        "# Set up the pipeline with GPU/CPU support\n",
        "device = 0 if torch.cuda.is_available() else -1  # Use GPU if available, else CPU\n",
        "llama_pipeline = pipeline(\n",
        "    \"text-generation\",\n",
        "    model=model,\n",
        "    tokenizer=tokenizer,\n",
        "    max_new_tokens=300,  # Adjusted for appropriate token size\n",
        "    do_sample=False,  # Disable sampling for more deterministic outputs\n",
        "    temperature=0.5,  # Lower temperature for less randomness\n",
        "    device=device  # Specify GPU/CPU usage\n",
        ")\n",
        "\n",
        "def extract_struggles(text):\n",
        "    # Construct the prompt to extract only struggles information with full criteria\n",
        "    prompt = f\"\"\"\n",
        "    All the texts provided are written by low SES (socioeconomic status) students who are writing about their struggles.\n",
        "\n",
        "    The following criteria have been used to verify that each text is from a low SES student:\n",
        "    1. **Relevance of the content**: The narratives need to shed light on the experience of being a low-SES student and attending higher education, focusing on financial, psychological, physical, or social struggles.\n",
        "    2. **Descriptive Detail**: The narratives should describe the challenges faced by individuals with low-SES backgrounds, their efforts to improve their situation, and the outcomes of those efforts.\n",
        "    3. **Exclusion of General Commentary**: Narratives that primarily offered general commentary, described a condition, or provided advice were excluded, as they did not qualify as valid data points.\n",
        "\n",
        "    Important: Extract the following information **exactly** from the text without adding or changing any words. Use only direct quotes from the text:\n",
        "    - **Struggles** they faced (directly quoted from the text)\n",
        "\n",
        "    Text:\n",
        "    {text}\n",
        "\n",
        "    Output valid JSON with only direct quotes related to struggles:\n",
        "    \"\"\"\n",
        "\n",
        "    # Generate the output using llama_pipeline\n",
        "    output = llama_pipeline(prompt, max_new_tokens=300, do_sample=False, temperature=0.5)\n",
        "\n",
        "    # Extract and return the generated text\n",
        "    generated_text = output[0]['generated_text']\n",
        "    # print(\"Generated Text:\", generated_text)\n",
        "\n",
        "    # Attempt to parse JSON from generated text\n",
        "    try:\n",
        "        # Find the JSON part of the generated text\n",
        "        json_start = generated_text.find('{')\n",
        "        json_end = generated_text.rfind('}') + 1\n",
        "        # Extract the JSON string\n",
        "        json_str = generated_text[json_start:json_end]\n",
        "\n",
        "        # Parse the extracted JSON\n",
        "        extracted_json = json.loads(json_str)\n",
        "        # print(extracted_json)\n",
        "        # Ensure that the JSON contains the 'struggles' field\n",
        "\n",
        "        struggles = extracted_json.get('struggles', \"\")\n",
        "        return struggles if struggles else \"No struggles found\"\n",
        "\n",
        "    except (json.JSONDecodeError, ValueError):\n",
        "        print(\"Invalid JSON or no valid struggles found.\")\n",
        "        return \"No struggles found\"\n",
        "\n",
        "# Example usage:\n",
        "text_example = \"\"\"\n",
        "My struggles during college were overwhelming. Financial aid couldn't cover everything.\n",
        "I had to work multiple jobs while managing my coursework, and it often interfered with my ability to study.\n",
        "\"\"\"\n",
        "struggles = extract_struggles(text_example)\n",
        "print(\"Extracted Struggles:\", struggles)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "72f1eed4-b6b5-45c1-99e4-b6e27c8f9176",
      "metadata": {
        "tags": [],
        "id": "72f1eed4-b6b5-45c1-99e4-b6e27c8f9176"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d3d99e81-34e7-47b3-957a-7408fadae7e1",
      "metadata": {
        "tags": [],
        "id": "d3d99e81-34e7-47b3-957a-7408fadae7e1"
      },
      "outputs": [],
      "source": [
        "def extract_solutions(text):\n",
        "    # Construct the prompt to extract only solutions information with full criteria\n",
        "    prompt = f\"\"\"\n",
        "    All the texts provided are written by low SES (socioeconomic status) students who are writing about their struggles.\n",
        "\n",
        "    The following criteria have been used to verify that each text is from a low SES student:\n",
        "    1. **Relevance of the content**: The narratives need to shed light on the experience of being a low-SES student and attending higher education, focusing on financial, psychological, physical, or social struggles.\n",
        "    2. **Descriptive Detail**: The narratives should describe the challenges faced by individuals with low-SES backgrounds, their efforts to improve their situation, and the outcomes of those efforts.\n",
        "    3. **Exclusion of General Commentary**: Narratives that primarily offered general commentary, described a condition, or provided advice were excluded, as they did not qualify as valid data points.\n",
        "\n",
        "    Each narrative had to meet at least one of the first two criteria and also satisfy the third qualification to be included in the dataset. The narratives were doubly annotated by two annotators to ensure consistency and quality of the data.\n",
        "\n",
        "    Important: Extract the following information **exactly** from the text without adding or changing any words. Use only direct quotes from the text:\n",
        "    - **Solutions** or actions they took to address these struggles (directly quoted from the text)\n",
        "\n",
        "    Text:\n",
        "    {text}\n",
        "\n",
        "    valid JSON Output (only with direct quotes from the text):\n",
        "    \"\"\"\n",
        "\n",
        "    # Generate the output using llama_pipeline\n",
        "    output = llama_pipeline(prompt, max_new_tokens=200, do_sample=False, temperature=0.5)\n",
        "\n",
        "    # Extract and return the generated text\n",
        "    generated_text = output[0]['generated_text']\n",
        "\n",
        "    # Attempt to parse JSON from generated text\n",
        "    try:\n",
        "        json_start = generated_text.find('{')\n",
        "        json_end = generated_text.rfind('}') + 1\n",
        "        extracted_json = json.loads(generated_text[json_start:json_end])\n",
        "\n",
        "    except json.JSONDecodeError:\n",
        "        extracted_json = {\"text\": text, \"solutions\": \"\"}\n",
        "\n",
        "    return extracted_json.get('solutions', \"\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2223f5b9-05f7-4bf0-8e9e-311dbed6ba7c",
      "metadata": {
        "tags": [],
        "id": "2223f5b9-05f7-4bf0-8e9e-311dbed6ba7c"
      },
      "outputs": [],
      "source": [
        "import regex as re"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "641463b7-1eec-40e7-8d92-c667587b68e9",
      "metadata": {
        "tags": [],
        "id": "641463b7-1eec-40e7-8d92-c667587b68e9"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "import re\n",
        "\n",
        "def clean_json(json_string):\n",
        "    \"\"\"Remove trailing commas from JSON objects.\"\"\"\n",
        "    # Remove trailing commas that appear before the closing brace\n",
        "    cleaned_json = re.sub(r',\\s*}', '}', json_string)\n",
        "    return cleaned_json\n",
        "\n",
        "def process_output(generated_text):\n",
        "    # Use regex to find all JSON-like structures in the generated text\n",
        "    json_pattern = r'\\{(?:[^{}]|(?:\\{[^{}]*\\}))*\\}'\n",
        "    json_matches = re.findall(json_pattern, generated_text)\n",
        "\n",
        "    # Create an empty list to hold all background texts\n",
        "    background_list = []\n",
        "\n",
        "    for json_match in json_matches:\n",
        "        try:\n",
        "            # Clean the JSON string (remove any trailing commas)\n",
        "            cleaned_json = clean_json(json_match)\n",
        "\n",
        "            # Attempt to load the JSON object from each cleaned match\n",
        "            extracted_json = json.loads(cleaned_json)\n",
        "\n",
        "            # Append the 'background' field to the background_list if it exists\n",
        "            if 'background' in extracted_json:\n",
        "                background_list.append(extracted_json['background'])\n",
        "        except (json.JSONDecodeError, ValueError) as e:\n",
        "            print(f\"Error parsing JSON: {json_match}, Error: {str(e)}\")\n",
        "\n",
        "    # Return the list of backgrounds\n",
        "    return background_list"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "782237c0-6bee-4214-af76-cdde3baa957d",
      "metadata": {
        "tags": [],
        "id": "782237c0-6bee-4214-af76-cdde3baa957d"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import pipeline\n",
        "import json\n",
        "\n",
        "# Check if a GPU is available\n",
        "device = 0 if torch.cuda.is_available() else -1  # Use device 0 for GPU, -1 for CPU\n",
        "\n",
        "# Set up the pipeline with GPU/CPU support\n",
        "llama_pipeline = pipeline(\n",
        "    \"text-generation\",\n",
        "    model=model,\n",
        "    tokenizer=tokenizer,\n",
        "    max_new_tokens=200,  # Reduced number of tokens for faster generation\n",
        "    do_sample=False,  # Disable sampling for faster, deterministic outputs\n",
        "    temperature=0.5,  # Lower temperature for more deterministic output\n",
        "    device=device  # Explicitly specify GPU or CPU\n",
        ")\n",
        "\n",
        "def extract_background(text):\n",
        "    # Updated prompt with emphasis on generating concise outputs\n",
        "    prompt = f\"\"\"\n",
        "    All the texts provided are written by low SES (socioeconomic status) students who are writing about their struggles.\n",
        "\n",
        "    The following criteria have been used to verify that each text is from a low SES student:\n",
        "    1. **Relevance of the content**: The narratives need to shed light on the experience of being a low-SES student and attending higher education, focusing on financial, psychological, physical, or social struggles.\n",
        "    2. **Descriptive Detail**: The narratives should describe the challenges faced by individuals with low-SES backgrounds, their efforts to improve their situation, and the outcomes of those efforts.\n",
        "    3. **Exclusion of General Commentary**: Narratives that primarily offered general commentary, described a condition, or provided advice were excluded, as they did not qualify as valid data points.\n",
        "\n",
        "    Each narrative had to meet at least one of the first two criteria and also satisfy the third qualification to be included in the dataset. The narratives were doubly annotated by two annotators to ensure consistency and quality of the data.\n",
        "\n",
        "    Important: Extract the following information **exactly** from the text without adding or changing any words.\n",
        "    Use only direct quotes from the text:\n",
        "    - **background** or any texts about family situations (directly quoted from the text)\n",
        "\n",
        "    Example Backgrounds:\n",
        "    - \"I, however, am poor\"\n",
        "    - \"My parents could no longer afford to keep me fed with a roof over my head\"\n",
        "    - \"My family could never afford\"\n",
        "\n",
        "    Text:\n",
        "    {text}\n",
        "\n",
        "    valid JSON Output (only with direct quotes from the text):\n",
        "    \"\"\"\n",
        "\n",
        "    # Generate the output using llama_pipeline\n",
        "    output = llama_pipeline(prompt, max_new_tokens=200, do_sample=False, temperature=0.5)\n",
        "\n",
        "    # Extract and return the generated text\n",
        "    generated_text = output[0]['generated_text']\n",
        "\n",
        "    # Debugging step: Print generated text to verify\n",
        "    # print(\"Generated Text: \", generated_text)\n",
        "\n",
        "    # Extract multiple JSON objects and consolidate them into a single list\n",
        "    background_list = []\n",
        "    count = 0\n",
        "    try:\n",
        "        # Split the generated output by \"or\" (assuming this separates multiple JSON objects in your output)\n",
        "        generated_text = generated_text.split(\"valid JSON Output (only with direct quotes from the text):\")[1]\n",
        "        # print(\"new generated #####\",generated_text)\n",
        "        background_list = process_output(generated_text)\n",
        "        return background_list\n",
        "\n",
        "    except Exception as e:\n",
        "        print(f\"Error processing generated text: {str(e)}\")\n",
        "        return {\"background\": []}\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0e29ddbf-bdc4-4c3f-9f4b-4ff7bdc6edcc",
      "metadata": {
        "tags": [],
        "id": "0e29ddbf-bdc4-4c3f-9f4b-4ff7bdc6edcc"
      },
      "outputs": [],
      "source": [
        "input_text = ground_truth_texts[8]\n",
        "\n",
        "# Extract background information\n",
        "background = extract_background(input_text)\n",
        "print(\"Background:\", background)\n",
        "print(\"_____________________\")\n",
        "# # Extract struggles information\n",
        "struggles = extract_struggles(input_text)\n",
        "print(\"Struggles:\", struggles)\n",
        "print(\"_____________________\")\n",
        "# # Extract solutions information\n",
        "solutions = extract_solutions(input_text)\n",
        "print(\"Solutions:\", solutions)\n",
        "print(\"_____________________\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1698bffb-4b47-4c47-84fa-e888c8e6af74",
      "metadata": {
        "tags": [],
        "id": "1698bffb-4b47-4c47-84fa-e888c8e6af74"
      },
      "outputs": [],
      "source": [
        "type(background)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "03a953cd-aa5b-4381-8c01-8ce03c28bcea",
      "metadata": {
        "tags": [],
        "id": "03a953cd-aa5b-4381-8c01-8ce03c28bcea"
      },
      "outputs": [],
      "source": [
        "# background['background']"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "9908015e-3e31-466e-bf49-dd5ab9117112",
      "metadata": {
        "id": "9908015e-3e31-466e-bf49-dd5ab9117112"
      },
      "source": [
        "## **passing_first_iteration_64 data**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ea2f2d6e-b4c3-4d04-ab24-c124ce6362d8",
      "metadata": {
        "tags": [],
        "id": "ea2f2d6e-b4c3-4d04-ab24-c124ce6362d8"
      },
      "outputs": [],
      "source": [
        "# Initialize an empty list to store the extracted data\n",
        "data = []\n",
        "\n",
        "# Loop through all the texts and extract information\n",
        "for text in ground_truth_texts:\n",
        "    # Extract background information\n",
        "    background = extract_background(text)\n",
        "\n",
        "    # Extract struggles information\n",
        "    struggles = extract_struggles(text)\n",
        "\n",
        "    # Extract solutions information\n",
        "    solutions = extract_solutions(text)\n",
        "\n",
        "    # Append the extracted information as a dictionary\n",
        "    data.append({\n",
        "        \"Text\": text,\n",
        "        \"Background\": background,\n",
        "        \"Struggles\": struggles,\n",
        "        \"Solutions\": solutions\n",
        "    })\n",
        "\n",
        "# Create a DataFrame from the extracted data\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# Save the DataFrame to the specified CSV path\n",
        "csv_path = \"Data/first_iteration_seed_analyzed_data_updated_background.csv\"\n",
        "df.to_csv(csv_path, index=False)  # Saving without index\n",
        "\n",
        "print(f\"DataFrame saved to {csv_path}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "15b44fe7-7e5f-47bc-813c-7371323414ca",
      "metadata": {
        "tags": [],
        "id": "15b44fe7-7e5f-47bc-813c-7371323414ca"
      },
      "outputs": [],
      "source": [
        "# df.iloc[5]\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "91b1d177-7046-435a-aafe-2e4f89a24b99",
      "metadata": {
        "tags": [],
        "id": "91b1d177-7046-435a-aafe-2e4f89a24b99"
      },
      "outputs": [],
      "source": [
        "# Initialize an empty list to store the extracted data\n",
        "data = []\n",
        "\n",
        "# Loop through all the texts and extract information\n",
        "for text in iteration_1_low_ses_text:\n",
        "    # Extract background information\n",
        "    background = extract_background(text)\n",
        "\n",
        "    # Extract struggles information\n",
        "    struggles = extract_struggles(text)\n",
        "\n",
        "    # Extract solutions information\n",
        "    solutions = extract_solutions(text)\n",
        "\n",
        "    # Append the extracted information as a dictionary\n",
        "    data.append({\n",
        "        \"Text\": text,\n",
        "        \"Background\": background,\n",
        "        \"Struggles\": struggles,\n",
        "        \"Solutions\": solutions\n",
        "    })\n",
        "\n",
        "# Create a DataFrame from the extracted data\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# Save the DataFrame to the specified CSV path\n",
        "csv_path = \"Data/second_iteration_seed_analyzed_data_updated_background.csv\"\n",
        "df.to_csv(csv_path, index=False)  # Saving without index\n",
        "\n",
        "print(f\"DataFrame saved to {csv_path}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ff8fc5e4-e565-4cab-ad79-e8153489deb5",
      "metadata": {
        "tags": [],
        "id": "ff8fc5e4-e565-4cab-ad79-e8153489deb5"
      },
      "outputs": [],
      "source": [
        "# print(df.head())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7389645b-5a18-4c20-91a1-8415290bc713",
      "metadata": {
        "tags": [],
        "id": "7389645b-5a18-4c20-91a1-8415290bc713"
      },
      "outputs": [],
      "source": [
        "# Initialize an empty list to store the extracted data\n",
        "data = []\n",
        "\n",
        "# Loop through all the texts and extract information\n",
        "for text in iteration_2_SES_texts_not_tokenized:\n",
        "    # Extract background information\n",
        "    background = extract_background(text)\n",
        "\n",
        "    # Extract struggles information\n",
        "    struggles = extract_struggles(text)\n",
        "\n",
        "    # Extract solutions information\n",
        "    solutions = extract_solutions(text)\n",
        "\n",
        "    # Append the extracted information as a dictionary\n",
        "    data.append({\n",
        "        \"Text\": text,\n",
        "        \"Background\": background,\n",
        "        \"Struggles\": struggles,\n",
        "        \"Solutions\": solutions\n",
        "    })\n",
        "\n",
        "# Create a DataFrame from the extracted data\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# Save the DataFrame to the specified CSV path\n",
        "csv_path = \"Data/third_iteration_seed_analyzed_data_updated_background.csv\"\n",
        "df.to_csv(csv_path, index=False)  # Saving without index\n",
        "\n",
        "print(f\"DataFrame saved to {csv_path}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e4fd963d-3160-44e4-98a0-ec0da86ea938",
      "metadata": {
        "tags": [],
        "id": "e4fd963d-3160-44e4-98a0-ec0da86ea938"
      },
      "outputs": [],
      "source": [
        "# print(df.head())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "55350e7f-85ac-4b8a-a9e0-c7a9515693a8",
      "metadata": {
        "id": "55350e7f-85ac-4b8a-a9e0-c7a9515693a8"
      },
      "outputs": [],
      "source": [
        "print(\"done\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5f2109ae-c4cf-4dca-92a4-db70d97c11ba",
      "metadata": {
        "tags": [],
        "id": "5f2109ae-c4cf-4dca-92a4-db70d97c11ba"
      },
      "outputs": [],
      "source": [
        "# df.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "189e67a7-e5ff-4002-9bbe-61fc831bf59a",
      "metadata": {
        "tags": [],
        "id": "189e67a7-e5ff-4002-9bbe-61fc831bf59a"
      },
      "outputs": [],
      "source": [
        "df['Background'].apply(lambda x: isinstance(x, str) and x.strip() != '').sum()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ed69271d-39c8-48ba-9784-98bfa04a933b",
      "metadata": {
        "tags": [],
        "id": "ed69271d-39c8-48ba-9784-98bfa04a933b"
      },
      "outputs": [],
      "source": [
        "df['Struggles'].apply(lambda x: isinstance(x, list) and len(x) > 0).sum()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c8776145-0f9c-49ed-840e-08d0ece33b79",
      "metadata": {
        "tags": [],
        "id": "c8776145-0f9c-49ed-840e-08d0ece33b79"
      },
      "outputs": [],
      "source": [
        "df['Solutions'].apply(lambda x: isinstance(x, list) and len(x) > 0).sum()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "361775ec-ba87-4db3-87cb-1ab22004e451",
      "metadata": {
        "id": "361775ec-ba87-4db3-87cb-1ab22004e451"
      },
      "source": [
        "## iteration 3"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "547b00a5-4942-403e-b831-1ae1f288bbf2",
      "metadata": {
        "tags": [],
        "id": "547b00a5-4942-403e-b831-1ae1f288bbf2"
      },
      "outputs": [],
      "source": [
        "data_checked = pd.read_excel('Data/iteration_3collected_data_86_no_outliers_checked.xlsx')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0f65a964-7e37-4e98-9a47-447b1f0574cb",
      "metadata": {
        "tags": [],
        "id": "0f65a964-7e37-4e98-9a47-447b1f0574cb"
      },
      "outputs": [],
      "source": [
        "len(data_checked)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "255cd017-d950-4b22-b27c-9cecfcc22666",
      "metadata": {
        "tags": [],
        "id": "255cd017-d950-4b22-b27c-9cecfcc22666"
      },
      "outputs": [],
      "source": [
        "data_checked.head(1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "097ce500-6ac9-4395-b9e8-8fef2ec576ab",
      "metadata": {
        "tags": [],
        "id": "097ce500-6ac9-4395-b9e8-8fef2ec576ab"
      },
      "outputs": [],
      "source": [
        "filtered_data = data_checked[data_checked['Class'] == 1]\n",
        "data_checked_text = filtered_data['body'].tolist()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a7dcaaef-3a30-41c5-93fb-0581a5b606ad",
      "metadata": {
        "tags": [],
        "id": "a7dcaaef-3a30-41c5-93fb-0581a5b606ad"
      },
      "outputs": [],
      "source": [
        "len(data_checked_text)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e0a3ba32-3ffd-4efa-831d-8760f91d9f2a",
      "metadata": {
        "tags": [],
        "id": "e0a3ba32-3ffd-4efa-831d-8760f91d9f2a"
      },
      "outputs": [],
      "source": [
        "46/121 38%"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "902ceee7-8532-4f5e-943f-bb023eb3834e",
      "metadata": {
        "tags": [],
        "id": "902ceee7-8532-4f5e-943f-bb023eb3834e"
      },
      "outputs": [],
      "source": [
        "# Initialize an empty list to store the extracted data\n",
        "data = []\n",
        "\n",
        "# Loop through all the texts and extract information\n",
        "for text in data_checked_text:\n",
        "    # Extract background information\n",
        "    background = extract_background(text)\n",
        "\n",
        "    # Extract struggles information\n",
        "    struggles = extract_struggles(text)\n",
        "\n",
        "    # Extract solutions information\n",
        "    solutions = extract_solutions(text)\n",
        "\n",
        "    # Append the extracted information as a dictionary\n",
        "    data.append({\n",
        "        \"Text\": text,\n",
        "        \"Background\": background,\n",
        "        \"Struggles\": struggles,\n",
        "        \"Solutions\": solutions\n",
        "    })\n",
        "\n",
        "# Create a DataFrame from the extracted data\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# Save the DataFrame to the specified CSV path\n",
        "csv_path = \"Data/third_iteration_result_analyzed_data_updated_background.csv\"\n",
        "df.to_csv(csv_path, index=False)  # Saving without index\n",
        "\n",
        "print(f\"DataFrame saved to {csv_path}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "eba73da9-1233-4df8-9658-f6641697dd2d",
      "metadata": {
        "id": "eba73da9-1233-4df8-9658-f6641697dd2d"
      },
      "source": [
        "## Collect all Data into one data set"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4d8c1264-dfb6-4254-892c-bd860fec71df",
      "metadata": {
        "tags": [],
        "id": "4d8c1264-dfb6-4254-892c-bd860fec71df"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "# List of file paths\n",
        "file_paths = [\n",
        "    'Data/second_iteration_seed_analyzed_data_updated_background.csv',\n",
        "    'Data/third_iteration_seed_analyzed_data_updated_background.csv',\n",
        "    'Data/third_iteration_result_analyzed_data_updated_background.csv'\n",
        "]\n",
        "\n",
        "iterations = [1, 2, 3]\n",
        "\n",
        "dataframes = []\n",
        "\n",
        "# Loop through the files and iterations to read each file and add the 'iteration' column\n",
        "for i, file in enumerate(file_paths):\n",
        "    df = pd.read_csv(file)  # Read the CSV file\n",
        "    df['iteration'] = iterations[i]  # Add the 'iteration' column\n",
        "    dataframes.append(df)  # Append the dataframe to the list\n",
        "\n",
        "# Concatenate all dataframes into one\n",
        "concatenated_df = pd.concat(dataframes, ignore_index=True)\n",
        "\n",
        "# print(concatenated_df.head())\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e9cf9ccf-16d5-49ab-845c-acd774f0249b",
      "metadata": {
        "tags": [],
        "id": "e9cf9ccf-16d5-49ab-845c-acd774f0249b"
      },
      "outputs": [],
      "source": [
        "concatenated_df['iteration'].value_counts() # 167,110,46"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "81deeb64-07ef-4525-80a6-49fea62381a9",
      "metadata": {
        "tags": [],
        "id": "81deeb64-07ef-4525-80a6-49fea62381a9"
      },
      "outputs": [],
      "source": [
        "len(concatenated_df['Background'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1b42c472-50b8-4c3f-b3da-41af19b46f4b",
      "metadata": {
        "tags": [],
        "id": "1b42c472-50b8-4c3f-b3da-41af19b46f4b"
      },
      "outputs": [],
      "source": [
        "original_data = pd.read_csv('Data/first_iteration_seed_analyzed_data_updated_background.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7bfbb355-9b84-43b8-8305-c51d0fd43822",
      "metadata": {
        "tags": [],
        "id": "7bfbb355-9b84-43b8-8305-c51d0fd43822"
      },
      "outputs": [],
      "source": [
        "len(original_data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6322d508-4299-4e34-84a6-bfb870f616c5",
      "metadata": {
        "tags": [],
        "id": "6322d508-4299-4e34-84a6-bfb870f616c5"
      },
      "outputs": [],
      "source": [
        "# original_data.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a8adbe2f-56a0-4918-aa76-f61ff7628845",
      "metadata": {
        "tags": [],
        "id": "a8adbe2f-56a0-4918-aa76-f61ff7628845"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3679f29-14a4-48fc-a0f3-453ecc388314",
      "metadata": {
        "tags": [],
        "id": "a3679f29-14a4-48fc-a0f3-453ecc388314"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "60ce4cab-6d63-4027-afe9-0a7a963b0900",
      "metadata": {
        "id": "60ce4cab-6d63-4027-afe9-0a7a963b0900"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "myenv",
      "language": "python",
      "name": "myenv"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.14"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}