{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ce6ffe9c-309c-45d0-b065-1523947273d5",
      "metadata": {
        "tags": [],
        "id": "ce6ffe9c-309c-45d0-b065-1523947273d5"
      },
      "outputs": [],
      "source": [
        "import time\n",
        "import pandas as pd"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "74064eb7-5a44-4ebd-aee9-abea39d852fb",
      "metadata": {
        "tags": [],
        "id": "74064eb7-5a44-4ebd-aee9-abea39d852fb"
      },
      "outputs": [],
      "source": [
        "iteration_1_data = pd.read_csv(\"Data/Low_and_Not_Low_Data.csv\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9e0a53be-dc73-4ec0-8bba-050e6fc0de66",
      "metadata": {
        "tags": [],
        "id": "9e0a53be-dc73-4ec0-8bba-050e6fc0de66"
      },
      "outputs": [],
      "source": [
        "iteration_1_data.head(1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cc325fb3-52a0-4aed-a8cc-9dbd1841f5e7",
      "metadata": {
        "id": "cc325fb3-52a0-4aed-a8cc-9dbd1841f5e7"
      },
      "outputs": [],
      "source": [
        "# iteration_2_checked_data_all_fields = pd.read_csv('Data/iteration_2_checked_data_all_fields.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "13e1adbd-ac7f-4d85-bc73-d7eb594c21c0",
      "metadata": {
        "tags": [],
        "id": "13e1adbd-ac7f-4d85-bc73-d7eb594c21c0"
      },
      "outputs": [],
      "source": [
        "# iteration_2_checked_data_all_fields.head(1)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "949d4507-952c-47a3-8b54-857b133fb2d9",
      "metadata": {
        "id": "949d4507-952c-47a3-8b54-857b133fb2d9"
      },
      "source": [
        "**Prepare Data from iteration 2**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "02bc269b-432d-4c93-b259-7faaca93d177",
      "metadata": {
        "tags": [],
        "id": "02bc269b-432d-4c93-b259-7faaca93d177"
      },
      "outputs": [],
      "source": [
        "import nltk\n",
        "\n",
        "# Preprocessing text data\n",
        "nltk.download('stopwords')\n",
        "nltk.download('punkt')  # Ensure 'punkt' is downloaded\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.tokenize import word_tokenize\n",
        "stop_words = set(stopwords.words('english'))\n",
        "\n",
        "def preprocess_text(text):\n",
        "    \"\"\"Normalize raw text into a space-joined string of content words.\n",
        "\n",
        "    The text is lowercased and tokenized with NLTK's ``word_tokenize``; tokens\n",
        "    that are not purely alphabetic or that appear in ``stop_words`` are dropped.\n",
        "\n",
        "    Args:\n",
        "        text: The raw document. Non-string values (e.g. the NaN pandas uses for\n",
        "            an empty cell) are treated as an empty document.\n",
        "\n",
        "    Returns:\n",
        "        The surviving tokens joined by single spaces ('' if none survive).\n",
        "    \"\"\"\n",
        "    if not isinstance(text, str):\n",
        "        # Missing cells arrive as float NaN, which has no .lower().\n",
        "        return ''\n",
        "    return ' '.join(\n",
        "        word\n",
        "        for word in word_tokenize(text.lower())\n",
        "        if word.isalpha() and word not in stop_words\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "277233ee-0b17-42a5-8725-3260d60df472",
      "metadata": {
        "tags": [],
        "id": "277233ee-0b17-42a5-8725-3260d60df472"
      },
      "outputs": [],
      "source": [
        "import spacy\n",
        "\n",
        "# Load the spaCy English model\n",
        "nlp = spacy.load(\"en_core_web_sm\")\n",
        "\n",
        "# Function to replace names of people with PERSON and organizations with ORG\n",
        "def replace_entities(text):\n",
        "    \"\"\"Mask person and organisation names in ``text`` with their entity label.\n",
        "\n",
        "    spaCy (``nlp``) is used to find PERSON and ORG entities. Every occurrence of\n",
        "    each detected name is then replaced by the literal string \"PERSON\" or \"ORG\".\n",
        "\n",
        "    Args:\n",
        "        text: The raw document. Non-string values (e.g. pandas NaN) yield ''.\n",
        "\n",
        "    Returns:\n",
        "        The text with detected names replaced; unchanged if none were found.\n",
        "    \"\"\"\n",
        "    import re  # local import keeps this cell runnable on its own\n",
        "\n",
        "    if not isinstance(text, str):\n",
        "        return ''\n",
        "\n",
        "    doc = nlp(text)  # Apply the spaCy NLP model to the text\n",
        "\n",
        "    # Map each detected surface form to its placeholder; the first label seen wins.\n",
        "    placeholders = {}\n",
        "    for ent in doc.ents:\n",
        "        if ent.label_ in (\"PERSON\", \"ORG\"):\n",
        "            placeholders.setdefault(ent.text, ent.label_)\n",
        "    if not placeholders:\n",
        "        return text\n",
        "\n",
        "    # One regex pass instead of chained str.replace calls:\n",
        "    #  * longest names first, so \"John Smith\" is tried before \"John\";\n",
        "    #  * lookarounds stop a short name such as \"Al\" matching inside \"Also\";\n",
        "    #  * a single pass never rewrites a placeholder inserted earlier.\n",
        "    alternatives = sorted(placeholders, key=len, reverse=True)\n",
        "    pattern = re.compile(\n",
        "        \"|\".join(r\"(?<!\\w)\" + re.escape(name) + r\"(?!\\w)\" for name in alternatives)\n",
        "    )\n",
        "    return pattern.sub(lambda match: placeholders[match.group(0)], text)\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "50ee08ba-af4a-4b4a-8d6a-71176beca14e",
      "metadata": {
        "tags": [],
        "id": "50ee08ba-af4a-4b4a-8d6a-71176beca14e"
      },
      "outputs": [],
      "source": [
        "checked_iteration_2_data = pd.read_excel('Data/collected_data_no_outliers_version_2.xlsx')\n",
        "checked_iteration_2_data = checked_iteration_2_data.iloc[0:381]\n",
        "print(len(checked_iteration_2_data['Class']))\n",
        "print(sum(checked_iteration_2_data['Class']))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7c603b9d-c1da-4654-85f2-d6fb7ea65403",
      "metadata": {
        "tags": [],
        "id": "7c603b9d-c1da-4654-85f2-d6fb7ea65403"
      },
      "outputs": [],
      "source": [
        "checked_iteration_2_data.head(1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "61d26bc5-1012-4314-9472-7b0151cd3a5d",
      "metadata": {
        "tags": [],
        "id": "61d26bc5-1012-4314-9472-7b0151cd3a5d"
      },
      "outputs": [],
      "source": [
        "df = checked_iteration_2_data\n",
        "low_ses_text = df[df['Class'] ==1]['body']\n",
        "not_low_ses_text = df[df['Class'] ==0]['body']\n",
        "print(len(low_ses_text), len(not_low_ses_text))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "42498120-4f67-4e6a-9742-3a16b22b2192",
      "metadata": {
        "tags": [],
        "id": "42498120-4f67-4e6a-9742-3a16b22b2192"
      },
      "outputs": [],
      "source": [
        "# tokenize iteration 2 checked texts\n",
        "\n",
        "low_ses_tokenized_text = [preprocess_text(item) for item in low_ses_text]\n",
        "not_low_ses_tokenized_text = [preprocess_text(item) for item in not_low_ses_text]\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "15d13d51-5723-4bd4-b9bb-054ac78504c1",
      "metadata": {
        "tags": [],
        "id": "15d13d51-5723-4bd4-b9bb-054ac78504c1"
      },
      "outputs": [],
      "source": [
        "low_ses_text_pii = [replace_entities(text) for text in low_ses_text]\n",
        "not_low_ses_text_pii = [replace_entities(text) for text in not_low_ses_text]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "88c1ab84-618d-400f-a0c1-83bdd38293f8",
      "metadata": {
        "tags": [],
        "id": "88c1ab84-618d-400f-a0c1-83bdd38293f8"
      },
      "outputs": [],
      "source": [
        "print(len(low_ses_tokenized_text),len(not_low_ses_tokenized_text))\n",
        "print(len(low_ses_text_pii),len(not_low_ses_text_pii))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "278833e5-805d-4974-92e3-74023f09164b",
      "metadata": {
        "id": "278833e5-805d-4974-92e3-74023f09164b"
      },
      "outputs": [],
      "source": [
        "# Create the DataFrame by passing all columns at once\n",
        "iteration_2_prepared_data = pd.DataFrame({\n",
        "    \"Anonymized Text\": low_ses_text_pii,\n",
        "    \"Tokenized Text\": low_ses_tokenized_text,\n",
        "    \"Original Text\": low_ses_text,\n",
        "    \"Class\": [\"Low_SES\"] * len(low_ses_text_pii),  # Ensure the same length\n",
        "    \"Label\": [1] * len(low_ses_text_pii)           # Ensure the same length\n",
        "})\n",
        "\n",
        "# Display the updated DataFrame\n",
        "iteration_2_prepared_data.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "33bdad49-1efc-4cbc-b1e0-082588485f1c",
      "metadata": {
        "tags": [],
        "id": "33bdad49-1efc-4cbc-b1e0-082588485f1c"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "# iteration_2_prepared_data currently holds only the Low_SES rows built in the\n",
        "# previous cell. NOTE: re-running this cell appends the Not_Low_SES rows again.\n",
        "\n",
        "# Build the matching frame for the \"Not_Low_SES\" class\n",
        "not_low_ses_df = pd.DataFrame({\n",
        "    \"Anonymized Text\": not_low_ses_text_pii,  # entity-masked text produced by replace_entities\n",
        "    \"Tokenized Text\": not_low_ses_tokenized_text,\n",
        "    \"Original Text\": not_low_ses_text,\n",
        "    \"Class\": [\"Not_Low_SES\"] * len(not_low_ses_text),  # Set the \"Class\" column to \"Not_Low_SES\"\n",
        "    \"Label\": [0] * len(not_low_ses_text)  # Set the \"Label\" column to 0 for Not_Low_SES\n",
        "})\n",
        "\n",
        "# Append the new DataFrame to the Low_SES rows\n",
        "iteration_2_prepared_data = pd.concat([iteration_2_prepared_data, not_low_ses_df], ignore_index=True)\n",
        "\n",
        "# Only the last expression of a cell is rendered, so head() below shows nothing;\n",
        "# tail() is what appears in the output.\n",
        "iteration_2_prepared_data.head()  # To check the first few rows\n",
        "iteration_2_prepared_data.tail()  # To check the last few rows to ensure the new rows were added\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "48105fdd-c281-44db-b4d0-aef30af2e9b6",
      "metadata": {
        "tags": [],
        "id": "48105fdd-c281-44db-b4d0-aef30af2e9b6"
      },
      "outputs": [],
      "source": [
        "iteration_2_prepared_data['Class'].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d0025c0a-4faf-4dac-8c1d-1ddc3c721e2f",
      "metadata": {
        "tags": [],
        "id": "d0025c0a-4faf-4dac-8c1d-1ddc3c721e2f"
      },
      "outputs": [],
      "source": [
        "iteration_2_prepared_data.to_csv('Data/Low_and_Not_Low_Data_Second_Itertation.csv',index=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "18a00f60-f6bc-444b-9708-fb3e24775887",
      "metadata": {
        "tags": [],
        "id": "18a00f60-f6bc-444b-9708-fb3e24775887"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "path = \"Data/GroundTruthFilter/\"\n",
        "# os.listdir returns entries in arbitrary order; sort so the corpus order\n",
        "# (and everything derived from it) is the same on every machine and run.\n",
        "# Hidden files (names starting with '.') are skipped.\n",
        "files = sorted(file for file in os.listdir(path) if not file.startswith('.'))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e3c03a9c-f118-4004-a794-f623930b9b20",
      "metadata": {
        "tags": [],
        "id": "e3c03a9c-f118-4004-a794-f623930b9b20"
      },
      "outputs": [],
      "source": [
        "def _read_text(file_path):\n",
        "    \"\"\"Return the full contents of the text file at ``file_path``.\"\"\"\n",
        "    with open(file_path, 'r') as handle:\n",
        "        return handle.read()\n",
        "\n",
        "\n",
        "# One entry per listed file, in the same order as `files`.\n",
        "ground_truth_texts = [_read_text(path + name) for name in files]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0d0a1430-0d53-4913-92dd-79a711652e27",
      "metadata": {
        "tags": [],
        "id": "0d0a1430-0d53-4913-92dd-79a711652e27"
      },
      "outputs": [],
      "source": [
        "import nltk\n",
        "# Preprocessing text data\n",
        "nltk.download('stopwords')\n",
        "nltk.download('punkt')  # Ensure 'punkt' is downloaded\n",
        "from nltk.corpus import stopwords\n",
        "from nltk.tokenize import word_tokenize\n",
        "stop_words = set(stopwords.words('english'))\n",
        "\n",
        "def preprocess_text(text):\n",
        "    \"\"\"Normalize raw text into a space-joined string of content words.\n",
        "\n",
        "    The text is lowercased and tokenized with NLTK's ``word_tokenize``; tokens\n",
        "    that are not purely alphabetic or that appear in ``stop_words`` are dropped.\n",
        "\n",
        "    Args:\n",
        "        text: The raw document. Non-string values (e.g. the NaN pandas uses for\n",
        "            an empty cell) are treated as an empty document.\n",
        "\n",
        "    Returns:\n",
        "        The surviving tokens joined by single spaces ('' if none survive).\n",
        "    \"\"\"\n",
        "    if not isinstance(text, str):\n",
        "        # Missing cells arrive as float NaN, which has no .lower().\n",
        "        return ''\n",
        "    return ' '.join(\n",
        "        word\n",
        "        for word in word_tokenize(text.lower())\n",
        "        if word.isalpha() and word not in stop_words\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5cb08cd0-e2cb-45f8-aee5-d8619c1462d8",
      "metadata": {
        "tags": [],
        "id": "5cb08cd0-e2cb-45f8-aee5-d8619c1462d8"
      },
      "outputs": [],
      "source": [
        "ground_truth_texts= [preprocess_text(item) for item in ground_truth_texts]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8e1ac20a-d907-4bb4-b767-dfab59f0e342",
      "metadata": {
        "tags": [],
        "id": "8e1ac20a-d907-4bb4-b767-dfab59f0e342"
      },
      "outputs": [],
      "source": [
        "# Read every line of the not-low-SES corpus. The context manager closes the\n",
        "# file even if reading raises, which the manual open()/close() pair did not.\n",
        "with open(\"Data/not-low-ses.txt\", \"r\") as my_file:\n",
        "    data = my_file.readlines()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "fa1ef0c4-ed74-434e-9ab9-2d0e68df4390",
      "metadata": {
        "tags": [],
        "id": "fa1ef0c4-ed74-434e-9ab9-2d0e68df4390"
      },
      "outputs": [],
      "source": [
        "# Only lines longer than 100 characters qualify; keep the first 64 of them\n",
        "# (fewer if the file does not contain that many).\n",
        "long_lines = [line for line in data if len(line) > 100]\n",
        "not_low_ses = long_lines[:64]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3a41b0b-a074-4b10-9493-b62039742e78",
      "metadata": {
        "tags": [],
        "id": "a3a41b0b-a074-4b10-9493-b62039742e78"
      },
      "outputs": [],
      "source": [
        "len(not_low_ses)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c8dd3fad-aaea-4276-98fa-6ac6031c3b88",
      "metadata": {
        "tags": [],
        "id": "c8dd3fad-aaea-4276-98fa-6ac6031c3b88"
      },
      "outputs": [],
      "source": [
        "not_low_ses_texts= [preprocess_text(item) for item in not_low_ses]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0ece11cc-8f7f-49d5-82f8-08beefd8a526",
      "metadata": {
        "id": "0ece11cc-8f7f-49d5-82f8-08beefd8a526"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5fcb602d-8768-4707-85a3-d23d36779058",
      "metadata": {
        "tags": [],
        "id": "5fcb602d-8768-4707-85a3-d23d36779058"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "# Create the DataFrame by passing all columns at once\n",
        "df = pd.DataFrame({\n",
        "    \"Tokenized Text\": ground_truth_texts,\n",
        "    \"Label\": [1] * len(ground_truth_texts)           # Ensure the same length\n",
        "})\n",
        "\n",
        "# Display the updated DataFrame\n",
        "df.head()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5e01dcd9-d6ec-47e5-afb5-9844c7cb9133",
      "metadata": {
        "tags": [],
        "id": "5e01dcd9-d6ec-47e5-afb5-9844c7cb9133"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "# df currently holds only the label-1 ground-truth rows built in the previous cell.\n",
        "# NOTE: re-running this cell appends the label-0 rows again.\n",
        "\n",
        "# Create a new DataFrame for the \"Not_Low_SES\" class\n",
        "not_low_ses_df = pd.DataFrame({\n",
        "    \"Tokenized Text\": not_low_ses_texts,\n",
        "    \"Label\": [0] * len(not_low_ses_texts)  # Set the \"Label\" column to 0 for Not_Low_SES\n",
        "})\n",
        "\n",
        "# Append the new DataFrame to the label-1 rows\n",
        "df = pd.concat([df, not_low_ses_df], ignore_index=True)\n",
        "\n",
        "# Only the last expression of a cell is rendered, so head() below shows nothing;\n",
        "# tail() is what appears in the output.\n",
        "df.head()  # To check the first few rows\n",
        "df.tail()  # To check the last few rows to ensure the new rows were added\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a6d88738-bb6b-46f1-8560-b4f0750e7237",
      "metadata": {
        "tags": [],
        "id": "a6d88738-bb6b-46f1-8560-b4f0750e7237"
      },
      "outputs": [],
      "source": [
        "df['Label'].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ce811c83-1e7f-46c7-ab3c-c4143b7ff772",
      "metadata": {
        "tags": [],
        "id": "ce811c83-1e7f-46c7-ab3c-c4143b7ff772"
      },
      "outputs": [],
      "source": [
        "# Shuffle the rows. The shuffle is seeded so the row order, and therefore every\n",
        "# downstream split and score, is reproducible across kernel restarts.\n",
        "data = df.sample(frac=1, random_state=46).reset_index(drop=True)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cd87686f-2f3f-46e1-a70c-20183d2add7b",
      "metadata": {
        "tags": [],
        "id": "cd87686f-2f3f-46e1-a70c-20183d2add7b"
      },
      "outputs": [],
      "source": [
        "data.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "a2d84422-80ec-4933-82fe-0f4099e6d393",
      "metadata": {
        "tags": [],
        "id": "a2d84422-80ec-4933-82fe-0f4099e6d393"
      },
      "source": [
        "# **First Filtration**"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "87285039-5a25-44a8-af89-bfe8ddc523c9",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "87285039-5a25-44a8-af89-bfe8ddc523c9"
      },
      "source": [
        "## **Zeroshot bart-large-mnli f1=80**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3db3292e-b341-4da8-b964-167232e8444c",
      "metadata": {
        "tags": [],
        "id": "3db3292e-b341-4da8-b964-167232e8444c"
      },
      "outputs": [],
      "source": [
        "from transformers import pipeline\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "\n",
        "\n",
        "# Name the checkpoint explicitly instead of relying on the library's default for\n",
        "# this task, so results stay tied to the model named in the section heading.\n",
        "# device=0 selects the first GPU.\n",
        "pipe = pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\", device=0)\n",
        "all_labels = [\"not from low socio economic status\",\"from low socio economic status\"]\n",
        "# Hold out 15% of the rows as a test set, stratified by label.\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "\n",
        "def zero_shot_pipeline(text, predicted_labels, scores):\n",
        "    \"\"\"Classify ``text`` against ``all_labels`` and record the outcome.\n",
        "\n",
        "    The ``labels`` entry of the pipeline output is appended to\n",
        "    ``predicted_labels`` and the ``scores`` entry to ``scores``. Both lists are\n",
        "    modified in place and nothing is returned.\n",
        "    \"\"\"\n",
        "    result = pipe(text, all_labels, multi_label=False)\n",
        "    labels_out, scores_out = result[\"labels\"], result[\"scores\"]\n",
        "    predicted_labels.append(labels_out)\n",
        "    scores.append(scores_out)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "334cc489-9e78-4cba-8b05-11fc5ddb9b49",
      "metadata": {
        "tags": [],
        "id": "334cc489-9e78-4cba-8b05-11fc5ddb9b49"
      },
      "outputs": [],
      "source": [
        "test_texts= test_data['Tokenized Text'].tolist()\n",
        "predicted_labels = []\n",
        "scores = []\n",
        "\n",
        "for text in test_texts:\n",
        "    zero_shot_pipeline(text, predicted_labels, scores)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5a39c499-5d43-4221-913a-69e687ebea6f",
      "metadata": {
        "tags": [],
        "id": "5a39c499-5d43-4221-913a-69e687ebea6f"
      },
      "outputs": [],
      "source": [
        "# Imported here because this cell runs before the later cells that import it;\n",
        "# without it a fresh Restart & Run All fails with NameError.\n",
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Predict the positive class (1) only when the first returned label is the\n",
        "# low-SES label and its score exceeds 0.7; everything else is class 0.\n",
        "preds = [\n",
        "    1 if labels[0] == \"from low socio economic status\" and label_scores[0] > 0.7 else 0\n",
        "    for labels, label_scores in zip(predicted_labels, scores)\n",
        "]\n",
        "\n",
        "label_map = {0: 'not from low socio economic status', 1: 'from low socio economic status'}\n",
        "# target_names is passed as a concrete list rather than a dict view.\n",
        "clf_report = classification_report(test_data['Label'].tolist(), preds, target_names=list(label_map.values()))\n",
        "print(clf_report)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "81305ac7-356c-47b5-8f38-34669567bb74",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "81305ac7-356c-47b5-8f38-34669567bb74"
      },
      "source": [
        "## **Logistic Regression f1=60**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "66bcec2d-b3e1-49a3-addb-480ae852f743",
      "metadata": {
        "tags": [],
        "id": "66bcec2d-b3e1-49a3-addb-480ae852f743"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5b2a252a-8fa2-4c55-a863-d75a0fda9379",
      "metadata": {
        "tags": [],
        "id": "5b2a252a-8fa2-4c55-a863-d75a0fda9379"
      },
      "outputs": [],
      "source": [
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import numpy as np\n",
        "import joblib\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# TF-IDF vectorizer setup\n",
        "# MAX_FEATURES = 5000  # You can adjust this based on your dataset\n",
        "MAX_FEATURES = 1000  # You can try smaller numbers like 1000 or even less\n",
        "\n",
        "vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)\n",
        "\n",
        "# Fit TF-IDF on the training/validation data\n",
        "train_val_texts = train_val_data['Tokenized Text'].values\n",
        "test_texts = test_data['Tokenized Text'].values\n",
        "\n",
        "X_train_val = vectorizer.fit_transform(train_val_texts)\n",
        "X_test = vectorizer.transform(test_texts)\n",
        "\n",
        "y_train_val = train_val_data['Label'].values\n",
        "y_test = test_data['Label'].values\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Initialize Logistic Regression model\n",
        "# model = LogisticRegression(max_iter=1000, random_state=46)  # Adjust max_iter if needed for convergence\n",
        "\n",
        "# Adjust MAX_FEATURES to reduce memory usage\n",
        "\n",
        "# Initialize Logistic Regression model with reduced iterations\n",
        "model = LogisticRegression(random_state=46, max_iter=500)  # Reduce max_iter to a lower number\n",
        "\n",
        "# Track the best fold. get_params() returns only constructor hyperparameters,\n",
        "# which are the same for every fold, so a copy of the fitted estimator is kept.\n",
        "import copy\n",
        "\n",
        "best_f1 = -1.0  # below any attainable F1, so the first fold is always recorded\n",
        "best_model = None\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(X_train_val, y_train_val)):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    X_train, X_val = X_train_val[train_index], X_train_val[val_index]\n",
        "    y_train, y_val = y_train_val[train_index], y_train_val[val_index]\n",
        "\n",
        "    # Train the Logistic Regression model (each fit() call refits the same object)\n",
        "    model.fit(X_train, y_train)\n",
        "\n",
        "    # Predict on validation data\n",
        "    y_val_pred = model.predict(X_val)\n",
        "\n",
        "    # Calculate metrics\n",
        "    accuracy = accuracy_score(y_val, y_val_pred)\n",
        "    precision = precision_score(y_val, y_val_pred)\n",
        "    recall = recall_score(y_val, y_val_pred)\n",
        "    f1 = f1_score(y_val, y_val_pred)\n",
        "\n",
        "    print(f\"Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\")\n",
        "\n",
        "    # Snapshot the fitted estimator when it beats the best validation F1 so far\n",
        "    if f1 > best_f1:\n",
        "        best_f1 = f1\n",
        "        best_model = copy.deepcopy(model)\n",
        "\n",
        "# Evaluate the best fold's model on the test set. Rebinding `model` means the\n",
        "# save step below persists this estimator rather than the last fold's.\n",
        "model = best_model\n",
        "y_test_pred = model.predict(X_test)\n",
        "test_f1 = f1_score(y_test, y_test_pred)\n",
        "\n",
        "print(f\"\\nBest Logistic Regression model with F1 on test set: {test_f1:.4f}\")\n",
        "\n",
        "# Print classification report for the test set\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(y_test, y_test_pred, target_names=['Class 0', 'Class 1']))\n",
        "\n",
        "# Save the best Logistic Regression model\n",
        "model_save_path = 'Data/ThirdIterationModels/best_LogReg_Model.joblib'\n",
        "joblib.dump(model, model_save_path)\n",
        "print(f\"Best Logistic Regression model saved to {model_save_path}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "f5ed25cd-8dd4-4474-a0c3-2556ac691e1c",
      "metadata": {
        "tags": [],
        "id": "f5ed25cd-8dd4-4474-a0c3-2556ac691e1c"
      },
      "source": [
        "## **SVM f1=60**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "478e4f77-1ff6-425d-8694-a1324898e0af",
      "metadata": {
        "tags": [],
        "id": "478e4f77-1ff6-425d-8694-a1324898e0af"
      },
      "outputs": [],
      "source": [
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import numpy as np\n",
        "import joblib\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# TF-IDF vectorizer setup\n",
        "MAX_FEATURES = 5000  # You can adjust this based on your dataset\n",
        "vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)\n",
        "\n",
        "# Fit TF-IDF on the training/validation data\n",
        "train_val_texts = train_val_data['Tokenized Text'].values\n",
        "test_texts = test_data['Tokenized Text'].values\n",
        "\n",
        "X_train_val = vectorizer.fit_transform(train_val_texts)\n",
        "X_test = vectorizer.transform(test_texts)\n",
        "\n",
        "y_train_val = train_val_data['Label'].values\n",
        "y_test = test_data['Label'].values\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Initialize SVM model (using 'linear' kernel as default, but you can also try 'rbf', etc.)\n",
        "model = SVC(kernel='linear', probability=True, random_state=46)  # You can change the kernel to 'rbf', 'poly', etc.\n",
        "\n",
        "# Track the best fold. get_params() returns only constructor hyperparameters,\n",
        "# which are the same for every fold, so a copy of the fitted estimator is kept.\n",
        "import copy\n",
        "\n",
        "best_f1 = -1.0  # below any attainable F1, so the first fold is always recorded\n",
        "best_model = None\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(X_train_val, y_train_val)):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    X_train, X_val = X_train_val[train_index], X_train_val[val_index]\n",
        "    y_train, y_val = y_train_val[train_index], y_train_val[val_index]\n",
        "\n",
        "    # Train the SVM model (each fit() call refits the same object)\n",
        "    model.fit(X_train, y_train)\n",
        "\n",
        "    # Predict on validation data\n",
        "    y_val_pred = model.predict(X_val)\n",
        "\n",
        "    # Calculate metrics\n",
        "    accuracy = accuracy_score(y_val, y_val_pred)\n",
        "    precision = precision_score(y_val, y_val_pred)\n",
        "    recall = recall_score(y_val, y_val_pred)\n",
        "    f1 = f1_score(y_val, y_val_pred)\n",
        "\n",
        "    print(f\"Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\")\n",
        "\n",
        "    # Snapshot the fitted estimator when it beats the best validation F1 so far\n",
        "    if f1 > best_f1:\n",
        "        best_f1 = f1\n",
        "        best_model = copy.deepcopy(model)\n",
        "\n",
        "# Evaluate the best fold's model on the test set. Rebinding `model` means the\n",
        "# save step below persists this estimator rather than the last fold's.\n",
        "model = best_model\n",
        "y_test_pred = model.predict(X_test)\n",
        "test_f1 = f1_score(y_test, y_test_pred)\n",
        "\n",
        "print(f\"\\nBest SVM model with F1 on test set: {test_f1:.4f}\")\n",
        "\n",
        "# Print classification report for the test set\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(y_test, y_test_pred, target_names=['Class 0', 'Class 1']))\n",
        "\n",
        "# Save the best SVM model\n",
        "model_save_path = 'Data/ThirdIterationModels/best_SVM_Model.joblib'\n",
        "joblib.dump(model, model_save_path)\n",
        "print(f\"Best SVM model saved to {model_save_path}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d2ff87e7-0eb2-4857-a223-b7b535d816fc",
      "metadata": {
        "tags": [],
        "id": "d2ff87e7-0eb2-4857-a223-b7b535d816fc"
      },
      "outputs": [],
      "source": [
        "# Save the fitted TF-IDF vectorizer\n",
        "vectorizer_save_path = 'Data/ThirdIterationModels/tfidf_SVM_vectorizer.joblib'\n",
        "joblib.dump(vectorizer, vectorizer_save_path)\n",
        "print(f\"TF-IDF vectorizer saved to {vectorizer_save_path}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "95f0f938-8f17-40c7-8157-b851ab2e86ed",
      "metadata": {
        "id": "95f0f938-8f17-40c7-8157-b851ab2e86ed"
      },
      "outputs": [],
      "source": [
        "# Notes (kept as comments so this code cell does not raise a SyntaxError):\n",
        "# first 64 vs 64\n",
        "#\n",
        "# 70 15 15\n",
        "#\n",
        "# zero shot LLM\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "c46a6588-3cd2-4c71-88cd-6b93dc55b513",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "c46a6588-3cd2-4c71-88cd-6b93dc55b513"
      },
      "source": [
        "## **GradientBoosting f1=64**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "93dd1812-ae97-40c5-ab7d-860de1fd0835",
      "metadata": {
        "tags": [],
        "id": "93dd1812-ae97-40c5-ab7d-860de1fd0835"
      },
      "outputs": [],
      "source": [
        "from sklearn.base import clone\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.ensemble import GradientBoostingClassifier\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import numpy as np\n",
        "import joblib\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# TF-IDF vectorizer setup\n",
        "MAX_FEATURES = 5000  # You can adjust this based on your dataset\n",
        "vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)\n",
        "\n",
        "# Fit TF-IDF on the training/validation texts only; the test texts reuse that fitted vocabulary\n",
        "train_val_texts = train_val_data['Tokenized Text'].values\n",
        "test_texts = test_data['Tokenized Text'].values\n",
        "\n",
        "X_train_val = vectorizer.fit_transform(train_val_texts)\n",
        "X_test = vectorizer.transform(test_texts)\n",
        "\n",
        "y_train_val = train_val_data['Label'].values\n",
        "y_test = test_data['Label'].values\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Unfitted template: each fold fits its own clone, so the estimator fitted on the best fold can be kept.\n",
        "# (get_params()/set_params() only carry hyperparameters, never the fitted trees.)\n",
        "base_model = GradientBoostingClassifier(random_state=46)  # You can tune parameters like n_estimators and learning_rate\n",
        "\n",
        "# Track the best fitted model across folds\n",
        "best_f1 = -1.0  # below any possible F1, so a fold is selected even if every fold scores 0\n",
        "best_fold = -1\n",
        "best_model = None\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(X_train_val, y_train_val)):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    X_train, X_val = X_train_val[train_index], X_train_val[val_index]\n",
        "    y_train, y_val = y_train_val[train_index], y_train_val[val_index]\n",
        "\n",
        "    # Train a fresh Gradient Boosting model for this fold\n",
        "    fold_model = clone(base_model)\n",
        "    fold_model.fit(X_train, y_train)\n",
        "\n",
        "    # Predict on validation data\n",
        "    y_val_pred = fold_model.predict(X_val)\n",
        "\n",
        "    # Calculate metrics\n",
        "    accuracy = accuracy_score(y_val, y_val_pred)\n",
        "    precision = precision_score(y_val, y_val_pred)\n",
        "    recall = recall_score(y_val, y_val_pred)\n",
        "    f1 = f1_score(y_val, y_val_pred)\n",
        "\n",
        "    print(f\"Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\")\n",
        "\n",
        "    # Keep the fitted estimator of the fold with the highest validation F1\n",
        "    if f1 > best_f1:\n",
        "        best_f1 = f1\n",
        "        best_fold = fold + 1\n",
        "        best_model = fold_model\n",
        "\n",
        "# Evaluate the best fold's fitted model on the test set.\n",
        "# `model` is rebound to it so that `model` names the selected estimator from here on.\n",
        "model = best_model\n",
        "print(f\"\\nSelected fold {best_fold} (validation F1: {best_f1:.4f})\")\n",
        "y_test_pred = model.predict(X_test)\n",
        "test_f1 = f1_score(y_test, y_test_pred)\n",
        "\n",
        "print(f\"\\nBest Gradient Boosting model with F1 on test set: {test_f1:.4f}\")\n",
        "\n",
        "# Print classification report for the test set\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(y_test, y_test_pred, target_names=['Class 0', 'Class 1']))\n",
        "\n",
        "# Save the best Gradient Boosting model\n",
        "model_save_path = 'Data/ThirdIterationModels/best_GB_Model.joblib'\n",
        "joblib.dump(model, model_save_path)\n",
        "print(f\"Best Gradient Boosting model saved to {model_save_path}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "b862f604-2ba1-42b1-a7f1-5de7305e8d5c",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "b862f604-2ba1-42b1-a7f1-5de7305e8d5c"
      },
      "source": [
        "## **Random Forest f1=54**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "bf7f5ac0-ef73-4454-9fd8-b3dd5f32c55a",
      "metadata": {
        "tags": [],
        "id": "bf7f5ac0-ef73-4454-9fd8-b3dd5f32c55a"
      },
      "outputs": [],
      "source": [
        "from sklearn.base import clone\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import numpy as np\n",
        "import joblib\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# TF-IDF vectorizer setup\n",
        "MAX_FEATURES = 5000  # You can adjust this based on your dataset\n",
        "vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)\n",
        "\n",
        "# Fit TF-IDF on the training/validation texts only; the test texts reuse that fitted vocabulary\n",
        "train_val_texts = train_val_data['Tokenized Text'].values\n",
        "test_texts = test_data['Tokenized Text'].values\n",
        "\n",
        "X_train_val = vectorizer.fit_transform(train_val_texts)\n",
        "X_test = vectorizer.transform(test_texts)\n",
        "\n",
        "y_train_val = train_val_data['Label'].values\n",
        "y_test = test_data['Label'].values\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Unfitted template: each fold fits its own clone, so the estimator fitted on the best fold can be kept.\n",
        "# (get_params()/set_params() only carry hyperparameters, never the fitted trees.)\n",
        "base_model = RandomForestClassifier(n_estimators=100, random_state=46)  # You can adjust the number of trees (n_estimators)\n",
        "\n",
        "# Track the best fitted model across folds\n",
        "best_f1 = -1.0  # below any possible F1, so a fold is selected even if every fold scores 0\n",
        "best_fold = -1\n",
        "best_model = None\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(X_train_val, y_train_val)):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    X_train, X_val = X_train_val[train_index], X_train_val[val_index]\n",
        "    y_train, y_val = y_train_val[train_index], y_train_val[val_index]\n",
        "\n",
        "    # Train a fresh Random Forest model for this fold\n",
        "    fold_model = clone(base_model)\n",
        "    fold_model.fit(X_train, y_train)\n",
        "\n",
        "    # Predict on validation data\n",
        "    y_val_pred = fold_model.predict(X_val)\n",
        "\n",
        "    # Calculate metrics\n",
        "    accuracy = accuracy_score(y_val, y_val_pred)\n",
        "    precision = precision_score(y_val, y_val_pred)\n",
        "    recall = recall_score(y_val, y_val_pred)\n",
        "    f1 = f1_score(y_val, y_val_pred)\n",
        "\n",
        "    print(f\"Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\")\n",
        "\n",
        "    # Keep the fitted estimator of the fold with the highest validation F1\n",
        "    if f1 > best_f1:\n",
        "        best_f1 = f1\n",
        "        best_fold = fold + 1\n",
        "        best_model = fold_model\n",
        "\n",
        "# Evaluate the best fold's fitted model on the test set.\n",
        "# `model` is rebound to it so that `model` names the selected estimator from here on.\n",
        "model = best_model\n",
        "print(f\"\\nSelected fold {best_fold} (validation F1: {best_f1:.4f})\")\n",
        "y_test_pred = model.predict(X_test)\n",
        "test_f1 = f1_score(y_test, y_test_pred)\n",
        "\n",
        "print(f\"\\nBest Random Forest model with F1 on test set: {test_f1:.4f}\")\n",
        "\n",
        "# Print classification report for the test set\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(y_test, y_test_pred, target_names=['Class 0', 'Class 1']))\n",
        "\n",
        "# Save the best Random Forest model\n",
        "model_save_path = 'Data/ThirdIterationModels/best_RF_Model.joblib'\n",
        "joblib.dump(model, model_save_path)\n",
        "print(f\"Best Random Forest model saved to {model_save_path}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "6d42a8cd-44fd-4f80-ab60-4545c47be206",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "6d42a8cd-44fd-4f80-ab60-4545c47be206"
      },
      "source": [
        "## **Naive Bayes f1=67**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d8c5a169-f02c-44c7-b6be-ebe5c5963c78",
      "metadata": {
        "tags": [],
        "id": "d8c5a169-f02c-44c7-b6be-ebe5c5963c78"
      },
      "outputs": [],
      "source": [
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import numpy as np\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# TF-IDF vectorizer setup\n",
        "MAX_FEATURES = 5000  # You can adjust this based on your dataset\n",
        "vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)\n",
        "\n",
        "# Fit TF-IDF on the training/validation texts only; the test texts reuse that fitted vocabulary\n",
        "train_val_texts = train_val_data['Tokenized Text'].values\n",
        "test_texts = test_data['Tokenized Text'].values\n",
        "\n",
        "X_train_val = vectorizer.fit_transform(train_val_texts)\n",
        "X_test = vectorizer.transform(test_texts)\n",
        "\n",
        "y_train_val = train_val_data['Label'].values\n",
        "y_test = test_data['Label'].values\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best fitted model, its fold number and its validation F1\n",
        "best_model = None\n",
        "best_fold = -1\n",
        "best_f1 = -1.0  # below any possible F1, so a fold is selected even if every fold scores 0\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(X_train_val, y_train_val)):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    X_train, X_val = X_train_val[train_index], X_train_val[val_index]\n",
        "    y_train, y_val = y_train_val[train_index], y_train_val[val_index]\n",
        "\n",
        "    # A new estimator per fold, so each fold's fitted model is an independent object\n",
        "    model = MultinomialNB()\n",
        "\n",
        "    # Train the model\n",
        "    model.fit(X_train, y_train)\n",
        "\n",
        "    # Evaluate on validation data\n",
        "    y_val_pred = model.predict(X_val)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(y_val, y_val_pred)\n",
        "    precision = precision_score(y_val, y_val_pred)\n",
        "    recall = recall_score(y_val, y_val_pred)\n",
        "    f1 = f1_score(y_val, y_val_pred)\n",
        "\n",
        "    print(f\"Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\")\n",
        "\n",
        "    # Keep track of the best model based on F1 score\n",
        "    if f1 > best_f1:\n",
        "        best_f1 = f1\n",
        "        best_fold = fold + 1\n",
        "        best_model = model\n",
        "\n",
        "# Evaluate the best model on the test set\n",
        "print(f\"\\nSelected fold {best_fold} (validation F1: {best_f1:.4f})\")\n",
        "y_test_pred = best_model.predict(X_test)\n",
        "\n",
        "# Print the classification report on the test set\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(y_test, y_test_pred, target_names=['Class 0', 'Class 1']))\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b96f5d04-8c8d-466d-8120-2141b1c1e0ec",
      "metadata": {
        "tags": [],
        "id": "b96f5d04-8c8d-466d-8120-2141b1c1e0ec"
      },
      "outputs": [],
      "source": [
        "import joblib\n",
        "\n",
        "# Persist `best_model` (the estimator kept by the Naive Bayes cross-validation cell) with joblib.\n",
        "# NOTE(review): joblib.dump does not create missing directories - confirm Data/ThirdIterationModels/ exists.\n",
        "model_save_path = 'Data/ThirdIterationModels/best_NV_Model.joblib'\n",
        "joblib.dump(best_model, model_save_path)\n",
        "\n",
        "print(f\"Best Naive Bayes model saved to {model_save_path}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "20becee3-249f-4cd5-8af8-6e2ecadb6ef1",
      "metadata": {
        "id": "20becee3-249f-4cd5-8af8-6e2ecadb6ef1"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "markdown",
      "id": "38d1240d-3d3f-4076-b3ed-faaa83b104a2",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "38d1240d-3d3f-4076-b3ed-faaa83b104a2"
      },
      "source": [
        "## **Roberta f1 = 75**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3e38ebdf-163b-4555-9271-d4b9a102151e",
      "metadata": {
        "tags": [],
        "id": "3e38ebdf-163b-4555-9271-d4b9a102151e"
      },
      "outputs": [],
      "source": [
        "# torch, nn and DataLoader are used by this RoBERTa section; importing them here keeps the section\n",
        "# runnable on its own instead of relying on an import made by some other cell.\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "from torch.utils.data import DataLoader\n",
        "from transformers import RobertaTokenizer, RobertaModel\n",
        "\n",
        "# Model and tokenizer setup\n",
        "MAX_LEN = 512  # every example is padded/truncated to this many tokens\n",
        "TRAIN_BATCH_SIZE = 32\n",
        "VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40  # upper bound per fold; training may stop earlier via early stopping\n",
        "LEARNING_RATE = 1e-05\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\n",
        "\n",
        "class RoBERTaClass(torch.nn.Module):\n",
        "    \"\"\"roberta-base encoder followed by dropout and a linear head that emits one value per example.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.roberta_model = RobertaModel.from_pretrained('roberta-base', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(0.3)\n",
        "        # 768 input features -> 1 output unit (single value for binary classification)\n",
        "        self.linear = torch.nn.Linear(768, 1)\n",
        "\n",
        "    def forward(self, input_ids, attn_mask, token_type_ids):\n",
        "        \"\"\"Encode the batch and return the linear head's output for its pooled representation.\"\"\"\n",
        "        encoded = self.roberta_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask,\n",
        "            token_type_ids=token_type_ids,\n",
        "        )\n",
        "        pooled = self.dropout(encoded.pooler_output)\n",
        "        return self.linear(pooled)\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Dataset that tokenizes one DataFrame row per item for binary classification.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame providing a 'Tokenized Text' column and a 'Label' column.\n",
        "        tokenizer: object exposing `encode_plus` (this notebook passes a RobertaTokenizer).\n",
        "        max_len: token length each example is padded or truncated to.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        \"\"\"Return a dict with 1-D 'input_ids', 'attention_mask', 'token_type_ids' and a float 'targets' tensor.\"\"\"\n",
        "        # Positional lookup: `index` is a position in 0..len-1, which equals the index label\n",
        "        # only when the frame was reset_index()'d; .iloc is correct for any index.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            return_token_type_ids=True,\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'token_type_ids': inputs['token_type_ids'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # Use FloatTensor for binary classification\n",
        "        }"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5730f295-7cfd-4eaf-adb6-30508de1452b",
      "metadata": {
        "tags": [],
        "id": "5730f295-7cfd-4eaf-adb6-30508de1452b"
      },
      "outputs": [],
      "source": [
        "\n",
        "\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Run `model` over `data_loader` without gradients and score its thresholded predictions.\n",
        "\n",
        "    Returns:\n",
        "        (accuracy, precision, recall, f1, all_labels, all_preds); the last two are NumPy arrays\n",
        "        holding the targets and the rounded sigmoid outputs for every example.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a 1-D batch axis even when the final batch holds a single example;\n",
        "            # a bare squeeze() would produce 0-d tensors that list.extend() cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "# Create test dataset and loader (15% test set)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best model state, its fold number and the lowest validation loss across folds\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Checkpoint file that always holds this fold's lowest-validation-loss weights\n",
        "    fold_checkpoint_path = f'Data/Roberta_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Custom model for binary classification\n",
        "    model = RoBERTaClass()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Training loop for this fold\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a 1-D batch axis even for a final batch of one example;\n",
        "            # a bare squeeze() would give 0-d tensors and targets.size(0) would raise.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                token_type_ids = batch['token_type_ids'].to(device)\n",
        "                targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "                outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS},Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), fold_checkpoint_path)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        # Reload the best-epoch checkpoint: by now `model` holds the weights of the last epoch run\n",
        "        # (up to `patience` epochs past the best one), not the weights that achieved best_valid_loss.\n",
        "        # Loading onto the CPU avoids keeping an extra copy of the weights on the GPU between folds.\n",
        "        best_model_state = torch.load(fold_checkpoint_path, map_location='cpu')\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/Roberta_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d81790d5-873c-42db-9b5a-a9a92b28e0b1",
      "metadata": {
        "tags": [],
        "id": "d81790d5-873c-42db-9b5a-a9a92b28e0b1"
      },
      "outputs": [],
      "source": [
        "# Load the best model and evaluate on the test set\n",
        "# map_location lets a checkpoint written from GPU tensors load on a CPU-only machine as well\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Roberta_Best_Fold.pth', map_location=device))\n",
        "model.to(device)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6f3600cc-663e-49e2-903f-088dfed45d2c",
      "metadata": {
        "tags": [],
        "id": "6f3600cc-663e-49e2-903f-088dfed45d2c"
      },
      "outputs": [],
      "source": [
        "\n",
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Load the saved model into a freshly constructed network\n",
        "# map_location lets a checkpoint written from GPU tensors load on a CPU-only machine as well\n",
        "model = RoBERTaClass()\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Roberta_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Collect labels and predictions for whichever loader is passed in\n",
        "def evaluate_on_validation(model, test_loader):\n",
        "    \"\"\"Run `model` over `test_loader` without gradients.\n",
        "\n",
        "    Returns:\n",
        "        (labels, predictions) as NumPy arrays; predictions are the rounded sigmoid outputs.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a 1-D batch axis even when the final batch holds a single example;\n",
        "            # a bare squeeze() would produce 0-d tensors that list.extend() cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Evaluate on the held-out test loader (the helper is named for validation, but test_loader is what is passed)\n",
        "true_labels, predicted_labels = evaluate_on_validation(model, test_loader)\n",
        "\n",
        "# Print the classification report; labelled as the test set because test_loader was evaluated\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(true_labels, predicted_labels, target_names=['Class 0', 'Class 1']))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3cc54f09-4d86-4241-94f1-9ad84f1c3ee5",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "3cc54f09-4d86-4241-94f1-9ad84f1c3ee5"
      },
      "source": [
        "## **Roberta Large f1=75**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "680d2c17-5a40-41de-83a9-cd706c39252a",
      "metadata": {
        "tags": [],
        "id": "680d2c17-5a40-41de-83a9-cd706c39252a"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "from transformers import RobertaTokenizer, RobertaModel\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "# (stratified on 'Label', fixed random_state so the split is repeatable)\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Model and tokenizer setup\n",
        "# MAX_LEN is passed to the dataset as the length each example is padded/truncated to\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 16  # Reduce batch size due to larger model\n",
        "VALID_BATCH_SIZE = 16\n",
        "# Upper bound on epochs per fold\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-05\n",
        "# Use the GPU when one is available, otherwise fall back to the CPU\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "\n",
        "# Use Roberta-large tokenizer\n",
        "tokenizer = RobertaTokenizer.from_pretrained('roberta-large')\n",
        "\n",
        "# Define Roberta-large model class for binary classification\n",
        "class RoBERTaLargeClass(torch.nn.Module):\n",
        "    \"\"\"roberta-large encoder followed by dropout and a linear head that emits one value per example.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.roberta_model = RobertaModel.from_pretrained('roberta-large', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(0.3)\n",
        "        # 1024 input features -> 1 output unit (single value for binary classification)\n",
        "        self.linear = torch.nn.Linear(1024, 1)\n",
        "\n",
        "    def forward(self, input_ids, attn_mask, token_type_ids):\n",
        "        \"\"\"Encode the batch and return the linear head's output for its pooled representation.\"\"\"\n",
        "        encoded = self.roberta_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask,\n",
        "            token_type_ids=token_type_ids,\n",
        "        )\n",
        "        pooled = self.dropout(encoded.pooler_output)\n",
        "        return self.linear(pooled)\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Dataset that tokenizes one DataFrame row per item for binary classification.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame providing a 'Tokenized Text' column and a 'Label' column.\n",
        "        tokenizer: object exposing `encode_plus` (this notebook passes a RobertaTokenizer).\n",
        "        max_len: token length each example is padded or truncated to.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        \"\"\"Return a dict with 1-D 'input_ids', 'attention_mask', 'token_type_ids' and a float 'targets' tensor.\"\"\"\n",
        "        # Positional lookup: `index` is a position in 0..len-1, which equals the index label\n",
        "        # only when the frame was reset_index()'d; .iloc is correct for any index.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            return_token_type_ids=True,\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'token_type_ids': inputs['token_type_ids'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # Use FloatTensor for binary classification\n",
        "        }\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Run `model` over `data_loader` without gradients and score its thresholded predictions.\n",
        "\n",
        "    Returns:\n",
        "        (accuracy, precision, recall, f1, all_labels, all_preds); the last two are NumPy arrays\n",
        "        holding the targets and the rounded sigmoid outputs for every example.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a 1-D batch axis even when the final batch holds a single example;\n",
        "            # a bare squeeze() would produce 0-d tensors that list.extend() cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "# Hold-out evaluation data (the 15% test split), re-indexed from 0\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(\n",
        "    test_dataset,\n",
        "    batch_size=VALID_BATCH_SIZE,\n",
        "    shuffle=False,  # keep row order fixed during evaluation\n",
        "    num_workers=0,\n",
        ")\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best fold: its number, its lowest validation loss and the weights behind that loss\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Checkpoint for this fold; overwritten each time the validation loss improves\n",
        "    fold_checkpoint_path = f'Data/ThirdIterationModels/Roberta_Large_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Fresh model, loss and optimizer for every fold\n",
        "    model = RoBERTaLargeClass()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Early-stopping state: stop after `patience` epochs without a lower validation loss\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a batch of size 1 one-dimensional; squeeze() would\n",
        "            # make it 0-d and targets.size(0) below would raise.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)  # mean of the per-batch losses\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                token_type_ids = batch['token_type_ids'].to(device)\n",
        "                targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "                outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), fold_checkpoint_path)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        # Reload the checkpoint written at this fold's best epoch. model.state_dict()\n",
        "        # would return the weights of the last epoch run, which can be up to `patience`\n",
        "        # epochs past the best one. Loading onto CPU avoids keeping a second copy of\n",
        "        # the weights on the training device.\n",
        "        best_model_state = torch.load(fold_checkpoint_path, map_location='cpu')\n",
        "\n",
        "# Save the best model across all folds\n",
        "if best_model_state is None:\n",
        "    # No fold ever beat the initial infinite loss (e.g. NaN losses or zero epochs run)\n",
        "    raise RuntimeError(\"No fold produced a usable validation loss; there is no model to save.\")\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/Roberta_Large_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best model and evaluate on the test set\n",
        "# map_location lets a checkpoint written on GPU load when only a CPU is available\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Roberta_Large_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7c718ee0-4bb8-4177-98b0-b9f8bcc9a8c8",
      "metadata": {
        "tags": [],
        "id": "7c718ee0-4bb8-4177-98b0-b9f8bcc9a8c8"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Model definition needed to rebuild the network before loading saved weights\n",
        "class RoBERTaLargeClass(torch.nn.Module):\n",
        "    \"\"\"roberta-large encoder followed by dropout and a single-output linear layer.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.roberta_model = RobertaModel.from_pretrained('roberta-large', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(p=0.3)\n",
        "        # 1024 input features (roberta-large hidden size) -> one output value per sample\n",
        "        self.linear = torch.nn.Linear(in_features=1024, out_features=1)\n",
        "\n",
        "    def forward(self, input_ids, attn_mask, token_type_ids):\n",
        "        \"\"\"Return the linear head's output for the encoder's pooled representation.\"\"\"\n",
        "        encoded = self.roberta_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask,\n",
        "            token_type_ids=token_type_ids,\n",
        "        )\n",
        "        pooled = self.dropout(encoded.pooler_output)\n",
        "        return self.linear(pooled)\n",
        "\n",
        "# Instantiate the RoBERTa-large model and restore the best-fold weights\n",
        "model = RoBERTaLargeClass()\n",
        "# map_location lets a checkpoint written on GPU load when only a CPU is available\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Roberta_Large_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Collect true and predicted labels for every sample in a loader\n",
        "def evaluate_on_validation(model, test_loader):\n",
        "    \"\"\"Run the model over `test_loader` and return labels and predictions.\n",
        "\n",
        "    Despite its name, the function evaluates whichever loader it is given.\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask, token_type_ids);\n",
        "            its output is treated as one logit per sample.\n",
        "        test_loader: Iterable of batches holding 'input_ids', 'attention_mask',\n",
        "            'token_type_ids' and 'targets' tensors.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (true_labels, predicted_labels) of NumPy arrays.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a batch of size 1 one-dimensional; squeeze() would\n",
        "            # collapse it to a 0-d tensor and extend() below would fail.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Evaluate on the held-out test loader (test_loader wraps the 15% test split)\n",
        "true_labels, predicted_labels = evaluate_on_validation(model, test_loader)\n",
        "\n",
        "# Print the classification report; the header names the split that was actually scored\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(true_labels, predicted_labels, target_names=['Class 0', 'Class 1']))\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3059788d-85e2-427d-a50f-2459ef116756",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "3059788d-85e2-427d-a50f-2459ef116756"
      },
      "source": [
        "## **BERT f1=70**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "df48b1fa-304d-4250-a376-ee1d6adfc8ee",
      "metadata": {
        "tags": [],
        "id": "df48b1fa-304d-4250-a376-ee1d6adfc8ee"
      },
      "outputs": [],
      "source": [
        "import warnings\n",
        "warnings.filterwarnings('ignore')\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6415e0b1-2869-43e2-a757-141f16061f91",
      "metadata": {
        "tags": [],
        "id": "6415e0b1-2869-43e2-a757-141f16061f91"
      },
      "outputs": [],
      "source": [
        "from transformers import BertTokenizer, BertModel"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cfbd2c75-9370-4413-a353-98ad21d2e534",
      "metadata": {
        "tags": [],
        "id": "cfbd2c75-9370-4413-a353-98ad21d2e534"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "\n",
        "# Stratified hold-out: 15% of `data` becomes the test set, the other 85% is kept\n",
        "# for training/validation\n",
        "train_val_data, test_data = train_test_split(\n",
        "    data,\n",
        "    test_size=0.15,\n",
        "    stratify=data['Label'],\n",
        "    random_state=46,\n",
        ")\n",
        "\n",
        "# Hyperparameters for the BERT run\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 32\n",
        "VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-5\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "# Tokenizer for the 'bert-base-uncased' checkpoint\n",
        "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
        "\n",
        "# BERT-based binary classifier\n",
        "class BERTClass(torch.nn.Module):\n",
        "    \"\"\"bert-base-uncased encoder followed by dropout and a single-output linear layer.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(p=0.3)\n",
        "        # 768 input features -> one output value per sample\n",
        "        self.linear = torch.nn.Linear(in_features=768, out_features=1)\n",
        "\n",
        "    def forward(self, input_ids, attn_mask, token_type_ids):\n",
        "        \"\"\"Return the linear head's output for the encoder's pooled representation.\"\"\"\n",
        "        encoded = self.bert_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask,\n",
        "            token_type_ids=token_type_ids,\n",
        "        )\n",
        "        pooled = self.dropout(encoded.pooler_output)\n",
        "        return self.linear(pooled)\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenizes one DataFrame row per item for binary classification.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame with a 'Tokenized Text' column and a 'Label' column.\n",
        "        tokenizer: Tokenizer exposing encode_plus(...).\n",
        "        max_len: Length every sample is padded or truncated to.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        \"\"\"Return the encoded text and float target for the row at position `index`.\"\"\"\n",
        "        # .iloc is positional, so lookups stay correct even when df keeps a\n",
        "        # non-contiguous index (plain [] would look up by label and raise KeyError).\n",
        "        text = str(self.texts.iloc[index])\n",
        "        # Collapse runs of whitespace into single spaces\n",
        "        text = \" \".join(text.split())\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            return_token_type_ids=True,\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        # return_tensors='pt' adds a leading batch axis; flatten() drops it\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'token_type_ids': inputs['token_type_ids'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # Use FloatTensor for binary classification\n",
        "        }"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d8110154-fdcc-4ebc-82a5-73ace7d19aff",
      "metadata": {
        "tags": [],
        "id": "d8110154-fdcc-4ebc-82a5-73ace7d19aff"
      },
      "outputs": [],
      "source": [
        "# Hold-out evaluation data (the 15% test split), re-indexed from 0\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "\n",
        "test_dataset = CustomDataset(df=test_data, tokenizer=tokenizer, max_len=MAX_LEN)\n",
        "test_loader = DataLoader(\n",
        "    test_dataset,\n",
        "    batch_size=VALID_BATCH_SIZE,\n",
        "    shuffle=False,  # keep row order fixed during evaluation\n",
        "    num_workers=0,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f3556ede-6730-4cfa-9155-e750a243a3e5",
      "metadata": {
        "tags": [],
        "id": "f3556ede-6730-4cfa-9155-e750a243a3e5"
      },
      "outputs": [],
      "source": [
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Run the model over a loader and score its binary predictions.\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask, token_type_ids);\n",
        "            its output is treated as one logit per sample.\n",
        "        data_loader: Iterable of batches holding 'input_ids', 'attention_mask',\n",
        "            'token_type_ids' and 'targets' tensors.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (accuracy, precision, recall, f1, all_labels, all_preds); the last\n",
        "        two are NumPy arrays of true and predicted labels.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a batch of size 1 one-dimensional; squeeze() would\n",
        "            # collapse it to a 0-d tensor and extend() below would fail.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            # Sigmoid turns logits into probabilities; rounding gives 0/1 labels\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best fold: its number, its lowest validation loss and the weights behind that loss\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Checkpoint for this fold; overwritten each time the validation loss improves\n",
        "    fold_checkpoint_path = f'Data/ThirdIterationModels/BERT_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Fresh model, loss and optimizer for every fold\n",
        "    model = BERTClass()  # Change to BERT model\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Early-stopping state: stop after `patience` epochs without a lower validation loss\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a batch of size 1 one-dimensional; squeeze() would\n",
        "            # make it 0-d and targets.size(0) below would raise.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)  # mean of the per-batch losses\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                token_type_ids = batch['token_type_ids'].to(device)\n",
        "                targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "                outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), fold_checkpoint_path)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        # Reload the checkpoint written at this fold's best epoch. model.state_dict()\n",
        "        # would return the weights of the last epoch run, which can be up to `patience`\n",
        "        # epochs past the best one. Loading onto CPU avoids keeping a second copy of\n",
        "        # the weights on the training device.\n",
        "        best_model_state = torch.load(fold_checkpoint_path, map_location='cpu')\n",
        "\n",
        "# Save the best model across all folds\n",
        "if best_model_state is None:\n",
        "    # No fold ever beat the initial infinite loss (e.g. NaN losses or zero epochs run)\n",
        "    raise RuntimeError(\"No fold produced a usable validation loss; there is no model to save.\")\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/BERT_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best model and evaluate on the test set\n",
        "# map_location lets a checkpoint written on GPU load when only a CPU is available\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/BERT_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e1b0299c-baf6-4a4b-8c68-7d1ec02a9939",
      "metadata": {
        "tags": [],
        "id": "e1b0299c-baf6-4a4b-8c68-7d1ec02a9939"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "import torch\n",
        "import numpy as np\n",
        "\n",
        "# Rebuild the architecture, then restore the saved best-fold weights.\n",
        "# The class must match the one that produced the checkpoint (BERTClass for this file).\n",
        "model = BERTClass()\n",
        "# map_location lets a checkpoint written on GPU load when only a CPU is available\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/BERT_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Evaluate the model on the test set\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    \"\"\"Run the model over `test_loader` and return labels and predictions.\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask, token_type_ids);\n",
        "            its output is treated as one logit per sample.\n",
        "        test_loader: Iterable of batches holding 'input_ids', 'attention_mask',\n",
        "            'token_type_ids' and 'targets' tensors.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (true_labels, predicted_labels) of NumPy arrays.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:  # Using test_loader here instead of val_loader\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a batch of size 1 one-dimensional; squeeze() would\n",
        "            # collapse it to a 0-d tensor and extend() below would fail.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Score the held-out test split with the reloaded model\n",
        "true_labels, predicted_labels = evaluate_on_test(model=model, test_loader=test_loader)\n",
        "\n",
        "# Per-class precision / recall / F1 summary\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(\n",
        "    classification_report(\n",
        "        true_labels,\n",
        "        predicted_labels,\n",
        "        target_names=['Class 0', 'Class 1'],\n",
        "    )\n",
        ")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "1002421f-2b17-46d0-933c-1c808090bcb2",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "1002421f-2b17-46d0-933c-1c808090bcb2"
      },
      "source": [
        "## **DistilBERT f1=75**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "bfabc0d2-d8c2-4306-a1ca-481501e6e94f",
      "metadata": {
        "tags": [],
        "id": "bfabc0d2-d8c2-4306-a1ca-481501e6e94f"
      },
      "outputs": [],
      "source": [
        "from transformers import DistilBertModel, DistilBertTokenizer\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "\n",
        "# Stratified hold-out: 15% of `data` becomes the test set, the other 85% is kept\n",
        "# for training/validation\n",
        "train_val_data, test_data = train_test_split(\n",
        "    data,\n",
        "    test_size=0.15,\n",
        "    stratify=data['Label'],\n",
        "    random_state=46,\n",
        ")\n",
        "\n",
        "# Hyperparameters for the DistilBERT run\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 32\n",
        "VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-5\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "# Tokenizer for the 'distilbert-base-uncased' checkpoint\n",
        "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
        "\n",
        "# DistilBERT-based binary classifier\n",
        "class DistilBERTClass(torch.nn.Module):\n",
        "    \"\"\"distilbert-base-uncased encoder followed by dropout and a single-output linear layer.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(p=0.3)\n",
        "        # 768 input features -> one output value per sample\n",
        "        self.linear = torch.nn.Linear(in_features=768, out_features=1)\n",
        "\n",
        "    def forward(self, input_ids, attn_mask):\n",
        "        \"\"\"Return the linear head's output for the first token's hidden state.\"\"\"\n",
        "        # No token_type_ids are passed to the DistilBERT encoder\n",
        "        encoded = self.distilbert_model(input_ids, attention_mask=attn_mask)\n",
        "        # Position 0 of the sequence is the [CLS] token\n",
        "        first_token_state = encoded.last_hidden_state[:, 0]\n",
        "        return self.linear(self.dropout(first_token_state))\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenizes one DataFrame row per item for binary classification.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame with a 'Tokenized Text' column and a 'Label' column.\n",
        "        tokenizer: Tokenizer exposing encode_plus(...).\n",
        "        max_len: Length every sample is padded or truncated to.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        \"\"\"Return the encoded text and float target for the row at position `index`.\"\"\"\n",
        "        # .iloc is positional, so lookups stay correct even when df keeps a\n",
        "        # non-contiguous index (plain [] would look up by label and raise KeyError).\n",
        "        text = str(self.texts.iloc[index])\n",
        "        # Collapse runs of whitespace into single spaces\n",
        "        text = \" \".join(text.split())\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        # return_tensors='pt' adds a leading batch axis; flatten() drops it\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # Use FloatTensor for binary classification\n",
        "        }\n",
        "\n",
        "# Hold-out evaluation data (the 15% test split), re-indexed from 0\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "test_dataset = CustomDataset(df=test_data, tokenizer=tokenizer, max_len=MAX_LEN)\n",
        "test_loader = DataLoader(\n",
        "    test_dataset,\n",
        "    batch_size=VALID_BATCH_SIZE,\n",
        "    shuffle=False,  # keep row order fixed during evaluation\n",
        "    num_workers=0,\n",
        ")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d7e969af-0654-47f3-80f9-9a6ffcc0d845",
      "metadata": {
        "tags": [],
        "id": "d7e969af-0654-47f3-80f9-9a6ffcc0d845"
      },
      "outputs": [],
      "source": [
        "\n",
        "\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Run the model over a loader and score its binary predictions.\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask); its output is\n",
        "            treated as one logit per sample.\n",
        "        data_loader: Iterable of batches holding 'input_ids', 'attention_mask'\n",
        "            and 'targets' tensors.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (accuracy, precision, recall, f1, all_labels, all_preds); the last\n",
        "        two are NumPy arrays of true and predicted labels.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps a batch of size 1 one-dimensional; squeeze() would\n",
        "            # collapse it to a 0-d tensor and extend() below would fail.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)  # No token_type_ids for DistilBERT\n",
        "            # Sigmoid turns logits into probabilities; rounding gives 0/1 labels\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "\n",
        "# Stratified, shuffled 5-fold splitter with a fixed seed\n",
        "kf = StratifiedKFold(\n",
        "    n_splits=5,\n",
        "    shuffle=True,\n",
        "    random_state=46,\n",
        ")\n",
        "\n",
        "# Best-model bookkeeping: weights, fold number and loss start out empty\n",
        "best_model_state, best_fold, best_loss = None, -1, float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Custom model for binary classification using DistilBERT\n",
        "    model = DistilBERTClass()  # Change to DistilBERT model\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Training loop for this fold\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask).squeeze()  # No token_type_ids for DistilBERT\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "                outputs = model(input_ids, attn_mask).squeeze()  # No token_type_ids for DistilBERT\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), f'Data/ThirdIterationModels/DistilBERT_Fold_{fold + 1}.pth')\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        best_model_state = model.state_dict()  # Save the state of the best model\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/DistilBERT_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best model and evaluate on the test set\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/DistilBERT_Best_Fold.pth'))\n",
        "model.to(device)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4603e209-82e2-47cb-a22a-4cdc342c3f84",
      "metadata": {
        "tags": [],
        "id": "4603e209-82e2-47cb-a22a-4cdc342c3f84"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "import torch\n",
        "import numpy as np\n",
        "\n",
        "# Rebuild the architecture and load the best cross-validation checkpoint.\n",
        "# map_location remaps the stored tensors onto the current device, so a checkpoint\n",
        "# written on a GPU machine also loads on a CPU-only host.\n",
        "model = DistilBERTClass()\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/DistilBERT_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Evaluate the model on the test set\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    \"\"\"Run `model` over `test_loader` and collect labels and thresholded predictions.\n",
        "\n",
        "    Returns:\n",
        "        (labels, predictions) as two 1-D numpy arrays in loader order; predictions are\n",
        "        sigmoid(logit) rounded to 0.0 / 1.0.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps the batch axis for a single-sample final batch; a bare\n",
        "            # squeeze() would yield a 0-d tensor that list.extend() cannot iterate\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            # The model is called with ids and mask only (no token_type_ids)\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Score the reloaded model on the held-out test loader\n",
        "true_labels, predicted_labels = evaluate_on_test(model, test_loader)\n",
        "\n",
        "# Per-class precision / recall / F1 summary\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(\n",
        "    classification_report(\n",
        "        true_labels,\n",
        "        predicted_labels,\n",
        "        target_names=['Class 0', 'Class 1'],\n",
        "    )\n",
        ")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "cc638221-a6fd-4b26-b857-a55933cf4478",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "cc638221-a6fd-4b26-b857-a55933cf4478"
      },
      "source": [
        "## **ALBERT f1=64**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0a73aaf2-3475-4c47-9151-ff3d5848f667",
      "metadata": {
        "tags": [],
        "id": "0a73aaf2-3475-4c47-9151-ff3d5848f667"
      },
      "outputs": [],
      "source": [
        "from transformers import AlbertModel, AlbertTokenizer\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "# Hold out a stratified 15% test set; the remaining 85% feeds cross-validation\n",
        "train_val_data, test_data = train_test_split(\n",
        "    data,\n",
        "    test_size=0.15,\n",
        "    stratify=data['Label'],\n",
        "    random_state=46,\n",
        ")\n",
        "\n",
        "# Hyperparameters for the ALBERT run\n",
        "MAX_LEN = 512  # passed to the tokenizer as max_length (pad/truncate target)\n",
        "TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40  # upper bound per fold; the training loop also stops early on validation loss\n",
        "# NOTE(review): 1e-07 is two orders of magnitude below the 1e-05 used for the BART run\n",
        "# in this notebook - confirm this is intentional\n",
        "LEARNING_RATE = 1e-07\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "# Tokenizer matching the albert-base-v2 checkpoint\n",
        "tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')\n",
        "\n",
        "# Custom ALBERT model class for binary classification\n",
        "class ALBERTClass(torch.nn.Module):\n",
        "    \"\"\"albert-base-v2 encoder followed by dropout and a single-logit linear head.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.albert_model = AlbertModel.from_pretrained('albert-base-v2', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(0.5)\n",
        "        # Size the head from the encoder config rather than hard-coding 768, so the\n",
        "        # wrapper stays correct if the checkpoint name is ever changed\n",
        "        self.linear = torch.nn.Linear(self.albert_model.config.hidden_size, 1)\n",
        "\n",
        "    def forward(self, input_ids, attn_mask):\n",
        "        \"\"\"Return one raw (pre-sigmoid) logit per input sequence.\"\"\"\n",
        "        encoded = self.albert_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask\n",
        "        )\n",
        "        # Hidden state at position 0 (the [CLS] token) is used as the sequence summary\n",
        "        first_token_state = encoded.last_hidden_state[:, 0]\n",
        "        return self.linear(self.dropout(first_token_state))\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenizes the 'Tokenized Text' column on access and pairs it with the 'Label' column.\n",
        "\n",
        "    Args:\n",
        "        df: frame providing the 'Tokenized Text' and 'Label' columns.\n",
        "        tokenizer: object exposing `encode_plus` (a Hugging Face tokenizer in this notebook).\n",
        "        max_len: value passed as `max_length`; sequences are padded/truncated to it.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        # The DataLoader asks for positions 0..len-1, so look rows up positionally.\n",
        "        # Label-based `series[index]` raises KeyError (or returns the wrong row) whenever\n",
        "        # the frame's index was not reset to 0..n-1 beforehand.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            # Float target of shape (1,), as required by BCEWithLogitsLoss\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])\n",
        "        }\n",
        "\n",
        "# Build the loader for the held-out 15% test split (order preserved, no shuffling)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "test_dataset = CustomDataset(df=test_data, tokenizer=tokenizer, max_len=MAX_LEN)\n",
        "test_loader = DataLoader(\n",
        "    test_dataset,\n",
        "    batch_size=VALID_BATCH_SIZE,\n",
        "    shuffle=False,\n",
        "    num_workers=0,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "bb7e9e77-117d-4419-89af-612cfd92579a",
      "metadata": {
        "tags": [],
        "id": "bb7e9e77-117d-4419-89af-612cfd92579a"
      },
      "outputs": [],
      "source": [
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Evaluate a single-logit binary classifier on `data_loader`.\n",
        "\n",
        "    Returns:\n",
        "        (accuracy, precision, recall, f1, labels, predictions), where the last two are\n",
        "        1-D numpy arrays in loader order and predictions are sigmoid(logit) rounded to 0/1.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps the batch axis for a single-sample final batch; a bare\n",
        "            # squeeze() would yield a 0-d tensor that list.extend() cannot iterate\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Metrics for the positive class (sklearn defaults)\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "\n",
        "# 5-fold stratified cross-validation over the train/validation split\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Best checkpoint seen across all folds (lowest validation loss)\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Positional split for this fold; indices are reset so each frame runs 0..n-1\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Fresh model and optimizer per fold so no weights carry over between folds\n",
        "    model = ALBERTClass()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Weights from this fold's best epoch are written to this file\n",
        "    fold_checkpoint_path = f'Data/ThirdIterationModels/ALBERT_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Early stopping: stop after `patience` epochs without a new best validation loss\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps a 1-D batch axis even when the final batch holds one sample;\n",
        "            # a bare squeeze() would collapse it to 0-d and break targets.size(0)\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation pass (no gradient tracking)\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "                outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Checkpoint on every new best validation loss; otherwise count towards early stopping\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            torch.save(model.state_dict(), fold_checkpoint_path)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        # Reload the weights checkpointed at this fold's best epoch. The in-memory model holds\n",
        "        # last-epoch weights, which can be up to `patience` epochs past the best one.\n",
        "        best_model_state = torch.load(fold_checkpoint_path, map_location='cpu')\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/ALBERT_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best weights into the last fold's model instance; map_location lets the\n",
        "# checkpoint load regardless of the device it was saved from\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/ALBERT_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "5bfe716a-e637-4e3f-bb63-c24422d908ec",
      "metadata": {
        "tags": [],
        "id": "5bfe716a-e637-4e3f-bb63-c24422d908ec"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "import torch\n",
        "import numpy as np\n",
        "\n",
        "# Rebuild the architecture and load the best cross-validation checkpoint.\n",
        "# map_location remaps the stored tensors onto the current device, so a checkpoint\n",
        "# written on a GPU machine also loads on a CPU-only host.\n",
        "model = ALBERTClass()\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/ALBERT_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Evaluate the model on the test set\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    \"\"\"Run `model` over `test_loader` and collect labels and thresholded predictions.\n",
        "\n",
        "    Returns:\n",
        "        (labels, predictions) as two 1-D numpy arrays in loader order; predictions are\n",
        "        sigmoid(logit) rounded to 0.0 / 1.0.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps the batch axis for a single-sample final batch; a bare\n",
        "            # squeeze() would yield a 0-d tensor that list.extend() cannot iterate\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            # token_type_ids are not passed: the wrapper's forward takes only ids and mask\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Score the reloaded model on the held-out test loader\n",
        "true_labels, predicted_labels = evaluate_on_test(model, test_loader)\n",
        "\n",
        "# Per-class precision / recall / F1 summary\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(\n",
        "    classification_report(\n",
        "        true_labels,\n",
        "        predicted_labels,\n",
        "        target_names=['Class 0', 'Class 1'],\n",
        "    )\n",
        ")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "79c22b95-ce2b-4fde-b26f-4875f8cd9790",
      "metadata": {
        "tags": [],
        "id": "79c22b95-ce2b-4fde-b26f-4875f8cd9790"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "# Release cached GPU memory between model runs. Guarded because the notebook also\n",
        "# supports a CPU fallback, where these CUDA-only calls have nothing to free.\n",
        "if torch.cuda.is_available():\n",
        "    torch.cuda.empty_cache()\n",
        "    torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "c993c58a-8f1c-4786-9c36-f13267b5f7cf",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "c993c58a-8f1c-4786-9c36-f13267b5f7cf"
      },
      "source": [
        "## **BART f1=74**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d6ac088f-e957-485e-af49-d23ed28d1d70",
      "metadata": {
        "tags": [],
        "id": "d6ac088f-e957-485e-af49-d23ed28d1d70"
      },
      "outputs": [],
      "source": [
        "from transformers import BartForSequenceClassification, BartTokenizer\n",
        "\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from torch import optim\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "import os\n",
        "from sklearn.model_selection import StratifiedKFold\n",
        "from transformers import BartTokenizer, BartForSequenceClassification\n",
        "\n",
        "# Debug aid for CUDA errors\n",
        "# NOTE(review): presumably only effective if set before CUDA is first initialised in\n",
        "# this kernel - confirm it is still wanted for full training runs\n",
        "os.environ.update({\"CUDA_LAUNCH_BLOCKING\": \"1\"})\n",
        "\n",
        "# Hyperparameters for the BART Large MNLI run\n",
        "MAX_LEN = 512  # passed to the tokenizer as max_length (pad/truncate target)\n",
        "TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 16\n",
        "EPOCHS = 40  # upper bound per fold; the training loop also stops early on validation loss\n",
        "LEARNING_RATE = 1e-05\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "# Tokenizer matching the facebook/bart-large-mnli checkpoint\n",
        "tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')\n",
        "\n",
        "# BART sequence-classification wrapper used as a binary classifier\n",
        "class BARTClass(torch.nn.Module):\n",
        "    \"\"\"Wraps facebook/bart-large-mnli with a two-label classification head.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        # num_labels=2 requests a two-logit head; ignore_mismatched_sizes lets loading\n",
        "        # proceed when the checkpoint's head has a different size\n",
        "        pretrained_kwargs = {\n",
        "            'num_labels': 2,\n",
        "            'ignore_mismatched_sizes': True,\n",
        "        }\n",
        "        self.bart_model = BartForSequenceClassification.from_pretrained(\n",
        "            'facebook/bart-large-mnli',\n",
        "            **pretrained_kwargs\n",
        "        )\n",
        "        # Not applied in forward(); retained so the module's attributes stay as before\n",
        "        self.dropout = torch.nn.Dropout(0.3)\n",
        "\n",
        "    def forward(self, input_ids, attn_mask):\n",
        "        \"\"\"Return the raw logits for both classes.\"\"\"\n",
        "        return self.bart_model(input_ids=input_ids, attention_mask=attn_mask).logits\n",
        "\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenizes the 'Tokenized Text' column on access and pairs it with the 'Label' column.\n",
        "\n",
        "    Args:\n",
        "        df: frame providing the 'Tokenized Text' and 'Label' columns.\n",
        "        tokenizer: object exposing `encode_plus` (a Hugging Face tokenizer in this notebook).\n",
        "        max_len: value passed as `max_length`; sequences are padded/truncated to it.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        # The DataLoader asks for positions 0..len-1, so look rows up positionally.\n",
        "        # Label-based `series[index]` raises KeyError (or returns the wrong row) whenever\n",
        "        # the frame's index was not reset to 0..n-1 beforehand.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            # Float target of shape (1,), as required by BCEWithLogitsLoss\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])\n",
        "        }\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Evaluate the two-logit BART classifier on `data_loader`.\n",
        "\n",
        "    Only the class-1 logit is scored, mirroring the training loop in this notebook,\n",
        "    which feeds `outputs[:, 1]` to BCEWithLogitsLoss.\n",
        "\n",
        "    Returns:\n",
        "        (accuracy, precision, recall, f1, labels, predictions), where the last two are\n",
        "        1-D numpy arrays in loader order and predictions are sigmoid(logit) rounded to 0/1.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps the batch axis even for a single-sample final batch\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            # Keep only the class-1 logit. Rounding both logits would produce (N, 2)\n",
        "            # predictions that cannot be scored against the (N,) labels.\n",
        "            logits = model(input_ids, attn_mask)\n",
        "            outputs = logits[:, 1]\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Metrics for the positive class (sklearn defaults)\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "# 5-fold stratified cross-validation over the train/validation split\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Best checkpoint seen across all folds (lowest validation loss)\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Positional split for this fold; indices are reset so each frame runs 0..n-1\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Fresh model and optimizer per fold so no weights carry over between folds\n",
        "    model = BARTClass()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Weights from this fold's best epoch are written to this file\n",
        "    fold_checkpoint_path = f'Data/ThirdIterationModels/BART_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Early stopping: stop after `patience` epochs without a new best validation loss\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps a 1-D batch axis even when the final batch holds one sample\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            # Index the class-1 logit directly. Squeezing first would drop the batch axis\n",
        "            # of a single-sample batch and make `[:, 1]` raise IndexError.\n",
        "            logits = model(input_ids, attn_mask)\n",
        "            outputs = logits[:, 1]\n",
        "\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation pass (no gradient tracking)\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "                logits = model(input_ids, attn_mask)\n",
        "                outputs = logits[:, 1]  # Use only the logit for class 1\n",
        "\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Checkpoint on every new best validation loss; otherwise count towards early stopping\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            torch.save(model.state_dict(), fold_checkpoint_path)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        # Reload the weights checkpointed at this fold's best epoch. The in-memory model holds\n",
        "        # last-epoch weights, which can be up to `patience` epochs past the best one.\n",
        "        best_model_state = torch.load(fold_checkpoint_path, map_location='cpu')\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/BART_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a2dcdc36-0f20-42f9-aa05-0d40b0652475",
      "metadata": {
        "tags": [],
        "id": "a2dcdc36-0f20-42f9-aa05-0d40b0652475"
      },
      "outputs": [],
      "source": [
        "# Create test dataset and loader (15% test set)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f90d0b03-f76a-4632-b333-e2bed01e8c86",
      "metadata": {
        "tags": [],
        "id": "f90d0b03-f76a-4632-b333-e2bed01e8c86"
      },
      "outputs": [],
      "source": [
        "# Load the best model and evaluate on the test set\n",
        "# Correct: `model.load_state_dict()` should be applied on the initialized model, and the model should be moved to the device (GPU/CPU).\n",
        "model = BARTClass()  # Re-initialize the model\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/BART_Best_Fold.pth'))  # Load the best model from saved state\n",
        "model.to(device)  # Move the model to the appropriate device (GPU/CPU)\n",
        "\n",
        "# Evaluate the model on the test set\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    model.eval()  # Set the model to evaluation mode\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():  # No gradient calculation during evaluation\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "            # Forward pass to get outputs\n",
        "            outputs = model(input_ids, attn_mask).squeeze()\n",
        "\n",
        "            # Use sigmoid for binary classification logits and round predictions\n",
        "            predicted = torch.round(torch.sigmoid(outputs[:, 1]))  # Use the logit for class 1\n",
        "\n",
        "            # Store predictions and true labels for evaluation\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Test loader setup (ensure test_loader is created with test data)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# Evaluate the model on the test set\n",
        "true_labels, predicted_labels = evaluate_on_test(model, test_loader)\n",
        "\n",
        "# Print the classification report using sklearn\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(true_labels, predicted_labels, target_names=['Class 0', 'Class 1']))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a4035e35-5387-445b-bd0a-952b8b3fc716",
      "metadata": {
        "tags": [],
        "id": "a4035e35-5387-445b-bd0a-952b8b3fc716"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "7b64c165-2796-4a5c-995b-68ee10261dee",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "7b64c165-2796-4a5c-995b-68ee10261dee"
      },
      "source": [
        "## **DeBERTa f1=74**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f27e6029-9d0d-4e84-b7c5-8cc0e0c54de9",
      "metadata": {
        "tags": [],
        "id": "f27e6029-9d0d-4e84-b7c5-8cc0e0c54de9"
      },
      "outputs": [],
      "source": [
        "from transformers import DebertaTokenizer, DebertaModel\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6db51d5f-af2d-4aae-934e-08755ef7b53a",
      "metadata": {
        "tags": [],
        "id": "6db51d5f-af2d-4aae-934e-08755ef7b53a"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Model and tokenizer setup\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 32\n",
        "VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-05\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')\n",
        "\n",
        "class DeBERTaClass(torch.nn.Module):\n",
        "    def __init__(self):\n",
        "        super(DeBERTaClass, self).__init__()\n",
        "        self.deberta_model = DebertaModel.from_pretrained('microsoft/deberta-base', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(0.3)\n",
        "        self.linear = torch.nn.Linear(768, 1)  # Change output to 1 for binary classification\n",
        "\n",
        "    def forward(self, input_ids, attn_mask, token_type_ids):\n",
        "        # Get the last hidden state from the DeBERTa model\n",
        "        output = self.deberta_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask,\n",
        "            token_type_ids=token_type_ids\n",
        "        )\n",
        "\n",
        "        # Use the hidden state instead of pooler_output\n",
        "        # Take the first token's ([CLS] token) hidden state representation\n",
        "        last_hidden_state_cls = output.last_hidden_state[:, 0, :]  # CLS token representation\n",
        "\n",
        "        output_dropout = self.dropout(last_hidden_state_cls)\n",
        "        output = self.linear(output_dropout)\n",
        "        return output\n",
        "\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        text = str(self.texts[index])\n",
        "        text = \" \".join(text.split())\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            return_token_type_ids=True,\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'token_type_ids': inputs['token_type_ids'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels[index]])  # Use FloatTensor for binary classification\n",
        "        }"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6287be9e-91b8-4c34-86c2-7aa8e28fa0f3",
      "metadata": {
        "tags": [],
        "id": "6287be9e-91b8-4c34-86c2-7aa8e28fa0f3"
      },
      "outputs": [],
      "source": [
        "# Create test dataset and loader (15% test set)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2a69eb11-b03c-4e83-af98-5b621dbe2033",
      "metadata": {
        "tags": [],
        "id": "2a69eb11-b03c-4e83-af98-5b621dbe2033"
      },
      "outputs": [],
      "source": [
        "\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).squeeze()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best model and fold, and accumulate metrics\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Custom model for binary classification\n",
        "    model = DeBERTaClass()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Training loop for this fold\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).squeeze()\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                token_type_ids = batch['token_type_ids'].to(device)\n",
        "                targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "                outputs = model(input_ids, attn_mask, token_type_ids).squeeze()\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS},Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), f'Data/ThirdIterationModels/Deberta_Fold_{fold + 1}.pth')\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        best_model_state = model.state_dict()  # Save the state of the best model\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/Deberta_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best model and evaluate on the test set\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Deberta_Best_Fold.pth'))\n",
        "model.to(device)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "009684eb-4f72-4adf-9d45-08561f45c11e",
      "metadata": {
        "tags": [],
        "id": "009684eb-4f72-4adf-9d45-08561f45c11e"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Load the saved model\n",
        "model = DeBERTaClass()  # Updated to use DeBERTa\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Deberta_Best_Fold.pth'))  # Updated model path\n",
        "model.to(device)\n",
        "\n",
        "# Evaluate the model on the validation set\n",
        "def evaluate_on_validation(model, val_loader):\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in val_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).squeeze()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Use the validation loader for evaluation\n",
        "true_labels, predicted_labels = evaluate_on_validation(model, test_loader)\n",
        "\n",
        "# Print the classification report\n",
        "print(\"\\nClassification Report on Validation Set:\")\n",
        "print(classification_report(true_labels, predicted_labels, target_names=['Class 0', 'Class 1']))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d6f3001f-71d3-435f-99eb-e899962c6a4a",
      "metadata": {
        "tags": [],
        "id": "d6f3001f-71d3-435f-99eb-e899962c6a4a"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "ce4c2155-2cd6-4a64-a9e3-06c9db7c3286",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "ce4c2155-2cd6-4a64-a9e3-06c9db7c3286"
      },
      "source": [
        "## **ELECTRA f1=74**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0167d198-a586-4d81-92f4-4fe32dc0472a",
      "metadata": {
        "tags": [],
        "id": "0167d198-a586-4d81-92f4-4fe32dc0472a"
      },
      "outputs": [],
      "source": [
        "from transformers import ElectraModel, ElectraTokenizer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c30ca7e6-ab36-461d-a73a-6851dd96a6f6",
      "metadata": {
        "tags": [],
        "id": "c30ca7e6-ab36-461d-a73a-6851dd96a6f6"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "from transformers import ElectraTokenizer, ElectraModel\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Model and tokenizer setup\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 32\n",
        "VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-05\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')\n",
        "\n",
        "class ELECTRAClass(torch.nn.Module):\n",
        "    def __init__(self):\n",
        "        super(ELECTRAClass, self).__init__()\n",
        "        self.electra_model = ElectraModel.from_pretrained('google/electra-base-discriminator', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(0.3)\n",
        "        self.linear = torch.nn.Linear(768, 1)  # Binary classification output\n",
        "\n",
        "    def forward(self, input_ids, attn_mask, token_type_ids):\n",
        "        # Get the last hidden state from Electra\n",
        "        output = self.electra_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask,\n",
        "            token_type_ids=token_type_ids\n",
        "        )\n",
        "\n",
        "        # Use the [CLS] token representation for classification (first token in hidden states)\n",
        "        cls_token = output.last_hidden_state[:, 0, :]  # Extract the [CLS] token hidden state\n",
        "\n",
        "        # Apply dropout and pass through the linear layer\n",
        "        output_dropout = self.dropout(cls_token)\n",
        "        output = self.linear(output_dropout)\n",
        "        return output\n",
        "\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        text = str(self.texts[index])\n",
        "        text = \" \".join(text.split())\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            return_token_type_ids=True,\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'token_type_ids': inputs['token_type_ids'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels[index]])  # Binary classification: FloatTensor for targets\n",
        "        }\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).squeeze()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "# Create test dataset and loader (15% test set)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best model and fold, and accumulate metrics\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Custom model for binary classification\n",
        "    model = ELECTRAClass()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Training loop for this fold\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).squeeze()\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                token_type_ids = batch['token_type_ids'].to(device)\n",
        "                targets = batch['targets'].to(device).squeeze()\n",
        "\n",
        "                outputs = model(input_ids, attn_mask, token_type_ids).squeeze()\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), f'Data/ThirdIterationModels/Electra_Fold_{fold + 1}.pth')\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        best_model_state = model.state_dict()  # Save the state of the best model\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/Electra_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best model and evaluate on the test set\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Electra_Best_Fold.pth'))\n",
        "model.to(device)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "38003145-8ed7-43c6-8bd3-0ec523b0eece",
      "metadata": {
        "tags": [],
        "id": "38003145-8ed7-43c6-8bd3-0ec523b0eece"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Rebuild the ELECTRA classifier and restore the best cross-validation checkpoint.\n",
        "# map_location keeps the load working when the checkpoint was written on a GPU\n",
        "# but this kernel runs on a different device (for example CPU only).\n",
        "model = ELECTRAClass()\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/Electra_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Collect true labels and rounded predictions for every batch of a loader\n",
        "def evaluate_on_validation(model, val_loader):\n",
        "    \"\"\"Run `model` over `val_loader` and return (labels, predictions).\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask, token_type_ids)\n",
        "            that returns one logit per example.\n",
        "        val_loader: Iterable of batches (dicts) with the keys 'input_ids',\n",
        "            'attention_mask', 'token_type_ids' and 'targets'.\n",
        "\n",
        "    Returns:\n",
        "        Tuple of two NumPy arrays: the true labels and the predictions, where a\n",
        "        prediction is the sigmoid of the logit rounded to 0 or 1.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in val_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) instead of squeeze(): squeeze() collapses a batch holding a\n",
        "            # single example to a 0-d tensor, which extend() below cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Score the restored model on the held-out test split (test_loader)\n",
        "true_labels, predicted_labels = evaluate_on_validation(model, test_loader)\n",
        "\n",
        "# Print the classification report; the data comes from test_loader, so label it as the test set\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(true_labels, predicted_labels, target_names=['Class 0', 'Class 1']))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "fbe4eaef-87a8-42d6-bfb9-85c7be7e1b66",
      "metadata": {
        "tags": [],
        "id": "fbe4eaef-87a8-42d6-bfb9-85c7be7e1b66"
      },
      "outputs": [],
      "source": [
        "# Free cached CUDA memory once the ELECTRA section is finished, before the next model section runs\n",
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "81219191-f210-49af-8ad3-33852a51d2ce",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "81219191-f210-49af-8ad3-33852a51d2ce"
      },
      "source": [
        "## **XLNet f1=49**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "386192c2-2adf-4832-b5f4-be6a73cd218c",
      "metadata": {
        "tags": [],
        "id": "386192c2-2adf-4832-b5f4-be6a73cd218c"
      },
      "outputs": [],
      "source": [
        "from transformers import XLNetTokenizer, XLNetModel\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "# (stratified on the 'Label' column; the fixed random_state makes the split repeatable)\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Model and tokenizer setup\n",
        "# These module-level names are read by the dataset, loaders and training loop later in this cell.\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 32\n",
        "VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40  # upper bound; the training loop may stop earlier through its patience counter\n",
        "LEARNING_RATE = 1e-05\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')\n",
        "\n",
        "class XLNetClass(torch.nn.Module):\n",
        "    \"\"\"Pretrained XLNet encoder followed by dropout and a one-logit linear head.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.xlnet_model = XLNetModel.from_pretrained('xlnet-base-cased', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(0.3)\n",
        "        self.linear = torch.nn.Linear(768, 1)  # single logit for binary classification\n",
        "\n",
        "    def forward(self, input_ids, attn_mask, token_type_ids):\n",
        "        \"\"\"Return the linear head's output for the final sequence position of each example.\"\"\"\n",
        "        encoded = self.xlnet_model(\n",
        "            input_ids,\n",
        "            attention_mask=attn_mask,\n",
        "            token_type_ids=token_type_ids,\n",
        "        )\n",
        "        # Pool on the last position of the last hidden state; presumably this is where\n",
        "        # the XLNet tokenizer leaves <cls> when it pads on the left - TODO confirm padding_side\n",
        "        final_position = encoded.last_hidden_state[:, -1, :]\n",
        "        return self.linear(self.dropout(final_position))\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenizes one DataFrame row on demand for binary classification.\n",
        "\n",
        "    Reads the 'Tokenized Text' and 'Label' columns of `df`. Each item is a dict\n",
        "    with flattened 'input_ids', 'attention_mask' and 'token_type_ids' tensors and\n",
        "    a one-element float 'targets' tensor.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        # Positional lookup: a DataLoader asks for positions 0..len-1, which match the\n",
        "        # index labels only when the frame has a fresh RangeIndex. Using .iloc makes the\n",
        "        # dataset independent of whether the caller ran reset_index first.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            return_token_type_ids=True,\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'token_type_ids': inputs['token_type_ids'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # float target for BCEWithLogitsLoss\n",
        "        }\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Evaluate `model` on `data_loader` and compute binary classification metrics.\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask, token_type_ids)\n",
        "            that returns one logit per example.\n",
        "        data_loader: Iterable of batches (dicts) with the keys 'input_ids',\n",
        "            'attention_mask', 'token_type_ids' and 'targets'.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (accuracy, precision, recall, f1, all_labels, all_preds); the last\n",
        "        two are NumPy arrays of true labels and rounded sigmoid predictions.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) instead of squeeze(): squeeze() collapses a batch holding a\n",
        "            # single example to a 0-d tensor, which extend() below cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "# Create test dataset and loader (15% test set)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best fold (lowest validation loss) and the weights that achieved it\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # File that receives this fold's weights every time the validation loss improves\n",
        "    fold_checkpoint = f'Data/ThirdIterationModels/XLNet_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Fresh model, loss and optimizer for every fold\n",
        "    model = XLNetClass()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Training loop for this fold\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) keeps a 1-D tensor even for a final batch of one example;\n",
        "            # squeeze() would give a 0-d tensor and targets.size(0) would then fail.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                token_type_ids = batch['token_type_ids'].to(device)\n",
        "                targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "                outputs = model(input_ids, attn_mask, token_type_ids).reshape(-1)\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS},Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), fold_checkpoint)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        # Reload the checkpoint written at this fold's best epoch. The live model has\n",
        "        # kept training for up to `patience` more epochs after that point, so\n",
        "        # model.state_dict() would hold last-epoch weights, not the weights that\n",
        "        # produced best_valid_loss. Loading to CPU avoids pinning extra GPU memory.\n",
        "        best_model_state = torch.load(fold_checkpoint, map_location='cpu')\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/XLNet_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best model and evaluate on the test set\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/XLNet_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Test the best model on the test set\n",
        "accuracy, precision, recall, f1, test_labels, test_preds = evaluate_fold(model, test_loader)\n",
        "print(f\"Test Set - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9fbe3402-5e31-4266-9f5c-44375823e17a",
      "metadata": {
        "tags": [],
        "id": "9fbe3402-5e31-4266-9f5c-44375823e17a"
      },
      "outputs": [],
      "source": [
        "class XLNetClass(torch.nn.Module):\n",
        "    \"\"\"XLNet encoder with dropout and a one-logit head, used to reload the saved checkpoint.\n",
        "\n",
        "    The submodule names (xlnet_model, dropout, linear) become the state_dict keys,\n",
        "    so they have to stay as they are for load_state_dict to accept the checkpoint.\n",
        "    \"\"\"\n",
        "    def __init__(self):\n",
        "        super(XLNetClass, self).__init__()\n",
        "        self.xlnet_model = XLNetModel.from_pretrained('xlnet-base-cased', return_dict=True)\n",
        "        self.dropout = torch.nn.Dropout(0.3)\n",
        "        self.linear = torch.nn.Linear(768, 1)  # Change output to 1 for binary classification\n",
        "\n",
        "    def forward(self, input_ids, token_type_ids=None, attention_mask=None):\n",
        "        \"\"\"Return one logit per example from the final sequence position.\"\"\"\n",
        "        # XLNet uses input_ids, token_type_ids, and attention_mask\n",
        "        output = self.xlnet_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
        "        # XLNet puts its <cls> token at the END of the sequence and its tokenizer pads on\n",
        "        # the left, so the summary vector is the last position, not position 0. The\n",
        "        # training cell pools with [:, -1, :] as well; reading position 0 here would feed\n",
        "        # the trained linear head a vector it never saw during training.\n",
        "        output_dropout = self.dropout(output.last_hidden_state[:, -1, :])\n",
        "        output = self.linear(output_dropout)\n",
        "        return output\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f427d8f4-05ad-4f22-b7ec-aa6371f2c715",
      "metadata": {
        "tags": [],
        "id": "f427d8f4-05ad-4f22-b7ec-aa6371f2c715"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "import torch\n",
        "\n",
        "# Load the saved XLNet model\n",
        "# map_location keeps the load working when the checkpoint was written on a GPU\n",
        "# but this kernel runs on a different device (for example CPU only).\n",
        "model = XLNetClass()  # Ensure that XLNetClass is defined and initialized\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/XLNet_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Collect true labels and rounded predictions for every batch of a loader\n",
        "def evaluate_on_validation(model, test_loader):\n",
        "    \"\"\"Run `model` over `test_loader` and return (labels, predictions).\n",
        "\n",
        "    Args:\n",
        "        model: Module called with the keywords input_ids, attention_mask and\n",
        "            token_type_ids that returns one logit per example.\n",
        "        test_loader: Iterable of batches (dicts) with the keys 'input_ids',\n",
        "            'attention_mask', 'token_type_ids' and 'targets'.\n",
        "\n",
        "    Returns:\n",
        "        Tuple of two NumPy arrays: the true labels and the predictions, where a\n",
        "        prediction is the sigmoid of the logit rounded to 0 or 1.\n",
        "    \"\"\"\n",
        "    model.eval()  # Set model to evaluation mode\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():  # Disable gradient computation for evaluation\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            token_type_ids = batch['token_type_ids'].to(device)\n",
        "            # reshape(-1) instead of squeeze(): squeeze() collapses a batch holding a\n",
        "            # single example to a 0-d tensor, which extend() below cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            # Perform forward pass with XLNet (keywords, so argument order cannot be mixed up)\n",
        "            outputs = model(input_ids=input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids).reshape(-1)\n",
        "\n",
        "            # Convert logits to binary predictions (0 or 1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            # Collect predictions and true labels\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "\n",
        "# Score the restored model on the held-out test split (test_loader)\n",
        "true_labels, predicted_labels = evaluate_on_validation(model, test_loader)\n",
        "\n",
        "# Print the classification report; the data comes from test_loader, so label it as the test set\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "print(classification_report(true_labels, predicted_labels, target_names=['Class 0', 'Class 1']))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c48ef290-748e-4f86-a00e-183d1e8cd0c1",
      "metadata": {
        "tags": [],
        "id": "c48ef290-748e-4f86-a00e-183d1e8cd0c1"
      },
      "outputs": [],
      "source": [
        "# Free cached CUDA memory once the XLNet section is finished, before the next model section runs\n",
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "4a48a0de-eaaa-4069-88bd-a43447193409",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "4a48a0de-eaaa-4069-88bd-a43447193409"
      },
      "source": [
        "## **T5 f1=48**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "30cdc992-5911-4e96-9fa1-b641cb764294",
      "metadata": {
        "tags": [],
        "id": "30cdc992-5911-4e96-9fa1-b641cb764294"
      },
      "outputs": [],
      "source": [
        "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
        "\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7c5ffcf1-b10a-4fec-ad1c-319851e91f4c",
      "metadata": {
        "tags": [],
        "id": "7c5ffcf1-b10a-4fec-ad1c-319851e91f4c"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "from transformers import T5Tokenizer, T5ForSequenceClassification\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "# (stratified on the 'Label' column; the fixed random_state makes the split repeatable)\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Model and tokenizer setup\n",
        "# These module-level names are read by the dataset, loaders and training loop later in this cell.\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 32\n",
        "VALID_BATCH_SIZE = 32\n",
        "EPOCHS = 40  # upper bound; the training loop may stop earlier through its patience counter\n",
        "LEARNING_RATE = 1e-05\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "\n",
        "tokenizer = T5Tokenizer.from_pretrained('t5-base')\n",
        "\n",
        "# Binary classifier built on T5ForSequenceClassification with a single output label\n",
        "class T5Class(torch.nn.Module):\n",
        "    \"\"\"Wrapper around T5ForSequenceClassification that exposes only its logits.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        self.t5 = T5ForSequenceClassification.from_pretrained('t5-base', num_labels=1)\n",
        "\n",
        "    def forward(self, input_ids, attention_mask):\n",
        "        \"\"\"Forward the ids and mask to the wrapped model and return its `logits` field.\"\"\"\n",
        "        return self.t5(input_ids=input_ids, attention_mask=attention_mask).logits\n",
        "\n",
        "\n",
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenizes one DataFrame row on demand for binary classification.\n",
        "\n",
        "    Reads the 'Tokenized Text' and 'Label' columns of `df`. Each item is a dict\n",
        "    with flattened 'input_ids' and 'attention_mask' tensors and a one-element\n",
        "    float 'targets' tensor.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        # Positional lookup: a DataLoader asks for positions 0..len-1, which match the\n",
        "        # index labels only when the frame has a fresh RangeIndex. Using .iloc makes the\n",
        "        # dataset independent of whether the caller ran reset_index first.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # float target for BCEWithLogitsLoss\n",
        "        }\n",
        "\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Evaluate `model` on `data_loader` and compute binary classification metrics.\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask) that returns one\n",
        "            logit per example.\n",
        "        data_loader: Iterable of batches (dicts) with the keys 'input_ids',\n",
        "            'attention_mask' and 'targets'.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (accuracy, precision, recall, f1, all_labels, all_preds); the last\n",
        "        two are NumPy arrays of true labels and rounded sigmoid predictions.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) instead of squeeze(): squeeze() collapses a batch holding a\n",
        "            # single example to a 0-d tensor, which extend() below cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(all_preds)\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "\n",
        "# Create test dataset and loader (15% test set)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# 5-Fold Cross-Validation setup\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best fold (lowest validation loss) and the weights that achieved it\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # File that receives this fold's weights every time the validation loss improves\n",
        "    fold_checkpoint = f'Data/ThirdIterationModels/T5_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Fresh model, loss and optimizer for every fold\n",
        "    model = T5Class()\n",
        "    model.to(device)\n",
        "\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)\n",
        "\n",
        "    # Training loop for this fold\n",
        "    patience = 8\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for batch in train_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) keeps a 1-D tensor even for a final batch of one example;\n",
        "            # squeeze() would give a 0-d tensor and targets.size(0) would then fail.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "            loss = criterion(outputs, targets)\n",
        "            loss.backward()\n",
        "            optimizer.step()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "                outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic and save best model\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the model state for the current fold\n",
        "            torch.save(model.state_dict(), fold_checkpoint)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= patience:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        # Reload the checkpoint written at this fold's best epoch. The live model has\n",
        "        # kept training for up to `patience` more epochs after that point, so\n",
        "        # model.state_dict() would hold last-epoch weights, not the weights that\n",
        "        # produced best_valid_loss. Loading to CPU avoids pinning extra GPU memory.\n",
        "        best_model_state = torch.load(fold_checkpoint, map_location='cpu')\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/T5_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n",
        "\n",
        "# Load the best model and evaluate on the test set\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/T5_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4c54e4b9-bf1e-457d-b304-6f7db220eded",
      "metadata": {
        "tags": [],
        "id": "4c54e4b9-bf1e-457d-b304-6f7db220eded"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "import torch\n",
        "import numpy as np\n",
        "\n",
        "# Function to evaluate the model on the test dataset\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    \"\"\"Run `model` over `test_loader` and return (predictions, labels).\n",
        "\n",
        "    Note the return order: predictions first, then true labels.\n",
        "\n",
        "    Args:\n",
        "        model: Module called as model(input_ids, attn_mask) that returns one\n",
        "            logit per example.\n",
        "        test_loader: Iterable of batches (dicts) with the keys 'input_ids',\n",
        "            'attention_mask' and 'targets'.\n",
        "\n",
        "    Returns:\n",
        "        Tuple of two NumPy arrays: the predictions (sigmoid of the logit rounded\n",
        "        to 0 or 1) and the true labels.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            # reshape(-1) instead of squeeze(): squeeze() collapses a batch holding a\n",
        "            # single example to a 0-d tensor, which extend() below cannot iterate.\n",
        "            targets = batch['targets'].to(device).reshape(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask).reshape(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_preds), np.array(all_labels)\n",
        "\n",
        "# Load the saved model (update the path if needed)\n",
        "# map_location keeps the load working when the checkpoint was written on a GPU\n",
        "# but this kernel runs on a different device (for example CPU only).\n",
        "model = T5Class()  # Use the same model architecture as during training\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/T5_Best_Fold.pth', map_location=device))  # Path to the saved model\n",
        "model.to(device)\n",
        "\n",
        "# Create test dataset and loader (assuming test_data is already prepared)\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# Evaluate the model on the test dataset (evaluate_on_test returns predictions first)\n",
        "y_pred, y_true = evaluate_on_test(model, test_loader)\n",
        "\n",
        "# Generate classification report\n",
        "report = classification_report(y_true, y_pred, target_names=['class 0', 'class 1'])\n",
        "\n",
        "# Print the classification report\n",
        "print(report)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "379a23b5-7b49-47af-bdee-c7174346e0cf",
      "metadata": {
        "tags": [],
        "id": "379a23b5-7b49-47af-bdee-c7174346e0cf"
      },
      "outputs": [],
      "source": [
        "# Free cached CUDA memory once the T5 section is finished, before the next model section runs\n",
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "44e9b4a5-2434-41c1-b553-c7b585108f4a",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "44e9b4a5-2434-41c1-b553-c7b585108f4a"
      },
      "source": [
        "## **LLaMA-1.3B FewShot f1=45**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c1c3e004-3669-4734-859c-030427a5b94b",
      "metadata": {
        "tags": [],
        "id": "c1c3e004-3669-4734-859c-030427a5b94b"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import LlamaTokenizer, LlamaForCausalLM\n",
        "from sklearn.metrics import classification_report\n",
        "from sklearn.model_selection import train_test_split\n",
        "import psutil\n",
        "import re\n",
        "\n",
        "# Set device to CUDA if available, otherwise use CPU\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "# Load the tokenizer and model\n",
        "tokenizer = LlamaTokenizer.from_pretrained('princeton-nlp/Sheared-LLaMA-1.3B')\n",
        "llama_model = LlamaForCausalLM.from_pretrained('princeton-nlp/Sheared-LLaMA-1.3B').to(device)\n",
        "\n",
        "\n",
        "\n",
        "# Add special tokens\n",
        "special_tokens_dict = {'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'}\n",
        "num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)\n",
        "\n",
        "# Resize token embeddings in the model to accommodate new tokens\n",
        "llama_model.resize_token_embeddings(len(tokenizer))\n",
        "\n",
        "# Function to extract the label from the generated output using regular expressions\n",
        "def extract_label_from_output(output_text):\n",
        "    \"\"\"\n",
        "    Use regular expressions to extract the label from the generated text.\n",
        "    Expected format: 'Label: [predicted label]'\n",
        "    \"\"\"\n",
        "    match = re.search(r'Label:\\s*(\\w+)', output_text)\n",
        "    if match:\n",
        "        return match.group(1)  # Extract the label\n",
        "    else:\n",
        "        return \"Label not found\"\n",
        "\n",
        "# Create few-shot prompts with a clear label structure\n",
        "def create_few_shot_prompt(train_data, test_instance, num_examples=5):\n",
        "    few_shot_examples = train_data.sample(n=num_examples)\n",
        "    prompt = \"\"\n",
        "    for _, example in few_shot_examples.iterrows():\n",
        "        prompt += f\"Text: {example['Tokenized Text']}\\nLabel: {example['Label']}\\n---\\n\"\n",
        "\n",
        "    # Ensure the test instance follows the same format\n",
        "    prompt += f\"Text: {test_instance['Tokenized Text']}\\nLabel: ?\\n\"\n",
        "    return prompt\n",
        "\n",
        "# Function to predict using the LLaMA model and extract the label\n",
        "def predict_with_llama(prompt):\n",
        "    # Tokenize and generate output using the model\n",
        "    inputs = tokenizer(prompt, return_tensors=\"pt\", padding=True, truncation=True, max_length=128).to(device)\n",
        "    input_ids = inputs['input_ids']\n",
        "\n",
        "    # Generate output with a limit on the number of new tokens to avoid noise\n",
        "    output = llama_model.generate(input_ids, max_new_tokens=10, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)\n",
        "    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)\n",
        "\n",
        "    # Extract the label from the generated output\n",
        "    predicted_label = extract_label_from_output(decoded_output)\n",
        "\n",
        "    return predicted_label\n",
        "\n",
        "# Function to batch process and predict on test data\n",
        "def batch_predict(test_data, train_data, batch_size=1):\n",
        "    predictions = []\n",
        "    true_labels = test_data['Label'].values\n",
        "    for i in range(0, len(test_data), batch_size):\n",
        "        batch = test_data.iloc[i:i + batch_size]\n",
        "        for _, test_instance in batch.iterrows():\n",
        "            prompt = create_few_shot_prompt(train_data, test_instance)\n",
        "            predicted_label = predict_with_llama(prompt)\n",
        "            predictions.append(predicted_label)\n",
        "    return predictions, true_labels\n",
        "\n",
        "# Example dataset splitting (assuming 'data' DataFrame is available)\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Monitor memory usage before processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")\n",
        "\n",
        "# Process predictions in batches to save memory\n",
        "predictions, true_labels = batch_predict(test_data, train_val_data, batch_size=1)\n",
        "\n",
        "# Ensure consistent types for labels and predictions (convert everything to strings)\n",
        "true_labels_str = [str(label) for label in true_labels]\n",
        "predictions_str = [str(pred) for pred in predictions]\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "698320cb-ff7f-4a84-9313-9e3c334c88d4",
      "metadata": {
        "tags": [],
        "id": "698320cb-ff7f-4a84-9313-9e3c334c88d4"
      },
      "outputs": [],
      "source": [
        "predictions_str_cl = ['0' if text == 'Label not found' else text for text in predictions_str]\n",
        "# Generate the classification report\n",
        "report = classification_report(true_labels_str, predictions_str_cl, target_names=['Label_1', 'Label_2'])\n",
        "print(report)\n",
        "\n",
        "# Monitor memory usage after processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "37467ad7-7e0f-47ca-a32a-2a054d1ed960",
      "metadata": {
        "tags": [],
        "id": "37467ad7-7e0f-47ca-a32a-2a054d1ed960"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "9e97d425-1b1e-4859-ac31-7beae16b4af3",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "9e97d425-1b1e-4859-ac31-7beae16b4af3"
      },
      "source": [
        "## **LLaMA-7B FewShot f1=34**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ddf316be-1345-41d3-bb55-9f3f58f6aada",
      "metadata": {
        "tags": [],
        "id": "ddf316be-1345-41d3-bb55-9f3f58f6aada"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import LlamaTokenizer, LlamaForCausalLM\n",
        "from sklearn.metrics import classification_report\n",
        "from sklearn.model_selection import train_test_split\n",
        "import psutil\n",
        "import re\n",
        "\n",
        "# Set device to CUDA if available, otherwise use CPU\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "# Load the tokenizer and model\n",
        "# tokenizer = LlamaTokenizer.from_pretrained('princeton-nlp/Sheared-LLaMA-1.3B')\n",
        "# llama_model = LlamaForCausalLM.from_pretrained('princeton-nlp/Sheared-LLaMA-1.3B').to(device)\n",
        "tokenizer = LlamaTokenizer.from_pretrained('huggyllama/llama-7b')\n",
        "llama_model = LlamaForCausalLM.from_pretrained('huggyllama/llama-7b').to(device)\n",
        "\n",
        "\n",
        "# Add special tokens\n",
        "special_tokens_dict = {'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'}\n",
        "num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)\n",
        "\n",
        "# Resize token embeddings in the model to accommodate new tokens\n",
        "llama_model.resize_token_embeddings(len(tokenizer))\n",
        "\n",
        "# Function to extract the label from the generated output using regular expressions\n",
        "def extract_label_from_output(output_text):\n",
        "    \"\"\"\n",
        "    Use regular expressions to extract the label from the generated text.\n",
        "    Expected format: 'Label: [predicted label]'\n",
        "    \"\"\"\n",
        "    match = re.search(r'Label:\\s*(\\w+)', output_text)\n",
        "    if match:\n",
        "        return match.group(1)  # Extract the label\n",
        "    else:\n",
        "        return \"Label not found\"\n",
        "\n",
        "# Create few-shot prompts with a clear label structure\n",
        "def create_few_shot_prompt(train_data, test_instance, num_examples=5):\n",
        "    few_shot_examples = train_data.sample(n=num_examples)\n",
        "    prompt = \"\"\n",
        "    for _, example in few_shot_examples.iterrows():\n",
        "        prompt += f\"Text: {example['Tokenized Text']}\\nLabel: {example['Label']}\\n---\\n\"\n",
        "\n",
        "    # Ensure the test instance follows the same format\n",
        "    prompt += f\"Text: {test_instance['Tokenized Text']}\\nLabel: ?\\n\"\n",
        "    return prompt\n",
        "\n",
        "# Function to predict using the LLaMA model and extract the label\n",
        "def predict_with_llama(prompt):\n",
        "    # Tokenize and generate output using the model\n",
        "    inputs = tokenizer(prompt, return_tensors=\"pt\", padding=True, truncation=True, max_length=128).to(device)\n",
        "    input_ids = inputs['input_ids']\n",
        "\n",
        "    # Generate output with a limit on the number of new tokens to avoid noise\n",
        "    output = llama_model.generate(input_ids, max_new_tokens=10, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)\n",
        "    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)\n",
        "\n",
        "    # Extract the label from the generated output\n",
        "    predicted_label = extract_label_from_output(decoded_output)\n",
        "\n",
        "    return predicted_label\n",
        "\n",
        "# Function to batch process and predict on test data\n",
        "def batch_predict(test_data, train_data, batch_size=1):\n",
        "    predictions = []\n",
        "    true_labels = test_data['Label'].values\n",
        "    for i in range(0, len(test_data), batch_size):\n",
        "        batch = test_data.iloc[i:i + batch_size]\n",
        "        for _, test_instance in batch.iterrows():\n",
        "            prompt = create_few_shot_prompt(train_data, test_instance)\n",
        "            predicted_label = predict_with_llama(prompt)\n",
        "            predictions.append(predicted_label)\n",
        "    return predictions, true_labels\n",
        "\n",
        "# Example dataset splitting (assuming 'data' DataFrame is available)\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Monitor memory usage before processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")\n",
        "\n",
        "# Process predictions in batches to save memory\n",
        "predictions, true_labels = batch_predict(test_data, train_val_data, batch_size=1)\n",
        "\n",
        "# Ensure consistent types for labels and predictions (convert everything to strings)\n",
        "true_labels_str = [str(label) for label in true_labels]\n",
        "predictions_str = [str(pred) for pred in predictions]\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c6af7f40-41b3-4c0e-9939-172a5ae2d327",
      "metadata": {
        "tags": [],
        "id": "c6af7f40-41b3-4c0e-9939-172a5ae2d327"
      },
      "outputs": [],
      "source": [
        "predictions_str_cl = ['0' if text == 'Label not found' else text for text in predictions_str]\n",
        "\n",
        "# Generate the classification report\n",
        "report = classification_report(true_labels_str, predictions_str_cl, target_names=['Label_1', 'Label_2'])\n",
        "print(report)\n",
        "\n",
        "# Monitor memory usage after processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a8f5bd44-40e5-44b1-9ae9-6113f3a82ba9",
      "metadata": {
        "tags": [],
        "id": "a8f5bd44-40e5-44b1-9ae9-6113f3a82ba9"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "33c65d39-8591-4a77-a98c-a611b46b56de",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "33c65d39-8591-4a77-a98c-a611b46b56de"
      },
      "source": [
        "## **Claudia FewShot f1=23**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "de199260-634d-4b78-b7f1-b1fd99a6ab5d",
      "metadata": {
        "tags": [],
        "id": "de199260-634d-4b78-b7f1-b1fd99a6ab5d"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
        "from sklearn.metrics import classification_report\n",
        "from sklearn.model_selection import train_test_split\n",
        "import psutil\n",
        "import re\n",
        "from torch.cuda.amp import autocast, GradScaler\n",
        "\n",
        "# Set device to CUDA if available, otherwise use CPU\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "# Load the tokenizer and model\n",
        "tokenizer = AutoTokenizer.from_pretrained('KoboldAI/OPT-13B-Erebus')\n",
        "llama_model = AutoModelForCausalLM.from_pretrained('KoboldAI/OPT-13B-Erebus').to(device)\n",
        "\n",
        "# Enable gradient checkpointing to save memory\n",
        "llama_model.gradient_checkpointing_enable()\n",
        "\n",
        "# Add special tokens\n",
        "special_tokens_dict = {'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'}\n",
        "num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)\n",
        "\n",
        "# Resize token embeddings in the model to accommodate new tokens\n",
        "llama_model.resize_token_embeddings(len(tokenizer))\n",
        "\n",
        "# Function to extract the label from the generated output using regular expressions\n",
        "def extract_label_from_output(output_text):\n",
        "    \"\"\"\n",
        "    Use regular expressions to extract the label from the generated text.\n",
        "    Expected format: 'Label: [predicted label]'\n",
        "    \"\"\"\n",
        "    match = re.search(r'Label:\\s*(\\w+)', output_text)\n",
        "    if match:\n",
        "        return match.group(1)  # Extract the label\n",
        "    else:\n",
        "        return \"Label not found\"\n",
        "\n",
        "# Create few-shot prompts with a clear label structure\n",
        "def create_few_shot_prompt(train_data, test_instance, num_examples=5):\n",
        "    few_shot_examples = train_data.sample(n=num_examples)\n",
        "    prompt = \"\"\n",
        "    for _, example in few_shot_examples.iterrows():\n",
        "        prompt += f\"Text: {example['Tokenized Text']}\\nLabel: {example['Label']}\\n---\\n\"\n",
        "\n",
        "    # Ensure the test instance follows the same format\n",
        "    prompt += f\"Text: {test_instance['Tokenized Text']}\\nLabel: ?\\n\"\n",
        "    return prompt\n",
        "\n",
        "# Function to predict using the LLaMA model and extract the label\n",
        "def predict_with_llama(prompt):\n",
        "    # Tokenize and generate output using the model\n",
        "    inputs = tokenizer(prompt, return_tensors=\"pt\", padding=True, truncation=True, max_length=64).to(device)  # Reduce max_length\n",
        "    input_ids = inputs['input_ids']\n",
        "\n",
        "    # Mixed precision training using autocast\n",
        "    with autocast():\n",
        "        output = llama_model.generate(input_ids, max_new_tokens=10, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)\n",
        "\n",
        "    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)\n",
        "\n",
        "    # Extract the label from the generated output\n",
        "    predicted_label = extract_label_from_output(decoded_output)\n",
        "\n",
        "    return predicted_label\n",
        "\n",
        "# Function to batch process and predict on test data\n",
        "def batch_predict(test_data, train_data, batch_size=1):\n",
        "    predictions = []\n",
        "    true_labels = test_data['Label'].values\n",
        "    scaler = GradScaler()  # Use mixed precision scaling\n",
        "    for i in range(0, len(test_data), batch_size):\n",
        "        batch = test_data.iloc[i:i + batch_size]\n",
        "        for _, test_instance in batch.iterrows():\n",
        "            prompt = create_few_shot_prompt(train_data, test_instance)\n",
        "            predicted_label = predict_with_llama(prompt)\n",
        "            predictions.append(predicted_label)\n",
        "    return predictions, true_labels\n",
        "\n",
        "# Example dataset splitting (assuming 'data' DataFrame is available)\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Monitor memory usage before processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")\n",
        "\n",
        "# Process predictions in batches to save memory\n",
        "predictions, true_labels = batch_predict(test_data, train_val_data, batch_size=1)\n",
        "\n",
        "# Ensure consistent types for labels and predictions (convert everything to strings)\n",
        "true_labels_str = [str(label) for label in true_labels]\n",
        "predictions_str = [str(pred) for pred in predictions]\n",
        "\n",
        "# Print classification report\n",
        "print(classification_report(true_labels_str, predictions_str))\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "366d8aa4-ea8e-4f24-b754-d7d4b8a07974",
      "metadata": {
        "tags": [],
        "id": "366d8aa4-ea8e-4f24-b754-d7d4b8a07974"
      },
      "outputs": [],
      "source": [
        "predictions_str_cl = ['0' if text == 'Label not found' else text for text in predictions_str]\n",
        "\n",
        "predictions_str_cl = ['0' if text == 'rpi' else text for text in predictions_str_cl]\n",
        "predictions_str_cl = ['0' if text == 'I' else text for text in predictions_str_cl]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b6804df9-36f1-4d13-b091-50ecc3b56d93",
      "metadata": {
        "tags": [],
        "id": "b6804df9-36f1-4d13-b091-50ecc3b56d93"
      },
      "outputs": [],
      "source": [
        "predictions_str_cl = ['0' if text == 'Text' else text for text in predictions_str_cl]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "11f27a99-0406-492b-b4db-cca9a2ddf144",
      "metadata": {
        "tags": [],
        "id": "11f27a99-0406-492b-b4db-cca9a2ddf144"
      },
      "outputs": [],
      "source": [
        "# Generate the classification report\n",
        "report = classification_report(true_labels_str, predictions_str_cl, target_names=['Label_1', 'Label_2'])\n",
        "print(report)\n",
        "\n",
        "# Monitor memory usage after processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8b1814d5-ffb5-416e-bdaf-4b9871cf3a58",
      "metadata": {
        "tags": [],
        "id": "8b1814d5-ffb5-416e-bdaf-4b9871cf3a58"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "4c80de02-fefd-4fc5-bf80-6c33020efa30",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "4c80de02-fefd-4fc5-bf80-6c33020efa30"
      },
      "source": [
        "## **OPT-13B FewShot f1=40**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "557ab974-d6f7-4ce1-b11b-9f26139f1f43",
      "metadata": {
        "tags": [],
        "id": "557ab974-d6f7-4ce1-b11b-9f26139f1f43"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
        "from sklearn.metrics import classification_report\n",
        "from sklearn.model_selection import train_test_split\n",
        "import psutil\n",
        "import re\n",
        "\n",
        "# Set device to CUDA if available, otherwise use CPU\n",
        "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "# Load the tokenizer and model\n",
        "tokenizer = AutoTokenizer.from_pretrained('KoboldAI/OPT-13B-Erebus')\n",
        "llama_model = AutoModelForCausalLM.from_pretrained('KoboldAI/OPT-13B-Erebus').to(device)\n",
        "\n",
        "\n",
        "# Add special tokens\n",
        "special_tokens_dict = {'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'}\n",
        "num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)\n",
        "\n",
        "# Resize token embeddings in the model to accommodate new tokens\n",
        "llama_model.resize_token_embeddings(len(tokenizer))\n",
        "\n",
        "# Function to extract the label from the generated output using regular expressions\n",
        "def extract_label_from_output(output_text):\n",
        "    \"\"\"\n",
        "    Use regular expressions to extract the label from the generated text.\n",
        "    Expected format: 'Label: [predicted label]'\n",
        "    \"\"\"\n",
        "    match = re.search(r'Label:\\s*(\\w+)', output_text)\n",
        "    if match:\n",
        "        return match.group(1)  # Extract the label\n",
        "    else:\n",
        "        return \"Label not found\"\n",
        "\n",
        "# Create few-shot prompts with a clear label structure\n",
        "def create_few_shot_prompt(train_data, test_instance, num_examples=5):\n",
        "    few_shot_examples = train_data.sample(n=num_examples)\n",
        "    prompt = \"\"\n",
        "    for _, example in few_shot_examples.iterrows():\n",
        "        prompt += f\"Text: {example['Tokenized Text']}\\nLabel: {example['Label']}\\n---\\n\"\n",
        "\n",
        "    # Ensure the test instance follows the same format\n",
        "    prompt += f\"Text: {test_instance['Tokenized Text']}\\nLabel: ?\\n\"\n",
        "    return prompt\n",
        "\n",
        "# Function to predict using the LLaMA model and extract the label\n",
        "def predict_with_llama(prompt):\n",
        "    # Tokenize and generate output using the model\n",
        "    inputs = tokenizer(prompt, return_tensors=\"pt\", padding=True, truncation=True, max_length=128).to(device)\n",
        "    input_ids = inputs['input_ids']\n",
        "\n",
        "    # Generate output with a limit on the number of new tokens to avoid noise\n",
        "    output = llama_model.generate(input_ids, max_new_tokens=10, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)\n",
        "    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)\n",
        "\n",
        "    # Extract the label from the generated output\n",
        "    predicted_label = extract_label_from_output(decoded_output)\n",
        "\n",
        "    return predicted_label\n",
        "\n",
        "# Function to batch process and predict on test data\n",
        "def batch_predict(test_data, train_data, batch_size=1):\n",
        "    predictions = []\n",
        "    true_labels = test_data['Label'].values\n",
        "    for i in range(0, len(test_data), batch_size):\n",
        "        batch = test_data.iloc[i:i + batch_size]\n",
        "        for _, test_instance in batch.iterrows():\n",
        "            prompt = create_few_shot_prompt(train_data, test_instance)\n",
        "            predicted_label = predict_with_llama(prompt)\n",
        "            predictions.append(predicted_label)\n",
        "    return predictions, true_labels\n",
        "\n",
        "# Example dataset splitting (assuming 'data' DataFrame is available)\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Monitor memory usage before processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")\n",
        "\n",
        "# Process predictions in batches to save memory\n",
        "predictions, true_labels = batch_predict(test_data, train_val_data, batch_size=1)\n",
        "\n",
        "# Ensure consistent types for labels and predictions (convert everything to strings)\n",
        "true_labels_str = [str(label) for label in true_labels]\n",
        "predictions_str = [str(pred) for pred in predictions]\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3b351902-4812-4a8d-a1c7-df062f67b787",
      "metadata": {
        "id": "3b351902-4812-4a8d-a1c7-df062f67b787"
      },
      "outputs": [],
      "source": [
        "predictions_str_cl = ['0' if text == 'Label not found' else text for text in predictions_str]\n",
        "predictions_str_cl = ['0' if text == 'Text' else text for text in predictions_str_cl]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ea40c5f4-8058-4d1c-9f6a-c62adfb8437a",
      "metadata": {
        "tags": [],
        "id": "ea40c5f4-8058-4d1c-9f6a-c62adfb8437a"
      },
      "outputs": [],
      "source": [
        "predictions_str_cl = ['0' if text == 'cc' else text for text in predictions_str_cl]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8f53fe9a-aa6b-4ba4-bbf5-65f3d5b34dc0",
      "metadata": {
        "tags": [],
        "id": "8f53fe9a-aa6b-4ba4-bbf5-65f3d5b34dc0"
      },
      "outputs": [],
      "source": [
        "\n",
        "# Generate the classification report\n",
        "report = classification_report(true_labels_str, predictions_str_cl, target_names=['Label_1', 'Label_2'])\n",
        "print(report)\n",
        "\n",
        "# Monitor memory usage after processing\n",
        "print(f\"Memory usage: {psutil.virtual_memory().percent}%\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "60dcd791-7ca1-413e-9d01-d80b80262adc",
      "metadata": {
        "tags": [],
        "id": "60dcd791-7ca1-413e-9d01-d80b80262adc"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "88d02bf2-20c2-4347-97f0-95b85007fadc",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "88d02bf2-20c2-4347-97f0-95b85007fadc"
      },
      "source": [
        "## **LoRA Fine-Tuning LLaMA 1.3B f1=33**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a1f5963d-aa45-4e0d-a7c3-269f4501fcf9",
      "metadata": {
        "tags": [],
        "id": "a1f5963d-aa45-4e0d-a7c3-269f4501fcf9"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "from transformers import LlamaTokenizer, LlamaForCausalLM\n",
        "from sklearn.metrics import classification_report\n",
        "from sklearn.model_selection import train_test_split\n",
        "import psutil\n",
        "import re\n",
        "from transformers import LlamaModel, LlamaTokenizer\n",
        "from peft import get_peft_model, LoraConfig\n",
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "78158d55-a4c4-480c-9927-ef4eeda754f1",
      "metadata": {
        "tags": [],
        "id": "78158d55-a4c4-480c-9927-ef4eeda754f1"
      },
      "outputs": [],
      "source": [
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Model and tokenizer setup (changing to LLaMA with LoRA)\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 8\n",
        "VALID_BATCH_SIZE = 8\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-05\n",
        "WEIGHT_DECAY = 0.01  # Regularization\n",
        "DROPOUT_RATE = 0.3  # Regularization\n",
        "\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "\n",
        "# Load the LLaMA model and tokenizer\n",
        "tokenizer = LlamaTokenizer.from_pretrained('princeton-nlp/Sheared-LLaMA-1.3B')\n",
        "llama_model = LlamaModel.from_pretrained('princeton-nlp/Sheared-LLaMA-1.3B')  # Update with correct path/model name\n",
        "\n",
        "tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'})\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "584c4e30-8a4e-4557-8071-f3f5f365c4f2",
      "metadata": {
        "tags": [],
        "id": "584c4e30-8a4e-4557-8071-f3f5f365c4f2"
      },
      "outputs": [],
      "source": [
        "class LoRALLaMAClass(torch.nn.Module):\n",
        "    def __init__(self, model, classifier_dropout=0.1):\n",
        "        super(LoRALLaMAClass, self).__init__()\n",
        "        self.model = model  # LLaMA model\n",
        "        self.config = self.model.config  # Accessing the config of the LLaMA model\n",
        "        self.dropout = torch.nn.Dropout(classifier_dropout)\n",
        "        self.classifier = torch.nn.Linear(self.config.hidden_size, 1)  # Assuming binary classification\n",
        "\n",
        "    def forward(self, input_ids, attn_mask):\n",
        "        # Forward pass through LLaMA model\n",
        "        output = self.model(input_ids=input_ids, attention_mask=attn_mask)\n",
        "        pooled_output = self.dropout(output.last_hidden_state[:, 0])  # CLS token for classification\n",
        "        return self.classifier(pooled_output)\n",
        "\n",
        "# Reinitialize weights for the embedding layer within the LLaMA model\n",
        "def reinitialize_weights(module):\n",
        "    if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)):\n",
        "        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2b1315e3-ec9d-41a8-a1a4-acf537844ef9",
      "metadata": {
        "tags": [],
        "id": "2b1315e3-ec9d-41a8-a1a4-acf537844ef9"
      },
      "outputs": [],
      "source": [
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenises the 'Tokenized Text' column of a DataFrame and pairs it with its 'Label'.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame with 'Tokenized Text' and 'Label' columns.\n",
        "        tokenizer: Object exposing ``encode_plus`` (called as shown in ``__getitem__``).\n",
        "        max_len: Every sample is padded / truncated to this many tokens.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        \"\"\"Return ``input_ids``, ``attention_mask`` and a one-element float ``targets`` tensor.\"\"\"\n",
        "        # `index` is a position in 0..len-1. `.iloc` reads by position, so a frame whose index\n",
        "        # was not reset (e.g. a slice or a shuffled split) no longer raises KeyError.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            # return_tensors='pt' adds a leading batch axis of size 1; flatten() removes it\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # Use FloatTensor for binary classification\n",
        "        }\n",
        "\n",
        "# Held-out evaluation loader for the 15% test split (row order preserved, no shuffling)\n",
        "test_data = test_data.reset_index(drop=True)\n",
        "test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(\n",
        "    dataset=test_dataset,\n",
        "    batch_size=VALID_BATCH_SIZE,\n",
        "    shuffle=False,\n",
        "    num_workers=0,\n",
        ")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "897e0ae7-ab23-4763-a1d9-31caf7f98ae3",
      "metadata": {
        "tags": [],
        "id": "897e0ae7-ab23-4763-a1d9-31caf7f98ae3"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "# Debug aid: ask CUDA to run kernel launches synchronously so an error is reported at the\n",
        "# call that caused it. NOTE(review): presumably must be set before CUDA is initialised - confirm.\n",
        "os.environ['CUDA_LAUNCH_BLOCKING'] = '1'\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c80d5957-0ff1-4b1b-91f0-f0900ed69747",
      "metadata": {
        "tags": [],
        "id": "c80d5957-0ff1-4b1b-91f0-f0900ed69747"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
        "import torch\n",
        "import numpy as np\n",
        "from torch import optim\n",
        "from torch.utils.data import DataLoader\n",
        "import torch.nn as nn\n",
        "import gc\n",
        "from sklearn.model_selection import StratifiedKFold, train_test_split\n",
        "\n",
        "# Environment setting for debugging CUDA errors\n",
        "import os\n",
        "os.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Score `model` on every batch of `data_loader`.\n",
        "\n",
        "    Returns:\n",
        "        ``(accuracy, precision, recall, f1, labels, predictions)``; the last two are NumPy\n",
        "        arrays built from the per-sample rows collected across all batches.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    pred_rows, label_rows = [], []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            token_ids = batch['input_ids'].to(device)\n",
        "            mask = batch['attention_mask'].to(device)\n",
        "            gold = batch['targets'].to(device).float().view(-1, 1)  # one row per sample\n",
        "\n",
        "            # Ids above the embedding table are clamped to the last valid id\n",
        "            token_ids = torch.clamp(token_ids, max=model.config.vocab_size - 1)\n",
        "\n",
        "            logits = model(token_ids, mask).view(-1, 1)\n",
        "            hard_preds = torch.round(torch.sigmoid(logits))  # threshold at 0.5\n",
        "\n",
        "            pred_rows.extend(hard_preds.cpu().numpy())\n",
        "            label_rows.extend(gold.cpu().numpy())\n",
        "\n",
        "    all_preds = np.array(pred_rows)\n",
        "    all_labels = np.array(label_rows)\n",
        "\n",
        "    # Same four sklearn metrics, evaluated in the original order\n",
        "    scores = [\n",
        "        metric(all_labels, all_preds)\n",
        "        for metric in (accuracy_score, precision_score, recall_score, f1_score)\n",
        "    ]\n",
        "    return (*scores, all_labels, all_preds)\n",
        "\n",
        "# Setup for 5-Fold Cross-Validation (stratified on the label, shuffled with a fixed seed)\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best model and fold across the whole cross-validation run\n",
        "best_model_state = None\n",
        "best_fold = -1  # -1 until a fold has been selected\n",
        "best_loss = float('inf')  # lowest validation loss seen in any fold so far\n",
        "\n",
        "# Hyperparameters\n",
        "MAX_LEN = 512  # passed to CustomDataset as max_len (pad / truncate length)\n",
        "TRAIN_BATCH_SIZE = 2  # Further reduced batch size\n",
        "VALID_BATCH_SIZE = 2  # Reduced batch size\n",
        "EPOCHS = 10  # Start with fewer epochs to check early stopping\n",
        "LEARNING_RATE = 1e-4  # Increased learning rate\n",
        "WEIGHT_DECAY = 1e-5\n",
        "ACCUMULATION_STEPS = 8  # Gradient accumulation: the optimizer steps every this many batches\n",
        "CLIP_VALUE = 0.05  # More aggressive gradient clipping (max gradient norm)\n",
        "PATIENCE = 5  # Early stopping patience: epochs without a validation-loss improvement\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "\n",
        "# Use GradScaler for mixed precision training (one scaler shared by all folds)\n",
        "# NOTE(review): the GPT-2 cell further down uses the torch.amp.GradScaler spelling - consider aligning.\n",
        "scaler = torch.cuda.amp.GradScaler()\n",
        "\n",
        "# Function to reinitialize weights of classifier\n",
        "def reinitialize_classifier_weights(layer):\n",
        "    \"\"\"Xavier-initialise a Linear layer's weight and zero its bias; ignore other layer types.\n",
        "\n",
        "    Written for ``module.apply(...)``. A layer built with ``bias=False`` has ``bias is None``,\n",
        "    which ``nn.init.zeros_`` cannot handle, so the bias is only reset when it exists.\n",
        "    \"\"\"\n",
        "    if isinstance(layer, nn.Linear):\n",
        "        nn.init.xavier_uniform_(layer.weight)\n",
        "        if layer.bias is not None:\n",
        "            nn.init.zeros_(layer.bias)\n",
        "\n",
        "# Start cross-validation\n",
        "import copy  # stdlib; gives every fold its own copy of the pretrained backbone\n",
        "\n",
        "CHECKPOINT_DIR = 'Data/ThirdIterationModels'\n",
        "os.makedirs(CHECKPOINT_DIR, exist_ok=True)  # torch.save does not create missing directories\n",
        "\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "    fold_checkpoint = f'{CHECKPOINT_DIR}/LLaMA_LoRA_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Drop the previous fold's model and optimizer before allocating new ones\n",
        "    model = None\n",
        "    optimizer = None\n",
        "    gc.collect()\n",
        "    torch.cuda.empty_cache()\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data, tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data, tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Every fold must start from the pretrained weights. Wrapping the shared `llama_model`\n",
        "    # directly would carry the previous fold's fine-tuning (which trained on rows that are\n",
        "    # validation rows here) into this fold, so each fold trains a private copy instead.\n",
        "    model = LoRALLaMAClass(copy.deepcopy(llama_model))\n",
        "\n",
        "    # Reinitialize classifier weights\n",
        "    model.classifier.apply(reinitialize_classifier_weights)\n",
        "\n",
        "    # Ensure all model parameters have requires_grad=True\n",
        "    for param in model.parameters():\n",
        "        param.requires_grad = True\n",
        "\n",
        "    # Resize token embeddings in case special tokens are added\n",
        "    model.model.resize_token_embeddings(len(tokenizer))\n",
        "\n",
        "    model.to(device)\n",
        "\n",
        "    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "    steps_per_epoch = len(train_loader)\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        # Gradients are cleared once per accumulation window. Clearing them on every batch\n",
        "        # would discard all but the last micro-batch of each window.\n",
        "        optimizer.zero_grad()\n",
        "\n",
        "        for step, batch in enumerate(train_loader):\n",
        "            input_ids = batch['input_ids'].to(device)\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            targets = batch['targets'].to(device).float().view(-1, 1)  # Ensure correct target shape and dtype\n",
        "\n",
        "            # Clip token IDs to be within the model's vocab size\n",
        "            input_ids = torch.clamp(input_ids, max=model.config.vocab_size - 1)\n",
        "\n",
        "            # Mixed precision training with autocast\n",
        "            with torch.amp.autocast('cuda'):\n",
        "                outputs = model(input_ids, attn_mask).view(-1, 1)  # Ensure the outputs are of correct shape\n",
        "                loss = criterion(outputs, targets)\n",
        "\n",
        "            # Divide by the window length so the summed micro-batch gradients form an average\n",
        "            # (a shorter final window is simply weighted a little lower).\n",
        "            scaler.scale(loss / ACCUMULATION_STEPS).backward()\n",
        "\n",
        "            # Step at the end of each window and on the last batch of the epoch\n",
        "            if (step + 1) % ACCUMULATION_STEPS == 0 or (step + 1) == steps_per_epoch:\n",
        "                # Clip the true gradients: they must be unscaled first, otherwise the norm\n",
        "                # being clipped is multiplied by the current loss scale.\n",
        "                scaler.unscale_(optimizer)\n",
        "                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)\n",
        "                scaler.step(optimizer)\n",
        "                scaler.update()\n",
        "                optimizer.zero_grad()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted == targets).sum().item()\n",
        "\n",
        "            torch.cuda.empty_cache()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device)\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                targets = batch['targets'].to(device).float().view(-1, 1)\n",
        "\n",
        "                input_ids = torch.clamp(input_ids, max=model.config.vocab_size - 1)\n",
        "\n",
        "                outputs = model(input_ids, attn_mask).view(-1, 1)\n",
        "                loss = criterion(outputs, targets)\n",
        "                valid_loss += loss.item()\n",
        "                predicted = torch.round(torch.sigmoid(outputs))\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted == targets).sum().item()\n",
        "\n",
        "                torch.cuda.empty_cache()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the best model state for the current fold\n",
        "            torch.save(model.state_dict(), fold_checkpoint)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= PATIENCE:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "\n",
        "if best_fold == -1:\n",
        "    raise RuntimeError('No fold produced a finite validation loss; there is no model to save.')\n",
        "\n",
        "# Save the best model across all folds. The winning fold's checkpoint is re-read from disk\n",
        "# because it holds that fold's best-epoch weights, whereas the live model holds the\n",
        "# last-epoch weights of whichever fold ran last.\n",
        "best_model_state = torch.load(f'{CHECKPOINT_DIR}/LLaMA_LoRA_Fold_{best_fold}.pth', map_location='cpu')\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/LLaMA_LoRA_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "303f9d56-7467-4786-9e01-8dc2a5279eda",
      "metadata": {
        "tags": [],
        "id": "303f9d56-7467-4786-9e01-8dc2a5279eda"
      },
      "outputs": [],
      "source": [
        "# Load the best model and evaluate on the test set.\n",
        "# `model` is the instance left over from the cross-validation loop; its weights are replaced here.\n",
        "# map_location lets a checkpoint written on a GPU be restored on whatever `device` is in use.\n",
        "model.load_state_dict(torch.load('Data/ThirdIterationModels/LLaMA_LoRA_Best_Fold.pth', map_location=device))\n",
        "model.to(device)\n",
        "\n",
        "# Evaluate on the test set\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    \"\"\"Run `model` over `test_loader` and return ``(true_labels, predicted_labels)``.\n",
        "\n",
        "    Both results are 1-D NumPy arrays with one entry per sample; predictions are the\n",
        "    sigmoid outputs rounded to 0.0 / 1.0.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    # Follow the model's own device instead of relying on a notebook-level global\n",
        "    first_param = next(model.parameters(), None)\n",
        "    run_device = first_param.device if first_param is not None else torch.device('cpu')\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(run_device)\n",
        "            attn_mask = batch['attention_mask'].to(run_device)\n",
        "            # view(-1) rather than squeeze(): squeeze() turns a final batch of size 1 into a\n",
        "            # 0-d tensor, and list.extend() cannot iterate over the resulting 0-d array.\n",
        "            targets = batch['targets'].to(run_device).view(-1)\n",
        "\n",
        "            outputs = model(input_ids, attn_mask).view(-1)\n",
        "            predicted = torch.round(torch.sigmoid(outputs))\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    return np.array(all_labels), np.array(all_preds)\n",
        "\n",
        "# Score the held-out test split with the restored model\n",
        "true_labels, predicted_labels = evaluate_on_test(model, test_loader)\n",
        "\n",
        "# Per-class precision / recall / F1 summary\n",
        "print(\"\\nClassification Report on Test Set:\")\n",
        "test_report = classification_report(\n",
        "    true_labels,\n",
        "    predicted_labels,\n",
        "    target_names=['Class 0', 'Class 1'],\n",
        ")\n",
        "print(test_report)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "aa65b6a8-07c0-46db-9078-60346f982bd7",
      "metadata": {
        "tags": [],
        "id": "aa65b6a8-07c0-46db-9078-60346f982bd7"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "# Release cached GPU memory that PyTorch's allocator holds but is not currently using\n",
        "torch.cuda.empty_cache()\n",
        "# NOTE(review): presumably reclaims memory tied to CUDA IPC handles - confirm it is needed here\n",
        "torch.cuda.ipc_collect()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3788b33b-a842-4b6b-b4c3-32d6f8c7e827",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "3788b33b-a842-4b6b-b4c3-32d6f8c7e827"
      },
      "source": [
        "## **LoRA Fine-tune GPT-2 (F1 = 33)**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "dbc5525f-0275-49e3-b801-31998b933bcb",
      "metadata": {
        "tags": [],
        "id": "dbc5525f-0275-49e3-b801-31998b933bcb"
      },
      "outputs": [],
      "source": [
        "# Hyperparameters for the GPT-2 run.\n",
        "# NOTE(review): the training cell further down reassigns MAX_LEN, TRAIN_BATCH_SIZE,\n",
        "# VALID_BATCH_SIZE, EPOCHS, LEARNING_RATE and WEIGHT_DECAY before its training loop runs -\n",
        "# confirm which set of values is intended.\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 8\n",
        "VALID_BATCH_SIZE = 8\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-08\n",
        "WEIGHT_DECAY = 0.01  # Regularization\n",
        "DROPOUT_RATE = 0.3  # Regularization"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f8f92b78-6e0b-4737-9f4e-63dde5077531",
      "metadata": {
        "tags": [],
        "id": "f8f92b78-6e0b-4737-9f4e-63dde5077531"
      },
      "outputs": [],
      "source": [
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Tokenises the 'Tokenized Text' column of a DataFrame and pairs it with its 'Label'.\n",
        "\n",
        "    Args:\n",
        "        df: DataFrame with 'Tokenized Text' and 'Label' columns.\n",
        "        tokenizer: Object exposing ``encode_plus`` (called as shown in ``__getitem__``).\n",
        "        max_len: Every sample is padded / truncated to this many tokens.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        \"\"\"Return ``input_ids``, ``attention_mask`` and a one-element float ``targets`` tensor.\"\"\"\n",
        "        # `index` is a position in 0..len-1. `.iloc` reads by position, so a frame whose index\n",
        "        # was not reset (e.g. a slice or a shuffled split) no longer raises KeyError.\n",
        "        text = str(self.texts.iloc[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            # return_tensors='pt' adds a leading batch axis of size 1; flatten() removes it\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels.iloc[index]])  # Use FloatTensor for binary classification\n",
        "        }\n",
        "\n",
        "# # Create test dataset and loader (15% test set)\n",
        "# test_data = test_data.reset_index(drop=True)\n",
        "# test_dataset = CustomDataset(test_data, tokenizer, MAX_LEN)\n",
        "# test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "718ff742-b0ea-454e-aca6-1cf40269834b",
      "metadata": {
        "tags": [],
        "id": "718ff742-b0ea-454e-aca6-1cf40269834b"
      },
      "outputs": [],
      "source": [
        "# Environment setting for debugging CUDA errors\n",
        "import os\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
        "from torch.utils.data import DataLoader\n",
        "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
        "import numpy as np\n",
        "\n",
        "os.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"\n",
        "\n",
        "# Run on the GPU when one is visible, otherwise on the CPU\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "\n",
        "# Stratified split: 85% for training/validation, 15% held out for the final test\n",
        "train_val_data, test_data = train_test_split(\n",
        "    data,\n",
        "    test_size=0.15,\n",
        "    random_state=46,\n",
        "    stratify=data['Label'],\n",
        ")\n",
        "\n",
        "# GPT-2 tokenizer, configured to pad on the left\n",
        "tokenizer = AutoTokenizer.from_pretrained('gpt2', padding_side='left')\n",
        "\n",
        "# GPT-2 defines no pad token, so the end-of-text token doubles as padding\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "\n",
        "# Pretrained GPT-2 with its language-modelling head, placed on the chosen device\n",
        "model = AutoModelForCausalLM.from_pretrained('gpt2')\n",
        "model = model.to(device)\n",
        "\n",
        "# CustomDataset class definition\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    \"\"\"Pairs each text with its label and tokenises the text on access.\n",
        "\n",
        "    Args:\n",
        "        texts: Sequence of texts (list, pandas Series, ...).\n",
        "        labels: Sequence of 0/1 labels aligned with `texts`.\n",
        "        tokenizer: Object exposing ``encode_plus`` (called as shown in ``__getitem__``).\n",
        "        max_len: Every sample is padded / truncated to this many tokens.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, texts, labels, tokenizer, max_len):\n",
        "        # Materialise as plain lists so items are always fetched by position. A pandas Series\n",
        "        # would otherwise be indexed by label, which fails when its index was not reset.\n",
        "        self.texts = list(texts)\n",
        "        self.labels = list(labels)\n",
        "        self.tokenizer = tokenizer\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        \"\"\"Return ``input_ids``, ``attention_mask`` and a one-element float ``targets`` tensor.\"\"\"\n",
        "        text = str(self.texts[index])\n",
        "        text = \" \".join(text.split())  # collapse runs of whitespace\n",
        "\n",
        "        # Encode the text using the tokenizer\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',  # Pad to the max length\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            # return_tensors='pt' adds a leading batch axis of size 1; flatten() removes it\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels[index]])  # Use FloatTensor for binary classification\n",
        "        }\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    \"\"\"Score the causal-LM `model` as a binary classifier on `data_loader`.\n",
        "\n",
        "    The sigmoid of vocabulary logit 0 at the last sequence position is rounded to obtain the\n",
        "    predicted class.\n",
        "\n",
        "    Returns:\n",
        "        ``(accuracy, precision, recall, f1, labels, predictions)``; the last two are NumPy\n",
        "        arrays built from the per-sample rows collected across all batches.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    prob_rows, label_rows = [], []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            token_ids = batch['input_ids'].to(device).long()  # embedding lookup needs integer ids\n",
        "            mask = batch['attention_mask'].to(device)\n",
        "            gold = batch['targets'].to(device).view(-1, 1)  # one row per sample\n",
        "\n",
        "            lm_out = model(input_ids=token_ids, attention_mask=mask)\n",
        "\n",
        "            # Logits of the last position; entry 0 is squashed into (0, 1)\n",
        "            last_step = lm_out.logits[:, -1, :]\n",
        "            probs = torch.sigmoid(last_step[:, 0]).unsqueeze(-1)\n",
        "\n",
        "            prob_rows.extend(probs.cpu().numpy())\n",
        "            label_rows.extend(gold.cpu().numpy())\n",
        "\n",
        "    all_preds = np.round(np.array(prob_rows))  # threshold the probabilities to 0 / 1\n",
        "    all_labels = np.array(label_rows)\n",
        "\n",
        "    # Same four sklearn metrics, evaluated in the original order\n",
        "    scores = [\n",
        "        metric(all_labels, all_preds)\n",
        "        for metric in (accuracy_score, precision_score, recall_score, f1_score)\n",
        "    ]\n",
        "    return (*scores, all_labels, all_preds)\n",
        "\n",
        "# Setup for 5-Fold Cross-Validation (stratified on the label, shuffled with a fixed seed)\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best model and fold across the whole cross-validation run\n",
        "best_model_state = None\n",
        "best_fold = -1  # -1 until a fold has been selected\n",
        "best_loss = float('inf')  # lowest validation loss seen in any fold so far\n",
        "\n",
        "# Hyperparameters\n",
        "MAX_LEN = 512  # passed to CustomDataset as max_len (pad / truncate length)\n",
        "TRAIN_BATCH_SIZE = 2  # Reduced batch size\n",
        "VALID_BATCH_SIZE = 2  # Reduced batch size\n",
        "EPOCHS = 10  # Start with fewer epochs\n",
        "LEARNING_RATE = 1e-4\n",
        "WEIGHT_DECAY = 1e-5\n",
        "ACCUMULATION_STEPS = 8  # Gradient accumulation: the optimizer steps every this many batches\n",
        "CLIP_VALUE = 0.05  # Gradient clipping (max gradient norm)\n",
        "PATIENCE = 5  # Early stopping patience: epochs without a validation-loss improvement\n",
        "\n",
        "# Use GradScaler for mixed precision training (one scaler shared by all folds)\n",
        "scaler = torch.amp.GradScaler()\n",
        "\n",
        "# Function to reinitialize weights of classifier\n",
        "def reinitialize_classifier_weights(layer):\n",
        "    \"\"\"Xavier-initialise a Linear layer's weight and zero its bias; ignore other layer types.\n",
        "\n",
        "    Written for ``module.apply(...)``. A layer built with ``bias=False`` has ``bias is None``,\n",
        "    which ``nn.init.zeros_`` cannot handle, so the bias is only reset when it exists.\n",
        "    \"\"\"\n",
        "    if isinstance(layer, nn.Linear):\n",
        "        nn.init.xavier_uniform_(layer.weight)\n",
        "        if layer.bias is not None:\n",
        "            nn.init.zeros_(layer.bias)\n",
        "\n",
        "# Start cross-validation\n",
        "CHECKPOINT_DIR = 'Data/ThirdIterationModels'\n",
        "os.makedirs(CHECKPOINT_DIR, exist_ok=True)  # torch.save does not create missing directories\n",
        "\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "    fold_checkpoint = f'{CHECKPOINT_DIR}/GPT2_Fold_{fold + 1}.pth'\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data['Tokenized Text'], train_data['Label'], tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data['Tokenized Text'], val_data['Label'], tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Initialize a fresh pretrained GPT-2 for this fold\n",
        "    model = AutoModelForCausalLM.from_pretrained('gpt2')\n",
        "    model.to(device)\n",
        "\n",
        "    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "    steps_per_epoch = len(train_loader)\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        # Gradients are cleared once per accumulation window. Clearing them on every batch\n",
        "        # would discard all but the last micro-batch of each window.\n",
        "        optimizer.zero_grad()\n",
        "\n",
        "        for step, batch in enumerate(train_loader):\n",
        "            input_ids = batch['input_ids'].to(device).long()  # Ensure input_ids are LongTensor\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            targets = batch['targets'].to(device).float().view(-1, 1)  # Keep targets as float for BCEWithLogitsLoss\n",
        "\n",
        "            # Mixed precision training with autocast\n",
        "            with torch.amp.autocast(device_type='cuda'):\n",
        "                outputs = model(input_ids=input_ids, attention_mask=attn_mask)\n",
        "\n",
        "                # Vocabulary logit 0 at the last position serves as the binary logit\n",
        "                class_logit = outputs.logits[:, -1, 0].unsqueeze(-1)\n",
        "\n",
        "                # BCEWithLogitsLoss applies the sigmoid itself, so it must receive the raw\n",
        "                # logit. Feeding it sigmoid(logit) squashes twice and pins the implied\n",
        "                # probability between 0.5 and about 0.73.\n",
        "                loss = criterion(class_logit, targets)\n",
        "\n",
        "            # Probabilities are only needed for the accuracy bookkeeping below\n",
        "            predicted = torch.sigmoid(class_logit)\n",
        "\n",
        "            # Divide by the window length so the summed micro-batch gradients form an average\n",
        "            # (a shorter final window is simply weighted a little lower).\n",
        "            scaler.scale(loss / ACCUMULATION_STEPS).backward()\n",
        "\n",
        "            # Step at the end of each window and on the last batch of the epoch\n",
        "            if (step + 1) % ACCUMULATION_STEPS == 0 or (step + 1) == steps_per_epoch:\n",
        "                # Clip the true gradients: they must be unscaled first, otherwise the norm\n",
        "                # being clipped is multiplied by the current loss scale.\n",
        "                scaler.unscale_(optimizer)\n",
        "                torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)\n",
        "                scaler.step(optimizer)\n",
        "                scaler.update()\n",
        "                optimizer.zero_grad()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted.round() == targets).sum().item()\n",
        "\n",
        "            torch.cuda.empty_cache()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device).long()  # Ensure input_ids are LongTensor\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                targets = batch['targets'].to(device).float().view(-1, 1)\n",
        "\n",
        "                outputs = model(input_ids=input_ids, attention_mask=attn_mask)\n",
        "\n",
        "                # Same read-out as in training: raw logit for the loss, sigmoid for the class\n",
        "                class_logit = outputs.logits[:, -1, 0].unsqueeze(-1)\n",
        "                predicted = torch.sigmoid(class_logit)\n",
        "\n",
        "                loss = criterion(class_logit, targets)\n",
        "                valid_loss += loss.item()\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted.round() == targets).sum().item()\n",
        "\n",
        "                torch.cuda.empty_cache()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the best model state for the current fold\n",
        "            torch.save(model.state_dict(), fold_checkpoint)\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= PATIENCE:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "\n",
        "if best_fold == -1:\n",
        "    raise RuntimeError('No fold produced a finite validation loss; there is no model to save.')\n",
        "\n",
        "# Save the best model across all folds. The winning fold's checkpoint is re-read from disk\n",
        "# because it holds that fold's best-epoch weights, whereas a live model's state_dict() holds\n",
        "# whatever the last epoch left behind.\n",
        "best_model_state = torch.load(f'{CHECKPOINT_DIR}/GPT2_Fold_{best_fold}.pth', map_location='cpu')\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/GPT2_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0d811517-2efd-48a0-afc0-0771e82f0623",
      "metadata": {
        "tags": [],
        "id": "0d811517-2efd-48a0-afc0-0771e82f0623"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Load the best model saved during cross-validation.\n",
        "# map_location lets a checkpoint written on a GPU be restored on whatever `device` is in use.\n",
        "best_model = AutoModelForCausalLM.from_pretrained('gpt2')\n",
        "best_model.load_state_dict(torch.load('Data/ThirdIterationModels/GPT2_Best_Fold.pth', map_location=device))\n",
        "best_model.to(device)\n",
        "best_model.eval()\n",
        "\n",
        "# Held-out test split; plain lists so samples are read by position, not by DataFrame index label\n",
        "test_dataset = CustomDataset(test_data['Tokenized Text'].tolist(), test_data['Label'].tolist(), tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# Function to evaluate the model on the test set and return predictions and true labels\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    \"\"\"Run the causal-LM `model` over `test_loader` as a binary classifier.\n",
        "\n",
        "    Returns:\n",
        "        ``(predictions, labels)`` as NumPy arrays; predictions are the rounded sigmoid of\n",
        "        vocabulary logit 0 at the last sequence position.\n",
        "    \"\"\"\n",
        "    model.eval()\n",
        "    prob_rows, label_rows = [], []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            token_ids = batch['input_ids'].to(device).long()\n",
        "            mask = batch['attention_mask'].to(device)\n",
        "            gold = batch['targets'].to(device).view(-1, 1)  # one row per sample\n",
        "\n",
        "            lm_out = model(input_ids=token_ids, attention_mask=mask)\n",
        "            last_step = lm_out.logits[:, -1, :]  # logits of the last position\n",
        "            probs = torch.sigmoid(last_step[:, 0]).unsqueeze(-1)\n",
        "\n",
        "            prob_rows.extend(probs.cpu().numpy())\n",
        "            label_rows.extend(gold.cpu().numpy())\n",
        "\n",
        "    # Threshold the probabilities to 0 / 1\n",
        "    hard_preds = np.round(np.array(prob_rows))\n",
        "    gold_labels = np.array(label_rows)\n",
        "\n",
        "    return hard_preds, gold_labels\n",
        "\n",
        "# Evaluate the model on the test dataset\n",
        "y_pred, y_true = evaluate_on_test(best_model, test_loader)\n",
        "\n",
        "# Generate classification report\n",
        "print(classification_report(y_true, y_pred, target_names=['class 0', 'class 1']))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cf0f7eb2-83e3-4ee8-b570-bf1ac3f59335",
      "metadata": {
        "tags": [],
        "id": "cf0f7eb2-83e3-4ee8-b570-bf1ac3f59335"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "63fe3947-196e-492e-a28f-e7b646ff5040",
      "metadata": {
        "jp-MarkdownHeadingCollapsed": true,
        "tags": [],
        "id": "63fe3947-196e-492e-a28f-e7b646ff5040"
      },
      "source": [
        "## **LORA Finetune DistillGPT f1=33**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "34b39957-264c-4db4-8fc4-069fd26b650a",
      "metadata": {
        "tags": [],
        "id": "34b39957-264c-4db4-8fc4-069fd26b650a"
      },
      "outputs": [],
      "source": [
        "# Custom dataset class for binary classification\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    def __init__(self, df, tokenizer, max_len):\n",
        "        self.tokenizer = tokenizer\n",
        "        self.df = df\n",
        "        self.texts = df['Tokenized Text']\n",
        "        self.labels = df['Label']\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        text = str(self.texts[index])\n",
        "        text = \" \".join(text.split())\n",
        "\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels[index]])  # Use FloatTensor for binary classification\n",
        "        }\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "656c74c5-e244-423a-af8e-67dbab7dcbd7",
      "metadata": {
        "tags": [],
        "id": "656c74c5-e244-423a-af8e-67dbab7dcbd7"
      },
      "outputs": [],
      "source": [
        "# Model and tokenizer setup (changing to LLaMA with LoRA)\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 8\n",
        "VALID_BATCH_SIZE = 8\n",
        "EPOCHS = 40\n",
        "LEARNING_RATE = 1e-08\n",
        "WEIGHT_DECAY = 0.01  # Regularization\n",
        "DROPOUT_RATE = 0.3  # Regularization"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "df4ae3fa-8206-46da-a66b-f1a6de65d625",
      "metadata": {
        "tags": [],
        "id": "df4ae3fa-8206-46da-a66b-f1a6de65d625"
      },
      "outputs": [],
      "source": [
        "# Environment setting for debugging CUDA errors\n",
        "import os\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
        "from torch.utils.data import DataLoader\n",
        "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
        "import numpy as np\n",
        "\n",
        "os.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"\n",
        "\n",
        "# Set up device\n",
        "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
        "\n",
        "# Initial split: 85% for training/validation, 15% for test\n",
        "train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=46, stratify=data['Label'])\n",
        "\n",
        "# Load GPT-2 tokenizer and model\n",
        "tokenizer = AutoTokenizer.from_pretrained('distilgpt2', padding_side='left')\n",
        "\n",
        "# GPT-2 does not have a pad token by default, so we set it\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "\n",
        "model = AutoModelForCausalLM.from_pretrained('distilgpt2').to(device)\n",
        "\n",
        "# CustomDataset class definition\n",
        "class CustomDataset(torch.utils.data.Dataset):\n",
        "    def __init__(self, texts, labels, tokenizer, max_len):\n",
        "        self.texts = texts\n",
        "        self.labels = labels\n",
        "        self.tokenizer = tokenizer\n",
        "        self.max_len = max_len\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.texts)\n",
        "\n",
        "    def __getitem__(self, index):\n",
        "        text = str(self.texts[index])\n",
        "        text = \" \".join(text.split())\n",
        "\n",
        "        # Encode the text using the tokenizer\n",
        "        inputs = self.tokenizer.encode_plus(\n",
        "            text,\n",
        "            None,\n",
        "            add_special_tokens=True,\n",
        "            max_length=self.max_len,\n",
        "            padding='max_length',  # Pad to the max length\n",
        "            truncation=True,\n",
        "            return_attention_mask=True,\n",
        "            return_tensors='pt'\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'input_ids': inputs['input_ids'].flatten(),\n",
        "            'attention_mask': inputs['attention_mask'].flatten(),\n",
        "            'targets': torch.FloatTensor([self.labels[index]])  # Use FloatTensor for binary classification\n",
        "        }\n",
        "\n",
        "# Function to evaluate the model and return metrics for the fold\n",
        "def evaluate_fold(model, data_loader):\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in data_loader:\n",
        "            input_ids = batch['input_ids'].to(device).long()  # Ensure input_ids are LongTensor\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            targets = batch['targets'].to(device).view(-1, 1)  # Ensure correct shape and dtype\n",
        "\n",
        "            # Forward pass using the model's forward method to obtain the logits\n",
        "            outputs = model(input_ids=input_ids, attention_mask=attn_mask)\n",
        "\n",
        "            # Extract the logits from the last token for binary classification\n",
        "            logits = outputs.logits[:, -1, :]  # Using the last token's logits\n",
        "            predicted = torch.sigmoid(logits[:, 0]).unsqueeze(-1)\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    all_preds = np.round(np.array(all_preds))  # Round predictions to 0 or 1\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    # Calculate metrics for the fold\n",
        "    accuracy = accuracy_score(all_labels, all_preds)\n",
        "    precision = precision_score(all_labels, all_preds)\n",
        "    recall = recall_score(all_labels, all_preds)\n",
        "    f1 = f1_score(all_labels, all_preds)\n",
        "\n",
        "    return accuracy, precision, recall, f1, all_labels, all_preds\n",
        "\n",
        "# Setup for 5-Fold Cross-Validation\n",
        "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=46)\n",
        "\n",
        "# Track the best model and fold, and accumulate metrics\n",
        "best_model_state = None\n",
        "best_fold = -1\n",
        "best_loss = float('inf')\n",
        "\n",
        "# Hyperparameters\n",
        "MAX_LEN = 512\n",
        "TRAIN_BATCH_SIZE = 2  # Reduced batch size\n",
        "VALID_BATCH_SIZE = 2  # Reduced batch size\n",
        "EPOCHS = 10  # Start with fewer epochs\n",
        "LEARNING_RATE = 1e-4\n",
        "WEIGHT_DECAY = 1e-5\n",
        "ACCUMULATION_STEPS = 8  # Gradient accumulation\n",
        "CLIP_VALUE = 0.05  # Gradient clipping\n",
        "PATIENCE = 5  # Early stopping patience\n",
        "\n",
        "# Use GradScaler for mixed precision training\n",
        "scaler = torch.amp.GradScaler()\n",
        "\n",
        "# Function to reinitialize weights of classifier\n",
        "def reinitialize_classifier_weights(layer):\n",
        "    if isinstance(layer, nn.Linear):\n",
        "        nn.init.xavier_uniform_(layer.weight)\n",
        "        nn.init.zeros_(layer.bias)\n",
        "\n",
        "# Start cross-validation\n",
        "for fold, (train_index, val_index) in enumerate(kf.split(train_val_data, train_val_data['Label'])):\n",
        "    print(f\"\\nFold {fold + 1}\\n\" + \"-\" * 10)\n",
        "\n",
        "    # Split the 85% training/validation data into training and validation for this fold\n",
        "    train_data = train_val_data.iloc[train_index].reset_index(drop=True)\n",
        "    val_data = train_val_data.iloc[val_index].reset_index(drop=True)\n",
        "\n",
        "    # Create datasets and loaders for this fold\n",
        "    train_dataset = CustomDataset(train_data['Tokenized Text'], train_data['Label'], tokenizer, MAX_LEN)\n",
        "    val_dataset = CustomDataset(val_data['Tokenized Text'], val_data['Label'], tokenizer, MAX_LEN)\n",
        "\n",
        "    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)\n",
        "    val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "    # Initialize GPT-2 model for classification\n",
        "    model = AutoModelForCausalLM.from_pretrained('distilgpt2')\n",
        "    model.to(device)\n",
        "\n",
        "    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)\n",
        "    criterion = nn.BCEWithLogitsLoss()\n",
        "\n",
        "    best_valid_loss = float('inf')\n",
        "    patience_counter = 0\n",
        "\n",
        "    for epoch in range(EPOCHS):\n",
        "        model.train()\n",
        "        train_loss = 0.0\n",
        "        train_correct = 0\n",
        "        train_total = 0\n",
        "\n",
        "        for step, batch in enumerate(train_loader):\n",
        "            input_ids = batch['input_ids'].to(device).long()  # Ensure input_ids are LongTensor\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            targets = batch['targets'].to(device).float().view(-1, 1)  # Keep targets as float for BCEWithLogitsLoss\n",
        "\n",
        "            optimizer.zero_grad()\n",
        "\n",
        "            # Mixed precision training with autocast\n",
        "            with torch.amp.autocast(device_type='cuda'):\n",
        "                # Forward pass using the model's forward method to obtain the logits\n",
        "                outputs = model(input_ids=input_ids, attention_mask=attn_mask)\n",
        "\n",
        "                # Extract logits from the last token for binary classification\n",
        "                logits = outputs.logits[:, -1, :]  # Using the last token's logits\n",
        "                predicted = torch.sigmoid(logits[:, 0]).unsqueeze(-1)\n",
        "\n",
        "                # Compute the loss\n",
        "                loss = criterion(predicted, targets)\n",
        "\n",
        "            # Scale loss and backward pass\n",
        "            scaler.scale(loss).backward()\n",
        "\n",
        "            # Gradient Clipping\n",
        "            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)\n",
        "\n",
        "            # Gradient accumulation\n",
        "            if (step + 1) % ACCUMULATION_STEPS == 0:\n",
        "                scaler.step(optimizer)\n",
        "                scaler.update()\n",
        "                optimizer.zero_grad()\n",
        "\n",
        "            train_loss += loss.item()\n",
        "            train_total += targets.size(0)\n",
        "            train_correct += (predicted.round() == targets).sum().item()\n",
        "\n",
        "            torch.cuda.empty_cache()\n",
        "\n",
        "        train_accuracy = train_correct / train_total\n",
        "        train_loss /= len(train_loader)\n",
        "\n",
        "        # Validation loop\n",
        "        model.eval()\n",
        "        valid_loss = 0.0\n",
        "        valid_correct = 0\n",
        "        valid_total = 0\n",
        "\n",
        "        with torch.no_grad():\n",
        "            for batch in val_loader:\n",
        "                input_ids = batch['input_ids'].to(device).long()  # Ensure input_ids are LongTensor\n",
        "                attn_mask = batch['attention_mask'].to(device)\n",
        "                targets = batch['targets'].to(device).float().view(-1, 1)\n",
        "\n",
        "                # Forward pass using the model's forward method to obtain the logits\n",
        "                outputs = model(input_ids=input_ids, attention_mask=attn_mask)\n",
        "\n",
        "                # Extract logits from the last token for binary classification\n",
        "                logits = outputs.logits[:, -1, :]  # Using the last token's logits\n",
        "                predicted = torch.sigmoid(logits[:, 0]).unsqueeze(-1)\n",
        "\n",
        "                loss = criterion(predicted, targets)\n",
        "                valid_loss += loss.item()\n",
        "                valid_total += targets.size(0)\n",
        "                valid_correct += (predicted.round() == targets).sum().item()\n",
        "\n",
        "                torch.cuda.empty_cache()\n",
        "\n",
        "        valid_accuracy = valid_correct / valid_total\n",
        "        valid_loss /= len(val_loader)\n",
        "\n",
        "        print(f'Epoch {epoch + 1}/{EPOCHS}, Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')\n",
        "\n",
        "        # Early stopping logic\n",
        "        if valid_loss < best_valid_loss:\n",
        "            best_valid_loss = valid_loss\n",
        "            patience_counter = 0\n",
        "            # Save the best model state for the current fold\n",
        "            torch.save(model.state_dict(), f'Data/ThirdIterationModels/Distill_GPTFold_{fold + 1}.pth')\n",
        "        else:\n",
        "            patience_counter += 1\n",
        "\n",
        "        if patience_counter >= PATIENCE:\n",
        "            print(\"Early stopping triggered\")\n",
        "            break\n",
        "\n",
        "    # Check if this is the best model across all folds\n",
        "    if best_valid_loss < best_loss:\n",
        "        best_loss = best_valid_loss\n",
        "        best_fold = fold + 1\n",
        "        best_model_state = model.state_dict()  # Save the state of the best model\n",
        "\n",
        "# Save the best model across all folds\n",
        "torch.save(best_model_state, 'Data/ThirdIterationModels/Distill_GPT2_Best_Fold.pth')\n",
        "print(f\"\\nBest model saved from fold {best_fold}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "37e7c3ed-5fbe-4e5a-bf61-da56e0fa930b",
      "metadata": {
        "tags": [],
        "id": "37e7c3ed-5fbe-4e5a-bf61-da56e0fa930b"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import classification_report\n",
        "\n",
        "# Load the best model saved during cross-validation\n",
        "best_model = AutoModelForCausalLM.from_pretrained('distilgpt2')\n",
        "best_model.load_state_dict(torch.load('Data/ThirdIterationModels/Distill_GPT2_Best_Fold.pth'))\n",
        "best_model.to(device)\n",
        "best_model.eval()\n",
        "\n",
        "\n",
        "test_dataset = CustomDataset(test_data['Tokenized Text'].tolist(), test_data['Label'].tolist(), tokenizer, MAX_LEN)\n",
        "test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)\n",
        "\n",
        "# Function to evaluate the model on the test set and return predictions and true labels\n",
        "def evaluate_on_test(model, test_loader):\n",
        "    model.eval()\n",
        "    all_preds = []\n",
        "    all_labels = []\n",
        "\n",
        "    with torch.no_grad():\n",
        "        for batch in test_loader:\n",
        "            input_ids = batch['input_ids'].to(device).long()\n",
        "            attn_mask = batch['attention_mask'].to(device)\n",
        "            targets = batch['targets'].to(device).view(-1, 1)\n",
        "\n",
        "            # Forward pass to obtain logits\n",
        "            outputs = model(input_ids=input_ids, attention_mask=attn_mask)\n",
        "            logits = outputs.logits[:, -1, :]  # Get the logits from the last token\n",
        "            predicted = torch.sigmoid(logits[:, 0]).unsqueeze(-1)\n",
        "\n",
        "            all_preds.extend(predicted.cpu().numpy())\n",
        "            all_labels.extend(targets.cpu().numpy())\n",
        "\n",
        "    # Convert predictions to binary format (0 or 1)\n",
        "    all_preds = np.round(np.array(all_preds))\n",
        "    all_labels = np.array(all_labels)\n",
        "\n",
        "    return all_preds, all_labels\n",
        "\n",
        "# Evaluate the model on the test dataset\n",
        "y_pred, y_true = evaluate_on_test(best_model, test_loader)\n",
        "\n",
        "# Generate classification report\n",
        "print(classification_report(y_true, y_pred, target_names=['class 0', 'class 1']))\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "52b8b97c-abc5-404f-bf00-70cfd6ac2308",
      "metadata": {
        "tags": [],
        "id": "52b8b97c-abc5-404f-bf00-70cfd6ac2308"
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "torch.cuda.empty_cache()\n",
        "torch.cuda.ipc_collect()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "a38af93e-f37f-4974-ad0b-225a8b4a820d",
      "metadata": {
        "id": "a38af93e-f37f-4974-ad0b-225a8b4a820d"
      },
      "source": [
        "## **END**"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "myenv",
      "language": "python",
      "name": "myenv"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.14"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}