{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "33e68399",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import sent_tokenize\n",
    "import os\n",
    "import pandas as pd\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import sys\n",
    "sys.path.append('..')\n",
    "from src.data.preprocess_data import load_data\n",
    "\n",
    "\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]='7'\n",
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"False\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0454addb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "\n",
    "def clean_content_from_source(source_text):\n",
    "    \"\"\"Merge the sentence fragments of one document into cleaned sentences.\n",
    "\n",
    "    The cleaning runs in three passes:\n",
    "\n",
    "    1. Consecutive fragments are joined with a space until one of them ends\n",
    "       with a period; text left over after the last period is kept as one entry.\n",
    "    2. Every entry is split on newlines. A line that starts with a digit, does\n",
    "       not end with a period, or has fewer than 5 word tokens is glued to the\n",
    "       text collected so far; any other line closes a sentence.\n",
    "    3. Sentences with fewer than 5 word tokens are prepended to the next\n",
    "       sentence that has at least 5 tokens.\n",
    "\n",
    "    Args:\n",
    "        source_text: iterable of strings (the callers pass the output of\n",
    "            ``sent_tokenize``). Empty strings are ignored.\n",
    "\n",
    "    Returns:\n",
    "        list of str: the cleaned sentences, stripped of surrounding whitespace.\n",
    "        Short sentences that come after the last sentence of at least 5 tokens\n",
    "        are accumulated but never emitted, so they are not part of the result.\n",
    "    \"\"\"\n",
    "    sentences_in_doc = []\n",
    "    temp_to_join = \"\"\n",
    "\n",
    "    ### pass 1: accumulate fragments until one ends in a punctuation mark\n",
    "    for pre_sentence in source_text:\n",
    "        if not pre_sentence:                            ### empty fragment: nothing to add, and indexing it would fail\n",
    "            continue\n",
    "        temp_to_join += \" \"+pre_sentence\n",
    "        if pre_sentence[-1] == \".\":                     ### sentence ends in punctuation mark -> flush\n",
    "            sentences_in_doc.append(temp_to_join.strip())\n",
    "            temp_to_join = \"\"\n",
    "\n",
    "    if temp_to_join != \"\":                              ### text after the last punctuation mark\n",
    "        sentences_in_doc.append(temp_to_join.strip())\n",
    "\n",
    "    ### pass 2: re-split every entry on newlines\n",
    "    final_sentences = []\n",
    "    for pre_sentence in sentences_in_doc:\n",
    "        pre_sentence_list = pre_sentence.split(\"\\n\")\n",
    "        temp_to_join = pre_sentence_list[0].strip()     ### first sub-sentence - base case\n",
    "        for sent in pre_sentence_list[1:]:\n",
    "            if not sent:                                ### blank line\n",
    "                continue\n",
    "            if sent[0].isdigit() or sent[-1] != '.' or len(word_tokenize(sent)) < 5:\n",
    "                temp_to_join += \" \"+sent.strip()        ### not a sentence on its own: keep accumulating\n",
    "                continue\n",
    "\n",
    "            ### `sent` is a complete sentence\n",
    "            if temp_to_join == \"\" or temp_to_join[-1] == '.':\n",
    "                ### The buffer is empty or already complete: emit it (if any) and emit `sent` on its own.\n",
    "                ### Indexing the empty buffer used to raise an IndexError that a bare `except` swallowed,\n",
    "                ### which silently dropped `sent`.\n",
    "                if temp_to_join != \"\":\n",
    "                    final_sentences.append(temp_to_join.strip())\n",
    "                final_sentences.append(sent.strip())\n",
    "            else:\n",
    "                final_sentences.append(temp_to_join+\" \"+sent.strip())   ### merging\n",
    "            temp_to_join = \"\"\n",
    "\n",
    "        if temp_to_join != \"\":\n",
    "            final_sentences.append(temp_to_join.strip())\n",
    "\n",
    "    ### pass 3: attach too-short sentences to the next long one\n",
    "    cumulated = \"\"\n",
    "    final_sentences_cleaned = []\n",
    "    for sent in final_sentences:\n",
    "        if len(word_tokenize(sent)) < 5: ### if sentence is too short, accumulate\n",
    "            cumulated += \" \"+sent\n",
    "        else:\n",
    "            final_sentences_cleaned.append((cumulated+\" \"+sent).strip())\n",
    "            cumulated = \"\"\n",
    "\n",
    "    return final_sentences_cleaned"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "699233a7",
   "metadata": {},
   "source": [
    "#### Data preparation -- ArXiv for Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9251c6b8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Generating train split: 100%|██████████| 28388/28388 [00:09<00:00, 2885.56 examples/s]\n",
      "Generating validation split: 100%|██████████| 2500/2500 [00:00<00:00, 2643.41 examples/s]\n",
      "Generating test split: 100%|██████████| 2500/2500 [00:00<00:00, 2867.52 examples/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 28388\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 2500\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 2500\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Dataset card: https://huggingface.co/datasets/ccdv/arxiv-classification/\n",
    "dataset = load_dataset(\n",
    "    'ccdv/arxiv-classification'\n",
    ")\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "282ffdc3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Constrained Submodular Maximization via a\\nNon...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Self Organizing Maps Whose Topologies Can Be L...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Robust Satisfaction of Temporal Logic Specific...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>BATCHED QR AND SVD ALGORITHMS ON GPUS WITH APP...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Analytical and simplified models for dynamic a...</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  label\n",
       "0  Constrained Submodular Maximization via a\\nNon...      8\n",
       "1  Self Organizing Maps Whose Topologies Can Be L...      9\n",
       "2  Robust Satisfaction of Temporal Logic Specific...      3\n",
       "3  BATCHED QR AND SVD ALGORITHMS ON GPUS WITH APP...      8\n",
       "4  Analytical and simplified models for dynamic a...      5"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `partition` is also read by the cleaning cell that follows.\n",
    "partition = 'train'\n",
    "\n",
    "# Preview of the raw train split.\n",
    "df_train = pd.DataFrame(dataset[partition])\n",
    "df_train.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "bafe93a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "#Empty docs: 0 \t#Short documents: 0\n",
      "There are 1067 duplicated documents in the partition.\n",
      "Removing duplicates...\n"
     ]
    }
   ],
   "source": [
    "# Clean every document of `dataset[partition]`, drop duplicated texts and save the result as CSV.\n",
    "empty_docs = []\n",
    "short_doc = []\n",
    "records = []   # one dict per kept document\n",
    "\n",
    "ide = 0        # id of the next kept document; skipped documents do not use one up\n",
    "for sample in dataset[partition]:\n",
    "    # double newlines are turned into \". \", remaining newlines into spaces\n",
    "    source_text = sample['text'].replace(\"\\n\\n\", \". \").replace(\"\\n\", \" \")\n",
    "    label = sample['label']\n",
    "    sentences = sent_tokenize(source_text)   # tokenize once, reuse for the checks and the cleaning\n",
    "    if len(sentences) == 0:\n",
    "        empty_docs.append(ide)\n",
    "        continue\n",
    "    if len(sentences) == 1:\n",
    "        short_doc.append(ide)\n",
    "        continue\n",
    "    sent_in_doc = clean_content_from_source(sentences)\n",
    "    sent_in_doc_as_text = ' [St]'.join(sent_in_doc)\n",
    "    records.append({\"article_id\": ide, \"article_text\": sent_in_doc_as_text, \"article_label\": label})\n",
    "    ide += 1\n",
    "\n",
    "# build the frame in one step instead of enlarging it row by row inside the loop\n",
    "df_ = pd.DataFrame(records, columns=[\"article_id\", \"article_text\", \"article_label\"])\n",
    "\n",
    "print (\"#Empty docs:\", len(empty_docs), \"\\t#Short documents:\", len(short_doc))\n",
    "in_dataframe_duplicated = df_[df_.duplicated(subset=[\"article_text\"])]\n",
    "print (\"There are\", in_dataframe_duplicated.shape[0], \"duplicated documents in the partition.\")\n",
    "print (\"Removing duplicates...\")\n",
    "in_dataframe = df_.drop_duplicates(subset=[\"article_text\"])\n",
    "\n",
    "# output folder; the validation and test cells reuse this variable\n",
    "path_dataset= \"path/to/arXiv-Classification/processed/folder/\"\n",
    "in_dataframe.to_csv(path_dataset+\"df_\"+partition+\".csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "8ebcc4a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "#Empty docs: 0 \t#Short documents: 0\n",
      "There are 7 duplicated documents in the partition.\n",
      "Removing duplicates...\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(2493, 3)"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Clean every document of the validation split, drop duplicated texts and save the result as CSV.\n",
    "partition='validation'\n",
    "df_val = pd.DataFrame(dataset[partition])\n",
    "empty_docs = []\n",
    "short_doc = []\n",
    "records = []   # one dict per kept document\n",
    "\n",
    "ide = 0        # id of the next kept document; skipped documents do not use one up\n",
    "for sample in dataset[partition]:\n",
    "    # double newlines are turned into \". \", remaining newlines into spaces\n",
    "    source_text = sample['text'].replace(\"\\n\\n\", \". \").replace(\"\\n\", \" \")\n",
    "    label = sample['label']\n",
    "    sentences = sent_tokenize(source_text)   # tokenize once, reuse for the checks and the cleaning\n",
    "    if len(sentences) == 0:\n",
    "        empty_docs.append(ide)\n",
    "        continue\n",
    "    if len(sentences) == 1:\n",
    "        short_doc.append(ide)\n",
    "        continue\n",
    "    sent_in_doc = clean_content_from_source(sentences)\n",
    "    sent_in_doc_as_text = ' [St]'.join(sent_in_doc)\n",
    "    records.append({\"article_id\": ide, \"article_text\": sent_in_doc_as_text, \"article_label\": label})\n",
    "    ide += 1\n",
    "\n",
    "# build the frame in one step instead of enlarging it row by row inside the loop\n",
    "df_ = pd.DataFrame(records, columns=[\"article_id\", \"article_text\", \"article_label\"])\n",
    "\n",
    "print (\"#Empty docs:\", len(empty_docs), \"\\t#Short documents:\", len(short_doc))\n",
    "in_dataframe_duplicated = df_[df_.duplicated(subset=[\"article_text\"])]\n",
    "print (\"There are\", in_dataframe_duplicated.shape[0], \"duplicated documents in the partition.\")\n",
    "print (\"Removing duplicates...\")\n",
    "in_dataframe_val = df_.drop_duplicates(subset=[\"article_text\"])\n",
    "# `path_dataset` is the output folder assigned in the train-partition cell\n",
    "in_dataframe_val.to_csv(path_dataset+\"df_\"+partition+\".csv\", index=False)\n",
    "in_dataframe_val.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "b485fa50",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "#Empty docs: 0 \t#Short documents: 0\n",
      "There are 5 duplicated documents in the partition.\n",
      "Removing duplicates...\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(2495, 3)"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Clean every document of the test split, drop duplicated texts and save the result as CSV.\n",
    "partition='test'\n",
    "# The raw test split used to be stored in `df_val` (copy-paste slip), clobbering the validation frame.\n",
    "df_test = pd.DataFrame(dataset[partition])\n",
    "empty_docs = []\n",
    "short_doc = []\n",
    "records = []   # one dict per kept document\n",
    "\n",
    "ide = 0        # id of the next kept document; skipped documents do not use one up\n",
    "for sample in dataset[partition]:\n",
    "    # double newlines are turned into \". \", remaining newlines into spaces\n",
    "    source_text = sample['text'].replace(\"\\n\\n\", \". \").replace(\"\\n\", \" \")\n",
    "    label = sample['label']\n",
    "    sentences = sent_tokenize(source_text)   # tokenize once, reuse for the checks and the cleaning\n",
    "    if len(sentences) == 0:\n",
    "        empty_docs.append(ide)\n",
    "        continue\n",
    "    if len(sentences) == 1:\n",
    "        short_doc.append(ide)\n",
    "        continue\n",
    "    sent_in_doc = clean_content_from_source(sentences)\n",
    "    sent_in_doc_as_text = ' [St]'.join(sent_in_doc)\n",
    "    records.append({\"article_id\": ide, \"article_text\": sent_in_doc_as_text, \"article_label\": label})\n",
    "    ide += 1\n",
    "\n",
    "# build the frame in one step instead of enlarging it row by row inside the loop\n",
    "df_ = pd.DataFrame(records, columns=[\"article_id\", \"article_text\", \"article_label\"])\n",
    "\n",
    "print (\"#Empty docs:\", len(empty_docs), \"\\t#Short documents:\", len(short_doc))\n",
    "\n",
    "in_dataframe_duplicated = df_[df_.duplicated(subset=[\"article_text\"])]\n",
    "print (\"There are\", in_dataframe_duplicated.shape[0], \"duplicated documents in the partition.\")\n",
    "print (\"Removing duplicates...\")\n",
    "in_dataframe_test = df_.drop_duplicates(subset=[\"article_text\"])\n",
    "# `path_dataset` is the output folder assigned in the train-partition cell\n",
    "in_dataframe_test.to_csv(path_dataset+\"df_\"+partition+\".csv\", index=False)\n",
    "in_dataframe_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e09c73f4",
   "metadata": {},
   "source": [
    "##### Load Data -- ArXiv for Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "44c3ac5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading from Processed folder\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "df_train (27321, 3)\n",
      "df_val (2493, 3)\n",
      "df_test (2495, 3)\n"
     ]
    }
   ],
   "source": [
    "in_path = \"/path/to/arXiv-Classification/folder/\"\n",
    "\n",
    "# The four file-name arguments are left empty for this dataset.\n",
    "data_train = labels_train = data_test = labels_test = \"\"\n",
    "\n",
    "df_train, df_val, df_test = load_data(\n",
    "    in_path, data_train, labels_train, data_test, labels_test, with_val=True\n",
    ")\n",
    "\n",
    "# Report the size of every split.\n",
    "for split_name, split_frame in ((\"df_train\", df_train), (\"df_val\", df_val), (\"df_test\", df_test)):\n",
    "    print (split_name, split_frame.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b198b53",
   "metadata": {},
   "source": [
    "#### Data preparation -- BBC News"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee91e6cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "df = pd.read_csv(\"/path/to/BBC-News/data/folder/BBC_data.csv\")\n",
    "print (\"Input dataframe has shape: \", df.shape)\n",
    "\n",
    "# Both splits are shuffled with the same fixed seed.\n",
    "split_options = dict(random_state=42, shuffle=True)\n",
    "\n",
    "# Hold out 20% of all rows as the test set ...\n",
    "df_train, df_test = train_test_split(df, test_size=0.2, **split_options)\n",
    "# ... then hold out 10% of the remaining rows as the validation set.\n",
    "df_train, df_val = train_test_split(df_train, test_size=0.1, **split_options)\n",
    "\n",
    "print (f\"Train size: {len(df_train)}\")\n",
    "print (f\"Val size: {len(df_val)}\")\n",
    "print (f\"Test size: {len(df_test)}\")\n",
    "\n",
    "# Save each split to its own temporary CSV file.\n",
    "for split_frame, split_csv in (\n",
    "    (df_train, \"/path/to/BBC-News/auxiliar/folder/df_train_aux.csv\"),\n",
    "    (df_val, \"/path/to/BBC-News/auxiliar/folder/df_val_aux.csv\"),\n",
    "    (df_test, \"/path/to/BBC-News/auxiliar/folder/df_test_aux.csv\"),\n",
    "):\n",
    "    split_frame.to_csv(split_csv, index=False)\n",
    "print(\"Data successfully split and saved!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "879a39ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_cleaned_dataframes(inpuit_dataframe):\n",
    "    \"\"\"Clean the text of every row and return a de-duplicated dataframe.\n",
    "\n",
    "    Each `article_text` is normalised, split with `sent_tokenize`, cleaned with\n",
    "    `clean_content_from_source` and re-joined with the ' [St]' separator.\n",
    "    Rows with no sentence or with exactly one sentence are skipped. The number\n",
    "    of skipped rows, the number of duplicated texts and the final shape are printed.\n",
    "\n",
    "    Args:\n",
    "        inpuit_dataframe: DataFrame with the columns `article_text` and\n",
    "            `article_label`.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns `article_id`, `article_text` and\n",
    "        `article_label`, where rows repeating an earlier `article_text` are\n",
    "        dropped. `article_id` counts the kept rows from 0, before duplicates\n",
    "        are removed.\n",
    "    \"\"\"\n",
    "    empty_docs = []\n",
    "    short_doc = []\n",
    "    records = []   # one dict per kept document\n",
    "\n",
    "    ide = 0        # id of the next kept document; skipped documents do not use one up\n",
    "    for _, sample in inpuit_dataframe.iterrows():\n",
    "        source_text = (\n",
    "            sample['article_text']\n",
    "            .replace(\"\\n\\n\", \". \")      # double newline -> sentence break\n",
    "            .replace(\"..\", \".\")         # collapse doubled periods\n",
    "            .replace('.\" ', '.\". ')     # add a period after a closing quote that follows one\n",
    "            .replace(\"\\n\", \" \")         # remaining newlines -> spaces\n",
    "        )\n",
    "        label = sample['article_label']\n",
    "        sentences = sent_tokenize(source_text)   # tokenize once, reuse for the checks and the cleaning\n",
    "        if len(sentences) == 0:\n",
    "            empty_docs.append(ide)\n",
    "            continue\n",
    "        if len(sentences) == 1:\n",
    "            short_doc.append(ide)\n",
    "            continue\n",
    "        sent_in_doc = clean_content_from_source(sentences)\n",
    "        sent_in_doc_as_text = ' [St]'.join(sent_in_doc)\n",
    "        records.append({\"article_id\": ide, \"article_text\": sent_in_doc_as_text, \"article_label\": label})\n",
    "        ide += 1\n",
    "\n",
    "    # build the frame in one step instead of enlarging it row by row inside the loop\n",
    "    df_ = pd.DataFrame(records, columns=[\"article_id\", \"article_text\", \"article_label\"])\n",
    "\n",
    "    print (\"#Empty docs:\", len(empty_docs), \"\\t#Short documents:\", len(short_doc))\n",
    "    in_dataframe_duplicated = df_[df_.duplicated(subset=[\"article_text\"])]\n",
    "    print (\"There are\", in_dataframe_duplicated.shape[0], \"duplicated documents in the partition.\")\n",
    "    print (\"Removing duplicates...\")\n",
    "    in_dataframe = df_.drop_duplicates(subset=[\"article_text\"])\n",
    "    print (\"Final dataframe shape:\", in_dataframe.shape)\n",
    "\n",
    "    return in_dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9961a899",
   "metadata": {},
   "outputs": [],
   "source": [
    "path_dataset= \"/path/to/BBC-News/data/folder/Processed/\"\n",
    "\n",
    "# Each split is cleaned and written to disk before the next one is processed.\n",
    "train_dataframe = create_cleaned_dataframes(df_train)\n",
    "train_csv = path_dataset+\"df_train.csv\"\n",
    "train_dataframe.to_csv(train_csv, index=False)\n",
    "print (\"Train dataframe saved.\")\n",
    "\n",
    "validation_dataframe = create_cleaned_dataframes(df_val)\n",
    "validation_csv = path_dataset+\"df_val.csv\"\n",
    "validation_dataframe.to_csv(validation_csv, index=False)\n",
    "print (\"Validation dataframe saved.\")\n",
    "\n",
    "test_dataframe = create_cleaned_dataframes(df_test)\n",
    "test_csv = path_dataset+\"df_test.csv\"\n",
    "test_dataframe.to_csv(test_csv, index=False)\n",
    "print (\"Test dataframe saved.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d0f5131",
   "metadata": {},
   "source": [
    "##### Load Data -- BBC News"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3151409e",
   "metadata": {},
   "outputs": [],
   "source": [
    "in_path = \"/path/to/BBC-News/data/folder/\"\n",
    "\n",
    "# The four file-name arguments are left empty for this dataset.\n",
    "data_train = labels_train = data_test = labels_test = \"\"\n",
    "\n",
    "df_train, df_val, df_test = load_data(\n",
    "    in_path, data_train, labels_train, data_test, labels_test, with_val=True\n",
    ")\n",
    "\n",
    "# Report the size of every split.\n",
    "for split_name, split_frame in ((\"df_train\", df_train), (\"df_val\", df_val), (\"df_test\", df_test)):\n",
    "    print (split_name, split_frame.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13cf9015",
   "metadata": {},
   "source": [
    "#### Data Preparation --- HND"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e870dc34",
   "metadata": {},
   "outputs": [],
   "source": [
    "##### Do the same for HND..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba323b29",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### Load Data \n",
    "in_path = \"/path/to/HyperNews/data/folder/\"\n",
    "\n",
    "# All four files were downloaded from https://zenodo.org/records/5776081\n",
    "data_train = \"articles-training-byarticle-20181122.xml\"\n",
    "labels_train = \"ground-truth-training-byarticle-20181122.xml\"\n",
    "data_test = \"articles-test-byarticle-20181207.xml\"\n",
    "labels_test = \"ground-truth-test-byarticle-20181207.xml\"\n",
    "\n",
    "# Called without `with_val`, so only two frames are unpacked here.\n",
    "df_full_train, df_test = load_data(\n",
    "    in_path, data_train, labels_train, data_test, labels_test\n",
    ")\n",
    "\n",
    "for split_name, split_frame in ((\"df_train\", df_full_train), (\"df_test\", df_test)):\n",
    "    print (split_name, split_frame.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
