{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "be444825-b3bd-40af-8348-f166e01ae919",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import requests\n",
    "import py7zr\n",
    "import xml.etree.ElementTree as ET\n",
    "import pandas as pd\n",
    "\n",
    "def process_xml(xml_file):\n",
    "    label = xml_file[:xml_file.find('.')]\n",
    "    tree = ET.parse('xml/' + xml_file)  # Replace with your actual file path\n",
    "    root = tree.getroot()\n",
    "    \n",
    "    # Extract relevant data\n",
    "    data = []\n",
    "    for row in root.findall('row'):\n",
    "        post_id = int(row.get('PostId'))\n",
    "        creation_date = row.get('CreationDate')\n",
    "        text = row.get('Text')\n",
    "        data.append((post_id, creation_date, text))\n",
    "    \n",
    "    # Convert to DataFrame\n",
    "    df = pd.DataFrame(data, columns=['PostId', 'CreationDate', 'Text'])\n",
    "    \n",
    "    # Convert CreationDate to datetime\n",
    "    df['CreationDate'] = pd.to_datetime(df['CreationDate'])\n",
    "    \n",
    "    # Keep only the earliest entry per PostId\n",
    "    df = df.sort_values(by=['PostId', 'CreationDate']).drop_duplicates(subset=['PostId'], keep='first')\n",
    "    df['Label']=label\n",
    "    df['Year'] = df.CreationDate.dt.year\n",
    "    df['length'] = [len(str(text).strip()) for text in df.Text]\n",
    "    \n",
    "    df = df[(df.length>20) & (df.length<1000)]\n",
    "    \n",
    "    for year in [2018,2019,2020,2021,2022,2023]:\n",
    "        if len(df[df.Year==year])>20:\n",
    "            continue\n",
    "        else:\n",
    "            return None\n",
    "    df = df[(df.Year>=2018) & (df.Year<=2023)].reset_index(drop=True)\n",
    "    df = df[['Text','CreationDate','Year','Label']]\n",
    "    return df\n",
    "    \n",
    "def extract_post(output_7z, timeout=120):\n",
    "    \"\"\"Download one site archive from archive.org and keep its PostHistory.xml.\n",
    "\n",
    "    The archive is downloaded into the working directory, PostHistory.xml is\n",
    "    extracted from it and renamed to '<site>.xml', and the archive is deleted\n",
    "    afterwards, also when extraction fails.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    output_7z : str\n",
    "        Archive file name, e.g. '3dprinting.stackexchange.com.7z'. It is used\n",
    "        both as the remote file name and as the local download name.\n",
    "    timeout : float, default 120\n",
    "        Seconds to wait for the server before giving up (passed to requests).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    RuntimeError\n",
    "        If the server does not answer with HTTP 200.\n",
    "    FileNotFoundError\n",
    "        If the archive does not contain PostHistory.xml.\n",
    "    \"\"\"\n",
    "    url = 'https://archive.org/download/stackexchange/' + output_7z\n",
    "    original_xml = 'PostHistory.xml'\n",
    "    suffix = '.stackexchange.com.7z'\n",
    "    # Strip the known suffix; otherwise drop only the extension, so that an\n",
    "    # unexpected name never loses a character (str.find() returns -1 there).\n",
    "    if output_7z.endswith(suffix):\n",
    "        site = output_7z[:-len(suffix)]\n",
    "    else:\n",
    "        site = os.path.splitext(output_7z)[0]\n",
    "    renamed_xml = site + '.xml'\n",
    "\n",
    "    # Download the .7z file; the context manager releases the connection.\n",
    "    print('Downloading the file ' + output_7z)\n",
    "    with requests.get(url, stream=True, timeout=timeout) as response:\n",
    "        if response.status_code != 200:\n",
    "            print('Failed to download the file.')\n",
    "            # Raise instead of exit(1): exit() would end the notebook session.\n",
    "            raise RuntimeError(f'Download of {url} failed with HTTP status {response.status_code}')\n",
    "        try:\n",
    "            with open(output_7z, 'wb') as file:\n",
    "                for chunk in response.iter_content(chunk_size=1024 * 1024):\n",
    "                    file.write(chunk)\n",
    "        except BaseException:\n",
    "            # Do not leave a truncated archive behind (also on interrupt).\n",
    "            if os.path.exists(output_7z):\n",
    "                os.remove(output_7z)\n",
    "            raise\n",
    "    print('Download complete.')\n",
    "\n",
    "    try:\n",
    "        # Extract the specific file\n",
    "        print(f'Extracting {original_xml}...')\n",
    "        with py7zr.SevenZipFile(output_7z, mode='r') as archive:\n",
    "            if original_xml not in archive.getnames():\n",
    "                print(f'{original_xml} not found in the archive.')\n",
    "                raise FileNotFoundError(f'{original_xml} not found in {output_7z}')\n",
    "            archive.extract(targets=[original_xml], path='.')\n",
    "            print('Extraction complete.')\n",
    "\n",
    "        # Rename the extracted file; os.replace also overwrites the result of\n",
    "        # an earlier run on platforms where os.rename would refuse to.\n",
    "        if os.path.exists(original_xml):\n",
    "            print('Renaming to ' + renamed_xml)\n",
    "            os.replace(original_xml, renamed_xml)\n",
    "            print(f'Renamed {original_xml} to {renamed_xml}')\n",
    "    finally:\n",
    "        # Remove the .7z file to save space, whether or not extraction worked.\n",
    "        os.remove(output_7z)\n",
    "        print('Cleanup complete.')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "ec1369df-da49-4dad-9fbd-2c54fb000059",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3dprinting.stackexchange.com.7z</td>\n",
       "      <td>18.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ai.stackexchange.com.7z</td>\n",
       "      <td>35.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>anime.stackexchange.com.7z</td>\n",
       "      <td>36.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>astronomy.stackexchange.com.7z</td>\n",
       "      <td>49.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>avp.stackexchange.com.7z</td>\n",
       "      <td>20.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              name  size\n",
       "0  3dprinting.stackexchange.com.7z  18.9\n",
       "1          ai.stackexchange.com.7z  35.3\n",
       "2       anime.stackexchange.com.7z  36.5\n",
       "3   astronomy.stackexchange.com.7z  49.0\n",
       "4         avp.stackexchange.com.7z  20.9"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#taken from https://archive.org/download/stackexchange 2025-02-05\n",
    "# Select the main-site archives that are small enough to download.\n",
    "MAX_ARCHIVE_MB = 50  # archives of this size or larger are skipped\n",
    "\n",
    "df = pd.read_csv('tmp/stackexchange_dir.tsv', sep='\\t')\n",
    "\n",
    "# Missing cells become empty strings so the string tests never meet NaN.\n",
    "names = df['Name'].fillna('').astype(str).str.strip()\n",
    "sizes = df['Size'].fillna('').astype(str).str.strip()\n",
    "\n",
    "# Keep '<site>.stackexchange.com.7z' but not the '.meta.' companion archives.\n",
    "is_main_site = (\n",
    "    names.str.contains('.stackexchange.com.7z', regex=False)\n",
    "    & ~names.str.contains('.meta.', regex=False)\n",
    ")\n",
    "\n",
    "# Only sizes given in megabytes (e.g. '18.9M') are considered; anything that\n",
    "# does not parse as a number becomes NaN and fails the comparison below.\n",
    "in_megabytes = sizes.str.endswith('M')\n",
    "size_mb = pd.to_numeric(sizes.str[:-1], errors='coerce')\n",
    "keep = is_main_site & in_megabytes & (size_mb < MAX_ARCHIVE_MB)\n",
    "\n",
    "df_clean = pd.DataFrame({\n",
    "    # cut everything after the first '.7z'\n",
    "    'name': names[keep].str.extract(r'^(.*?\\.7z)', expand=False),\n",
    "    # size stays a string without the trailing 'M', as before\n",
    "    'size': sizes[keep].str[:-1],\n",
    "}).reset_index(drop=True)\n",
    "df_clean.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5b6c0777-1da9-48b4-98d4-e4943cb8e0f4",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "********************\n",
      "Downloading the file 3dprinting.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to 3dprinting.xml\n",
      "Renamed PostHistory.xml to 3dprinting.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file ai.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to ai.xml\n",
      "Renamed PostHistory.xml to ai.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file anime.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to anime.xml\n",
      "Renamed PostHistory.xml to anime.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file astronomy.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to astronomy.xml\n",
      "Renamed PostHistory.xml to astronomy.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file avp.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to avp.xml\n",
      "Renamed PostHistory.xml to avp.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file beer.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to beer.xml\n",
      "Renamed PostHistory.xml to beer.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file bioacoustics.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to bioacoustics.xml\n",
      "Renamed PostHistory.xml to bioacoustics.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file bioinformatics.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to bioinformatics.xml\n",
      "Renamed PostHistory.xml to bioinformatics.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file boardgames.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to boardgames.xml\n",
      "Renamed PostHistory.xml to boardgames.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file bricks.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to bricks.xml\n",
      "Renamed PostHistory.xml to bricks.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file buddhism.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to buddhism.xml\n",
      "Renamed PostHistory.xml to buddhism.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file cardano.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to cardano.xml\n",
      "Renamed PostHistory.xml to cardano.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file chess.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to chess.xml\n",
      "Renamed PostHistory.xml to chess.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file chinese.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to chinese.xml\n",
      "Renamed PostHistory.xml to chinese.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file civicrm.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to civicrm.xml\n",
      "Renamed PostHistory.xml to civicrm.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file coffee.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to coffee.xml\n",
      "Renamed PostHistory.xml to coffee.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file cogsci.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to cogsci.xml\n",
      "Renamed PostHistory.xml to cogsci.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file computergraphics.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to computergraphics.xml\n",
      "Renamed PostHistory.xml to computergraphics.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file conlang.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to conlang.xml\n",
      "Renamed PostHistory.xml to conlang.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file craftcms.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to craftcms.xml\n",
      "Renamed PostHistory.xml to craftcms.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file crafts.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to crafts.xml\n",
      "Renamed PostHistory.xml to crafts.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file cseducators.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to cseducators.xml\n",
      "Renamed PostHistory.xml to cseducators.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file cstheory.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to cstheory.xml\n",
      "Renamed PostHistory.xml to cstheory.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file devops.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to devops.xml\n",
      "Renamed PostHistory.xml to devops.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file drones.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to drones.xml\n",
      "Renamed PostHistory.xml to drones.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file earthscience.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to earthscience.xml\n",
      "Renamed PostHistory.xml to earthscience.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file ebooks.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to ebooks.xml\n",
      "Renamed PostHistory.xml to ebooks.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file economics.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to economics.xml\n",
      "Renamed PostHistory.xml to economics.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file elementaryos.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to elementaryos.xml\n",
      "Renamed PostHistory.xml to elementaryos.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file engineering.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to engineering.xml\n",
      "Renamed PostHistory.xml to engineering.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file eosio.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to eosio.xml\n",
      "Renamed PostHistory.xml to eosio.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file esperanto.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to esperanto.xml\n",
      "Renamed PostHistory.xml to esperanto.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file expatriates.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to expatriates.xml\n",
      "Renamed PostHistory.xml to expatriates.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file expressionengine.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to expressionengine.xml\n",
      "Renamed PostHistory.xml to expressionengine.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file fitness.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to fitness.xml\n",
      "Renamed PostHistory.xml to fitness.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file freelancing.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to freelancing.xml\n",
      "Renamed PostHistory.xml to freelancing.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file french.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to french.xml\n",
      "Renamed PostHistory.xml to french.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file gardening.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to gardening.xml\n",
      "Renamed PostHistory.xml to gardening.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file genealogy.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to genealogy.xml\n",
      "Renamed PostHistory.xml to genealogy.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file ham.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to ham.xml\n",
      "Renamed PostHistory.xml to ham.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file hardwarerecs.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to hardwarerecs.xml\n",
      "Renamed PostHistory.xml to hardwarerecs.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file health.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to health.xml\n",
      "Renamed PostHistory.xml to health.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file homebrew.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to homebrew.xml\n",
      "Renamed PostHistory.xml to homebrew.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file hsm.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to hsm.xml\n",
      "Renamed PostHistory.xml to hsm.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file interpersonal.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to interpersonal.xml\n",
      "Renamed PostHistory.xml to interpersonal.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file iot.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to iot.xml\n",
      "Renamed PostHistory.xml to iot.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file iota.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to iota.xml\n",
      "Renamed PostHistory.xml to iota.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file italian.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to italian.xml\n",
      "Renamed PostHistory.xml to italian.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file joomla.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to joomla.xml\n",
      "Renamed PostHistory.xml to joomla.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file korean.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to korean.xml\n",
      "Renamed PostHistory.xml to korean.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file langdev.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to langdev.xml\n",
      "Renamed PostHistory.xml to langdev.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file languagelearning.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to languagelearning.xml\n",
      "Renamed PostHistory.xml to languagelearning.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file latin.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to latin.xml\n",
      "Renamed PostHistory.xml to latin.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file lifehacks.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to lifehacks.xml\n",
      "Renamed PostHistory.xml to lifehacks.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file linguistics.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to linguistics.xml\n",
      "Renamed PostHistory.xml to linguistics.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file literature.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to literature.xml\n",
      "Renamed PostHistory.xml to literature.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file martialarts.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to martialarts.xml\n",
      "Renamed PostHistory.xml to martialarts.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file materials.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to materials.xml\n",
      "Renamed PostHistory.xml to materials.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file matheducators.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to matheducators.xml\n",
      "Renamed PostHistory.xml to matheducators.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file moderators.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to moderators.xml\n",
      "Renamed PostHistory.xml to moderators.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file monero.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to monero.xml\n",
      "Renamed PostHistory.xml to monero.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file musicfans.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to musicfans.xml\n",
      "Renamed PostHistory.xml to musicfans.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file mythology.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to mythology.xml\n",
      "Renamed PostHistory.xml to mythology.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file networkengineering.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to networkengineering.xml\n",
      "Renamed PostHistory.xml to networkengineering.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file opendata.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to opendata.xml\n",
      "Renamed PostHistory.xml to opendata.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file opensource.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to opensource.xml\n",
      "Renamed PostHistory.xml to opensource.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file or.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to or.xml\n",
      "Renamed PostHistory.xml to or.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file outdoors.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to outdoors.xml\n",
      "Renamed PostHistory.xml to outdoors.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file parenting.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to parenting.xml\n",
      "Renamed PostHistory.xml to parenting.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file patents.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to patents.xml\n",
      "Renamed PostHistory.xml to patents.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file pets.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to pets.xml\n",
      "Renamed PostHistory.xml to pets.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file pm.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to pm.xml\n",
      "Renamed PostHistory.xml to pm.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file poker.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to poker.xml\n",
      "Renamed PostHistory.xml to poker.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file portuguese.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to portuguese.xml\n",
      "Renamed PostHistory.xml to portuguese.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file proofassistants.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to proofassistants.xml\n",
      "Renamed PostHistory.xml to proofassistants.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file quantumcomputing.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to quantumcomputing.xml\n",
      "Renamed PostHistory.xml to quantumcomputing.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file retrocomputing.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to retrocomputing.xml\n",
      "Renamed PostHistory.xml to retrocomputing.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file reverseengineering.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to reverseengineering.xml\n",
      "Renamed PostHistory.xml to reverseengineering.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file russian.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to russian.xml\n",
      "Renamed PostHistory.xml to russian.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file scicomp.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to scicomp.xml\n",
      "Renamed PostHistory.xml to scicomp.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file sitecore.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to sitecore.xml\n",
      "Renamed PostHistory.xml to sitecore.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file solana.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to solana.xml\n",
      "Renamed PostHistory.xml to solana.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file sound.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to sound.xml\n",
      "Renamed PostHistory.xml to sound.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file spanish.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to spanish.xml\n",
      "Renamed PostHistory.xml to spanish.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file sports.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to sports.xml\n",
      "Renamed PostHistory.xml to sports.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file sqa.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to sqa.xml\n",
      "Renamed PostHistory.xml to sqa.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file stellar.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to stellar.xml\n",
      "Renamed PostHistory.xml to stellar.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file substrate.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to substrate.xml\n",
      "Renamed PostHistory.xml to substrate.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file sustainability.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to sustainability.xml\n",
      "Renamed PostHistory.xml to sustainability.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file tezos.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to tezos.xml\n",
      "Renamed PostHistory.xml to tezos.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file tor.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to tor.xml\n",
      "Renamed PostHistory.xml to tor.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file tridion.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to tridion.xml\n",
      "Renamed PostHistory.xml to tridion.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file ukrainian.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to ukrainian.xml\n",
      "Renamed PostHistory.xml to ukrainian.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file vegetarianism.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to vegetarianism.xml\n",
      "Renamed PostHistory.xml to vegetarianism.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file vi.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to vi.xml\n",
      "Renamed PostHistory.xml to vi.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file windowsphone.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to windowsphone.xml\n",
      "Renamed PostHistory.xml to windowsphone.xml\n",
      "Cleanup complete.\n",
      "********************\n",
      "Downloading the file woodworking.stackexchange.com.7z\n",
      "Download complete.\n",
      "Extracting PostHistory.xml...\n",
      "Extraction complete.\n",
      "Renaming to woodworking.xml\n",
      "Renamed PostHistory.xml to woodworking.xml\n",
      "Cleanup complete.\n"
     ]
    }
   ],
   "source": [
    "# Print a separator line, then run extract_post for each entry of df_clean.name.\n",
    "for name in df_clean.name:\n",
    "    print('*'*20)\n",
    "    extract_post(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "5ec93bd6-8dde-4ddb-9491-98711cd35c56",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3dprinting.xml\n",
      "ai.xml\n",
      "anime.xml\n",
      "astronomy.xml\n",
      "avp.xml\n",
      "beer.xml\n",
      "bioacoustics.xml\n",
      "bioinformatics.xml\n",
      "boardgames.xml\n",
      "bricks.xml\n",
      "buddhism.xml\n",
      "cardano.xml\n",
      "chess.xml\n",
      "chinese.xml\n",
      "civicrm.xml\n",
      "coffee.xml\n",
      "cogsci.xml\n",
      "computergraphics.xml\n",
      "conlang.xml\n",
      "craftcms.xml\n",
      "crafts.xml\n",
      "cseducators.xml\n",
      "cstheory.xml\n",
      "devops.xml\n",
      "drones.xml\n",
      "earthscience.xml\n",
      "ebooks.xml\n",
      "economics.xml\n",
      "elementaryos.xml\n",
      "engineering.xml\n",
      "eosio.xml\n",
      "esperanto.xml\n",
      "expatriates.xml\n",
      "expressionengine.xml\n",
      "fitness.xml\n",
      "freelancing.xml\n",
      "french.xml\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\jairo\\AppData\\Local\\Temp\\ipykernel_16064\\628546003.py:25: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  df['CreationDate'] = pd.to_datetime(df['CreationDate'])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "gardening.xml\n",
      "genealogy.xml\n",
      "ham.xml\n",
      "hardwarerecs.xml\n",
      "health.xml\n",
      "homebrew.xml\n",
      "hsm.xml\n",
      "interpersonal.xml\n",
      "iot.xml\n",
      "iota.xml\n",
      "italian.xml\n",
      "joomla.xml\n",
      "korean.xml\n",
      "langdev.xml\n",
      "languagelearning.xml\n",
      "latin.xml\n",
      "lifehacks.xml\n",
      "linguistics.xml\n",
      "literature.xml\n",
      "martialarts.xml\n",
      "materials.xml\n",
      "matheducators.xml\n",
      "moderators.xml\n",
      "monero.xml\n",
      "musicfans.xml\n",
      "mythology.xml\n",
      "networkengineering.xml\n",
      "opendata.xml\n",
      "opensource.xml\n",
      "or.xml\n",
      "outdoors.xml\n",
      "parenting.xml\n",
      "patents.xml\n",
      "pets.xml\n",
      "pm.xml\n",
      "poker.xml\n",
      "portuguese.xml\n",
      "proofassistants.xml\n",
      "quantumcomputing.xml\n",
      "retrocomputing.xml\n",
      "reverseengineering.xml\n",
      "russian.xml\n",
      "scicomp.xml\n",
      "sitecore.xml\n",
      "solana.xml\n",
      "sound.xml\n",
      "spanish.xml\n",
      "sports.xml\n",
      "sqa.xml\n",
      "stellar.xml\n",
      "substrate.xml\n",
      "sustainability.xml\n",
      "tezos.xml\n",
      "tor.xml\n",
      "tridion.xml\n",
      "ukrainian.xml\n",
      "vegetarianism.xml\n",
      "vi.xml\n",
      "windowsphone.xml\n",
      "woodworking.xml\n"
     ]
    }
   ],
   "source": [
    "# Build one combined DataFrame from every per-site XML file and save it as CSV.\n",
    "xml_folder = \"xml\"  # Must match the 'xml/' prefix that process_xml hard-codes\n",
    "\n",
    "# Sort so the CSV row order does not depend on the OS directory-listing order, and\n",
    "# keep only .xml files so stray files in the folder are never handed to the XML parser.\n",
    "files = sorted(\n",
    "    f for f in os.listdir(xml_folder)\n",
    "    if f.lower().endswith('.xml') and os.path.isfile(os.path.join(xml_folder, f))\n",
    ")\n",
    "\n",
    "site_frames = []\n",
    "for file in files:\n",
    "    print(file)\n",
    "    df_temp = process_xml(file)\n",
    "    # process_xml returns None when a site lacks enough posts in any year 2018-2023;\n",
    "    # skip those explicitly instead of relying on pd.concat silently dropping None.\n",
    "    if df_temp is not None:\n",
    "        site_frames.append(df_temp)\n",
    "\n",
    "if not site_frames:\n",
    "    raise ValueError(f'No site in {xml_folder!r} passed the per-year filter; nothing to save')\n",
    "\n",
    "df_final = pd.concat(site_frames).reset_index(drop = True)\n",
    "df_final.to_csv('raw_stackexchange.csv', index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "d1de1f4d-4db5-4d4c-9ee6-57d7f12a81cc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "84"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct site labels present in the combined dataset\n",
    "df_final['Label'].nunique(dropna=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "82f839d0-9d06-4dc3-82a8-c6c6b1cb839f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(499359, 4)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the combined DataFrame\n",
    "df_final.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "6627ec4f-aabc-4629-83b8-c92297c58772",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>CreationDate</th>\n",
       "      <th>Year</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Label</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3dprinting</th>\n",
       "      <td>8257</td>\n",
       "      <td>8257</td>\n",
       "      <td>8257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ai</th>\n",
       "      <td>13690</td>\n",
       "      <td>13690</td>\n",
       "      <td>13690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>anime</th>\n",
       "      <td>8257</td>\n",
       "      <td>8257</td>\n",
       "      <td>8257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>astronomy</th>\n",
       "      <td>13089</td>\n",
       "      <td>13089</td>\n",
       "      <td>13089</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>avp</th>\n",
       "      <td>6535</td>\n",
       "      <td>6535</td>\n",
       "      <td>6535</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tridion</th>\n",
       "      <td>2654</td>\n",
       "      <td>2654</td>\n",
       "      <td>2654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ukrainian</th>\n",
       "      <td>2544</td>\n",
       "      <td>2544</td>\n",
       "      <td>2544</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vegetarianism</th>\n",
       "      <td>772</td>\n",
       "      <td>772</td>\n",
       "      <td>772</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vi</th>\n",
       "      <td>16446</td>\n",
       "      <td>16446</td>\n",
       "      <td>16446</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>woodworking</th>\n",
       "      <td>3369</td>\n",
       "      <td>3369</td>\n",
       "      <td>3369</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>84 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                Text  CreationDate   Year\n",
       "Label                                    \n",
       "3dprinting      8257          8257   8257\n",
       "ai             13690         13690  13690\n",
       "anime           8257          8257   8257\n",
       "astronomy      13089         13089  13089\n",
       "avp             6535          6535   6535\n",
       "...              ...           ...    ...\n",
       "tridion         2654          2654   2654\n",
       "ukrainian       2544          2544   2544\n",
       "vegetarianism    772           772    772\n",
       "vi             16446         16446  16446\n",
       "woodworking     3369          3369   3369\n",
       "\n",
       "[84 rows x 3 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Non-null value counts of every other column, grouped by site Label\n",
    "df_final.groupby('Label').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "1c197314-5e0f-4580-a6e9-1c40f64ada38",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>CreationDate</th>\n",
       "      <th>Label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Year</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2018</th>\n",
       "      <td>117132</td>\n",
       "      <td>117132</td>\n",
       "      <td>117132</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2019</th>\n",
       "      <td>104621</td>\n",
       "      <td>104621</td>\n",
       "      <td>104621</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2020</th>\n",
       "      <td>95920</td>\n",
       "      <td>95920</td>\n",
       "      <td>95920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2021</th>\n",
       "      <td>75136</td>\n",
       "      <td>75136</td>\n",
       "      <td>75136</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2022</th>\n",
       "      <td>56804</td>\n",
       "      <td>56804</td>\n",
       "      <td>56804</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2023</th>\n",
       "      <td>49746</td>\n",
       "      <td>49746</td>\n",
       "      <td>49746</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Text  CreationDate   Label\n",
       "Year                              \n",
       "2018  117132        117132  117132\n",
       "2019  104621        104621  104621\n",
       "2020   95920         95920   95920\n",
       "2021   75136         75136   75136\n",
       "2022   56804         56804   56804\n",
       "2023   49746         49746   49746"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Non-null value counts of every other column, grouped by Year across all sites\n",
    "df_final.groupby(['Year']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "fc797712-0476-44a5-91d0-cb1321c74fa7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>CreationDate</th>\n",
       "      <th>Label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Year</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2018</th>\n",
       "      <td>1777</td>\n",
       "      <td>1777</td>\n",
       "      <td>1777</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2019</th>\n",
       "      <td>2824</td>\n",
       "      <td>2824</td>\n",
       "      <td>2824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2020</th>\n",
       "      <td>3290</td>\n",
       "      <td>3290</td>\n",
       "      <td>3290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2021</th>\n",
       "      <td>2223</td>\n",
       "      <td>2223</td>\n",
       "      <td>2223</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2022</th>\n",
       "      <td>1714</td>\n",
       "      <td>1714</td>\n",
       "      <td>1714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2023</th>\n",
       "      <td>1862</td>\n",
       "      <td>1862</td>\n",
       "      <td>1862</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Text  CreationDate  Label\n",
       "Year                           \n",
       "2018  1777          1777   1777\n",
       "2019  2824          2824   2824\n",
       "2020  3290          3290   3290\n",
       "2021  2223          2223   2223\n",
       "2022  1714          1714   1714\n",
       "2023  1862          1862   1862"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-year non-null counts restricted to rows whose Label is 'ai'\n",
    "df_final.loc[df_final['Label'].eq('ai')].groupby('Year').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09955716-4827-4106-8a8b-12f883d2f0ef",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
