{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ingest StackExchange data dumps"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/stackexchange-builder/stackexchange-builder.ipynb)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "id": "TB7CEfs8F-8u"
   },
   "source": [
    "This notebook takes a StackExchange Data dump \"Posts.xml\" file and ingests it into a Pandas Dataframe. Outputs of the file can be JSON, JSONL, Parquet, or CSV. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0rHryQttyzyY"
   },
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup as bs\n",
    "import pandas as pd\n",
    "import requests\n",
    "import json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "15mAL7GnzBv0"
   },
   "source": [
    "# Extract StackExchange\n",
    "Pull StackExchange file dumps. Specific column types are enforced to prevent errors on processing later in the notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "FtcvUEaHVxcW",
    "outputId": "5b0cb19d-e3d9-422b-9077-52241bd09e0e"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['3dprinting_meta', '3dprinting', 'Stackoverflow_com_Posts_7z', 'academia_meta', 'academia', 'ai_meta', 'ai', 'android_meta', 'android', 'anime_meta', 'anime', 'apple_meta', 'apple', 'arduino_meta', 'arduino', 'askubuntu_com_7z', 'astronomy_meta', 'astronomy', 'aviation_meta', 'aviation', 'avp_meta', 'avp', 'beer_meta', 'beer', 'bicycles_meta', 'bicycles', 'bioacoustics_meta', 'bioacoustics', 'bioinformatics_meta', 'bioinformatics', 'biology_meta', 'biology', 'bitcoin_meta', 'bitcoin', 'blender_meta', 'blender', 'boardgames_meta', 'boardgames', 'bricks_meta', 'bricks', 'buddhism_meta', 'buddhism', 'cardano_meta', 'cardano', 'chemistry_meta', 'chemistry', 'chess_meta', 'chess', 'chinese_meta', 'chinese', 'christianity_meta', 'christianity', 'civicrm_meta', 'civicrm', 'codegolf_meta', 'codegolf', 'codereview_meta', 'codereview', 'coffee_meta', 'coffee', 'cogsci_meta', 'cogsci', 'computergraphics_meta', 'computergraphics', 'conlang_meta', 'conlang', 'cooking_meta', 'cooking', 'craftcms_meta', 'craftcms', 'crafts_meta', 'crafts', 'crypto_meta', 'crypto', 'cs_meta', 'cs', 'cseducators_meta', 'cseducators', 'cstheory_meta', 'cstheory', 'datascience_meta', 'datascience', 'dba_meta', 'dba', 'devops_meta', 'devops', 'diy_meta', 'diy', 'drones_meta', 'drones', 'drupal_meta', 'drupal', 'dsp_meta', 'dsp', 'earthscience_meta', 'earthscience', 'ebooks_meta', 'ebooks', 'economics_meta', 'economics', 'electronics_meta', 'electronics', 'elementaryos_meta', 'elementaryos', 'ell_meta', 'ell', 'emacs_meta', 'emacs', 'engineering_meta', 'engineering', 'english_meta', 'english', 'eosio_meta', 'eosio', 'es_meta_stackoverflow_com_7z', 'es_stackoverflow_com_7z', 'esperanto_meta', 'esperanto', 'ethereum_meta', 'ethereum', 'expatriates_meta', 'expatriates', 'expressionengine_meta', 'expressionengine', 'fitness_meta', 'fitness', 'freelancing_meta', 'freelancing', 'french_meta', 'french', 'gamedev_meta', 'gamedev', 'gaming_meta', 'gaming', 'gardening_meta', 'gardening', 
'genealogy_meta', 'genealogy', 'german_meta', 'german', 'gis_meta', 'gis', 'graphicdesign_meta', 'graphicdesign', 'ham_meta', 'ham', 'hardwarerecs_meta', 'hardwarerecs', 'health_meta', 'health', 'hermeneutics_meta', 'hermeneutics', 'hinduism_meta', 'hinduism', 'history_meta', 'history', 'homebrew_meta', 'homebrew', 'hsm_meta', 'hsm', 'interpersonal_meta', 'interpersonal', 'iot_meta', 'iot', 'iota_meta', 'iota', 'islam_meta', 'islam', 'italian_meta', 'italian', 'ja_meta_stackoverflow_com_7z', 'ja_stackoverflow_com_7z', 'japanese_meta', 'japanese', 'joomla_meta', 'joomla', 'judaism_meta', 'judaism', 'korean_meta', 'korean', 'languagelearning_meta', 'languagelearning', 'latin_meta', 'latin', 'law_meta', 'law', 'lifehacks_meta', 'lifehacks', 'linguistics_meta', 'linguistics', 'literature_meta', 'literature', 'magento_meta', 'magento', 'martialarts_meta', 'martialarts', 'materials_meta', 'materials', 'math_meta', 'math', 'matheducators_meta', 'matheducators', 'mathematica_meta', 'mathematica', 'mathoverflow_net_7z', 'mechanics_meta', 'mechanics', 'meta_askubuntu_com_7z', 'meta_mathoverflow_net_7z', 'meta_serverfault_com_7z', 'meta', 'meta_stackoverflow_com_7z', 'meta_superuser_com_7z', 'moderators_meta', 'moderators', 'monero_meta', 'monero', 'money_meta', 'money', 'movies_meta', 'movies', 'music_meta', 'music', 'musicfans_meta', 'musicfans', 'mythology_meta', 'mythology', 'networkengineering_meta', 'networkengineering', 'opendata_meta', 'opendata', 'opensource_meta', 'opensource', 'or_meta', 'or', 'outdoors_meta', 'outdoors', 'parenting_meta', 'parenting', 'patents_meta', 'patents', 'pets_meta', 'pets', 'philosophy_meta', 'philosophy', 'photo_meta', 'photo', 'physics_meta', 'physics', 'pm_meta', 'pm', 'poker_meta', 'poker', 'politics_meta', 'politics', 'portuguese_meta', 'portuguese', 'proofassistants_meta', 'proofassistants', 'pt_meta_stackoverflow_com_7z', 'pt_stackoverflow_com_7z', 'puzzling_meta', 'puzzling', 'quant_meta', 'quant', 'quantumcomputing_meta', 
'quantumcomputing', 'raspberrypi_meta', 'raspberrypi', 'retrocomputing_meta', 'retrocomputing', 'reverseengineering_meta', 'reverseengineering', 'robotics_meta', 'robotics', 'rpg_meta', 'rpg', 'ru_meta_stackoverflow_com_7z', 'ru_stackoverflow_com_7z', 'rus_meta', 'rus', 'russian_meta', 'russian', 'salesforce_meta', 'salesforce', 'scicomp_meta', 'scicomp', 'scifi_meta', 'scifi', 'security_meta', 'security', 'serverfault_com_7z', 'sharepoint_meta', 'sharepoint', 'sitecore_meta', 'sitecore', 'skeptics_meta', 'skeptics', 'softwareengineering_meta', 'softwareengineering', 'softwarerecs_meta', 'softwarerecs', 'solana_meta', 'solana', 'sound_meta', 'sound', 'space_meta', 'space', 'spanish_meta', 'spanish', 'sports_meta', 'sports', 'sqa_meta', 'sqa', 'stackapps_com_7z', 'stackoverflow_com_Badges_7z', 'stackoverflow_com_Comments_7z', 'stackoverflow_com_PostHistory_7z', 'stackoverflow_com_PostLinks_7z', 'stackoverflow_com_Tags_7z', 'stackoverflow_com_Users_7z', 'stackoverflow_com_Votes_7z', 'stats_meta', 'stats', 'stellar_meta', 'stellar', 'substrate_meta', 'substrate', 'superuser_com_7z', 'sustainability_meta', 'sustainability', 'tex_meta', 'tex', 'tezos_meta', 'tezos', 'tor_meta', 'tor', 'travel_meta', 'travel', 'tridion_meta', 'tridion', 'ukrainian_meta', 'ukrainian', 'unix_meta', 'unix', 'ux_meta', 'ux', 'vegetarianism_meta', 'vegetarianism', 'vi_meta', 'vi', 'webapps_meta', 'webapps', 'webmasters_meta', 'webmasters', 'windowsphone_meta', 'windowsphone', 'woodworking_meta', 'woodworking', 'wordpress_meta', 'wordpress', 'workplace_meta', 'workplace', 'worldbuilding_meta', 'worldbuilding', 'writers_meta', 'writers'])\n",
      "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\n"
     ]
    }
   ],
   "source": [
    "base_url = \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml\"\n",
    "\n",
    "\n",
    "def get_all_filenames():\n",
    "    response = requests.get(\"https://archive.org/download/stackexchange\")\n",
    "    if response.ok:\n",
    "        soup = bs(response.content, \"html.parser\")\n",
    "        table = soup.find(\"table\")\n",
    "        link_tags = table.find_all(\"a\")\n",
    "        urls = {}\n",
    "        for link in link_tags:\n",
    "            url = link[\"href\"]\n",
    "            name = url.split(\".stackexchange\")[0].replace(\".\", \"_\").replace(\"-\", \"_\")\n",
    "            if url.endswith(\"7z\"):\n",
    "                urls[name] = base_url.format(url)\n",
    "        return urls\n",
    "\n",
    "\n",
    "urls = get_all_filenames()\n",
    "\n",
    "print(urls.keys())\n",
    "print(urls.get(\"ai\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 907
    },
    "id": "-t27RnxdzBYB",
    "outputId": "5ec0ceed-c82b-48fa-facd-41b4aae2f9e6"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Id                        int64\n",
      "PostTypeId                int64\n",
      "AcceptedAnswerId          int64\n",
      "CreationDate             object\n",
      "Score                     int64\n",
      "ViewCount                 int64\n",
      "Body                     object\n",
      "OwnerUserId              object\n",
      "LastEditorUserId         object\n",
      "LastEditDate             object\n",
      "LastActivityDate         object\n",
      "Title                    object\n",
      "Tags                     object\n",
      "AnswerCount               int64\n",
      "CommentCount              int64\n",
      "ContentLicense           object\n",
      "ParentId                  int64\n",
      "ClosedDate               object\n",
      "FavoriteCount            object\n",
      "CommunityOwnedDate       object\n",
      "LastEditorDisplayName    object\n",
      "OwnerDisplayName         object\n",
      "DataSource               object\n",
      "dtype: object\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "  <div id=\"df-776df830-b974-4f15-9190-53bcb8e84bf8\">\n",
       "    <div class=\"colab-df-container\">\n",
       "      <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>PostTypeId</th>\n",
       "      <th>AcceptedAnswerId</th>\n",
       "      <th>CreationDate</th>\n",
       "      <th>Score</th>\n",
       "      <th>ViewCount</th>\n",
       "      <th>Body</th>\n",
       "      <th>OwnerUserId</th>\n",
       "      <th>LastEditorUserId</th>\n",
       "      <th>LastEditDate</th>\n",
       "      <th>...</th>\n",
       "      <th>AnswerCount</th>\n",
       "      <th>CommentCount</th>\n",
       "      <th>ContentLicense</th>\n",
       "      <th>ParentId</th>\n",
       "      <th>ClosedDate</th>\n",
       "      <th>FavoriteCount</th>\n",
       "      <th>CommunityOwnedDate</th>\n",
       "      <th>LastEditorDisplayName</th>\n",
       "      <th>OwnerDisplayName</th>\n",
       "      <th>DataSource</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2016-08-02T15:39:14.947</td>\n",
       "      <td>10</td>\n",
       "      <td>710</td>\n",
       "      <td>&lt;p&gt;What does \"backprop\" mean? Is the \"backprop...</td>\n",
       "      <td>8</td>\n",
       "      <td>2444</td>\n",
       "      <td>2019-11-16T17:56:22.093</td>\n",
       "      <td>...</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>CC BY-SA 4.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>2016-08-02T15:40:20.623</td>\n",
       "      <td>14</td>\n",
       "      <td>1008</td>\n",
       "      <td>&lt;p&gt;Does increasing the noise in data help to i...</td>\n",
       "      <td>8</td>\n",
       "      <td>2444</td>\n",
       "      <td>2019-02-23T22:36:19.090</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>CC BY-SA 4.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2016-08-02T15:40:24.820</td>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;p&gt;\"Backprop\" is the same as \"backpropagation\"...</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>CC BY-SA 3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>2016-08-02T15:41:22.020</td>\n",
       "      <td>33</td>\n",
       "      <td>1266</td>\n",
       "      <td>&lt;p&gt;When you're writing your algorithm, how do ...</td>\n",
       "      <td>8</td>\n",
       "      <td>2444</td>\n",
       "      <td>2021-01-19T23:54:07.813</td>\n",
       "      <td>...</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>CC BY-SA 3.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "      <td>2016-08-02T15:43:35.460</td>\n",
       "      <td>7</td>\n",
       "      <td>279</td>\n",
       "      <td>&lt;p&gt;Given the following definition of an intell...</td>\n",
       "      <td>29</td>\n",
       "      <td>2444</td>\n",
       "      <td>2019-06-15T18:25:58.513</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>CC BY-SA 4.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://ia600107.us.archive.org/view_archive.p...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>\n",
       "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-776df830-b974-4f15-9190-53bcb8e84bf8')\"\n",
       "              title=\"Convert this dataframe to an interactive table.\"\n",
       "              style=\"display:none;\">\n",
       "        \n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "       width=\"24px\">\n",
       "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
       "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
       "  </svg>\n",
       "      </button>\n",
       "      \n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      flex-wrap:wrap;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "      <script>\n",
       "        const buttonEl =\n",
       "          document.querySelector('#df-776df830-b974-4f15-9190-53bcb8e84bf8 button.colab-df-convert');\n",
       "        buttonEl.style.display =\n",
       "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "        async function convertToInteractive(key) {\n",
       "          const element = document.querySelector('#df-776df830-b974-4f15-9190-53bcb8e84bf8');\n",
       "          const dataTable =\n",
       "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                     [key], {});\n",
       "          if (!dataTable) return;\n",
       "\n",
       "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "            + ' to learn more about interactive tables.';\n",
       "          element.innerHTML = '';\n",
       "          dataTable['output_type'] = 'display_data';\n",
       "          await google.colab.output.renderOutput(dataTable, element);\n",
       "          const docLink = document.createElement('div');\n",
       "          docLink.innerHTML = docLinkHtml;\n",
       "          element.appendChild(docLink);\n",
       "        }\n",
       "      </script>\n",
       "    </div>\n",
       "  </div>\n",
       "  "
      ],
      "text/plain": [
       "   Id  PostTypeId  AcceptedAnswerId             CreationDate  Score  \\\n",
       "0   1           1                 3  2016-08-02T15:39:14.947     10   \n",
       "1   2           1                 9  2016-08-02T15:40:20.623     14   \n",
       "2   3           2                 0  2016-08-02T15:40:24.820     15   \n",
       "3   4           1                12  2016-08-02T15:41:22.020     33   \n",
       "4   6           1                20  2016-08-02T15:43:35.460      7   \n",
       "\n",
       "   ViewCount                                               Body OwnerUserId  \\\n",
       "0        710  <p>What does \"backprop\" mean? Is the \"backprop...           8   \n",
       "1       1008  <p>Does increasing the noise in data help to i...           8   \n",
       "2          0  <p>\"Backprop\" is the same as \"backpropagation\"...           4   \n",
       "3       1266  <p>When you're writing your algorithm, how do ...           8   \n",
       "4        279  <p>Given the following definition of an intell...          29   \n",
       "\n",
       "  LastEditorUserId             LastEditDate  ... AnswerCount CommentCount  \\\n",
       "0             2444  2019-11-16T17:56:22.093  ...           5            0   \n",
       "1             2444  2019-02-23T22:36:19.090  ...           3            0   \n",
       "2              NaN                      NaN  ...           0            0   \n",
       "3             2444  2021-01-19T23:54:07.813  ...           4            0   \n",
       "4             2444  2019-06-15T18:25:58.513  ...           2            0   \n",
       "\n",
       "  ContentLicense  ParentId  ClosedDate FavoriteCount  CommunityOwnedDate  \\\n",
       "0   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
       "1   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
       "2   CC BY-SA 3.0         1         NaN           NaN                 NaN   \n",
       "3   CC BY-SA 3.0         0         NaN           NaN                 NaN   \n",
       "4   CC BY-SA 4.0         0         NaN           NaN                 NaN   \n",
       "\n",
       "  LastEditorDisplayName OwnerDisplayName  \\\n",
       "0                   NaN              NaN   \n",
       "1                   NaN              NaN   \n",
       "2                   NaN              NaN   \n",
       "3                   NaN              NaN   \n",
       "4                   NaN              NaN   \n",
       "\n",
       "                                          DataSource  \n",
       "0  https://ia600107.us.archive.org/view_archive.p...  \n",
       "1  https://ia600107.us.archive.org/view_archive.p...  \n",
       "2  https://ia600107.us.archive.org/view_archive.p...  \n",
       "3  https://ia600107.us.archive.org/view_archive.p...  \n",
       "4  https://ia600107.us.archive.org/view_archive.p...  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 219,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xml_format_map = {\n",
    "    \"Id\": int,\n",
    "    \"PostTypeId\": int,\n",
    "    \"CreationDate\": str,\n",
    "    \"Score\": int,\n",
    "    \"ViewCount\": int,\n",
    "    \"Body\": str,\n",
    "    \"AnswerCount\": int,\n",
    "    \"CommentCount\": int,\n",
    "    \"ContentLicense\": str,\n",
    "    \"AcceptedAnswerId\": int,\n",
    "    \"ParentId\": int,\n",
    "}\n",
    "\n",
    "\n",
    "# def extract_xml_file(file_url: str):\n",
    "#   table = pd.read_xml(file_url)\n",
    "#   return table\n",
    "\n",
    "\n",
    "def xml_to_df(response: str):\n",
    "    \"\"\"\n",
    "    Collect and Manually import XML into Dataframe\n",
    "\n",
    "    pd.read_xml() errors when XML trees are too large, this is just a hack to\n",
    "    download a XML file and parse into a Dataframe. **Not Tested on huge XML files**\n",
    "\n",
    "    Parameters:\n",
    "    response (Requests.Response): Requests response object with the XML data\n",
    "\n",
    "    Returns:\n",
    "    df (DataFrame): A Dataframe from the XML file\n",
    "    \"\"\"\n",
    "    soup = bs(response.content, \"xml\")\n",
    "    posts = soup.find_all(\"row\")\n",
    "\n",
    "    all_posts = [post.attrs for post in posts]\n",
    "\n",
    "    df = pd.DataFrame(all_posts)\n",
    "    df.AnswerCount.fillna(0, inplace=True)\n",
    "    df.ViewCount.fillna(0, inplace=True)\n",
    "    df.AcceptedAnswerId.fillna(0, inplace=True)\n",
    "    df.ParentId.fillna(0, inplace=True)\n",
    "    df[\"DataSource\"] = response.url\n",
    "    df = df.astype(xml_format_map)\n",
    "    return df\n",
    "\n",
    "\n",
    "dataset_name = \"ai\"\n",
    "\n",
    "xml_posts_path = urls.get(dataset_name)\n",
    "\n",
    "\n",
    "# df = extract_xml_file(test)\n",
    "response = requests.get(xml_posts_path)\n",
    "df = xml_to_df(response)\n",
    "\n",
    "\n",
    "print(df.dtypes)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "RAzTR7zY3oan"
   },
   "source": [
    "# Transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 424
    },
    "id": "qyUqc31Z3Z9g",
    "outputId": "18dce8b4-af26-49c9-ee73-6c677177b516"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "  <div id=\"df-c809ce1f-6807-4dfd-97c9-38d47afa28d7\">\n",
       "    <div class=\"colab-df-container\">\n",
       "      <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Body</th>\n",
       "      <th>BodyClean</th>\n",
       "      <th>Tags</th>\n",
       "      <th>TagsClean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>&lt;p&gt;What does \"backprop\" mean? Is the \"backprop...</td>\n",
       "      <td>What does \"backprop\" mean? Is the \"backprop\" t...</td>\n",
       "      <td>&lt;neural-networks&gt;&lt;backpropagation&gt;&lt;terminology...</td>\n",
       "      <td>neural networks, backpropagation, terminology,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;p&gt;Does increasing the noise in data help to i...</td>\n",
       "      <td>Does increasing the noise in data help to impr...</td>\n",
       "      <td>&lt;neural-networks&gt;&lt;machine-learning&gt;&lt;statistica...</td>\n",
       "      <td>neural networks, machine learning, statistical...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;p&gt;\"Backprop\" is the same as \"backpropagation\"...</td>\n",
       "      <td>\"Backprop\" is the same as \"backpropagation\": i...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;p&gt;When you're writing your algorithm, how do ...</td>\n",
       "      <td>When you're writing your algorithm, how do you...</td>\n",
       "      <td>&lt;neural-networks&gt;&lt;hyperparameter-optimization&gt;...</td>\n",
       "      <td>neural networks, hyperparameter optimization, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;p&gt;Given the following definition of an intell...</td>\n",
       "      <td>Given the following definition of an intellige...</td>\n",
       "      <td>&lt;philosophy&gt;&lt;definitions&gt;&lt;intelligent-agent&gt;</td>\n",
       "      <td>philosophy, definitions, intelligent agent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23174</th>\n",
       "      <td>&lt;p&gt;The purpose of evaluating the state and act...</td>\n",
       "      <td>The purpose of evaluating the state and action...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23175</th>\n",
       "      <td>&lt;p&gt;In machine translation, convolution is a te...</td>\n",
       "      <td>In machine translation, convolution is a techn...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23176</th>\n",
       "      <td>&lt;p&gt;One of the key features of ChatGPT is its a...</td>\n",
       "      <td>One of the key features of ChatGPT is its abil...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23177</th>\n",
       "      <td>&lt;p&gt;Given a neural network model for Covid-19 c...</td>\n",
       "      <td>Given a neural network model for Covid-19 clas...</td>\n",
       "      <td>&lt;neural-networks&gt;&lt;homework&gt;</td>\n",
       "      <td>neural networks, homework</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23178</th>\n",
       "      <td>&lt;p&gt;My question is more related to the fundamen...</td>\n",
       "      <td>My question is more related to the fundamental...</td>\n",
       "      <td>&lt;search&gt;&lt;constraint-satisfaction-problems&gt;</td>\n",
       "      <td>search, constraint satisfaction problems</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>23179 rows × 4 columns</p>\n",
       "</div>\n",
       "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c809ce1f-6807-4dfd-97c9-38d47afa28d7')\"\n",
       "              title=\"Convert this dataframe to an interactive table.\"\n",
       "              style=\"display:none;\">\n",
       "        \n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "       width=\"24px\">\n",
       "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
       "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
       "  </svg>\n",
       "      </button>\n",
       "      \n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      flex-wrap:wrap;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "      <script>\n",
       "        const buttonEl =\n",
       "          document.querySelector('#df-c809ce1f-6807-4dfd-97c9-38d47afa28d7 button.colab-df-convert');\n",
       "        buttonEl.style.display =\n",
       "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "        async function convertToInteractive(key) {\n",
       "          const element = document.querySelector('#df-c809ce1f-6807-4dfd-97c9-38d47afa28d7');\n",
       "          const dataTable =\n",
       "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                     [key], {});\n",
       "          if (!dataTable) return;\n",
       "\n",
       "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "            + ' to learn more about interactive tables.';\n",
       "          element.innerHTML = '';\n",
       "          dataTable['output_type'] = 'display_data';\n",
       "          await google.colab.output.renderOutput(dataTable, element);\n",
       "          const docLink = document.createElement('div');\n",
       "          docLink.innerHTML = docLinkHtml;\n",
       "          element.appendChild(docLink);\n",
       "        }\n",
       "      </script>\n",
       "    </div>\n",
       "  </div>\n",
       "  "
      ],
      "text/plain": [
       "                                                    Body  \\\n",
       "0      <p>What does \"backprop\" mean? Is the \"backprop...   \n",
       "1      <p>Does increasing the noise in data help to i...   \n",
       "2      <p>\"Backprop\" is the same as \"backpropagation\"...   \n",
       "3      <p>When you're writing your algorithm, how do ...   \n",
       "4      <p>Given the following definition of an intell...   \n",
       "...                                                  ...   \n",
       "23174  <p>The purpose of evaluating the state and act...   \n",
       "23175  <p>In machine translation, convolution is a te...   \n",
       "23176  <p>One of the key features of ChatGPT is its a...   \n",
       "23177  <p>Given a neural network model for Covid-19 c...   \n",
       "23178  <p>My question is more related to the fundamen...   \n",
       "\n",
       "                                               BodyClean  \\\n",
       "0      What does \"backprop\" mean? Is the \"backprop\" t...   \n",
       "1      Does increasing the noise in data help to impr...   \n",
       "2      \"Backprop\" is the same as \"backpropagation\": i...   \n",
       "3      When you're writing your algorithm, how do you...   \n",
       "4      Given the following definition of an intellige...   \n",
       "...                                                  ...   \n",
       "23174  The purpose of evaluating the state and action...   \n",
       "23175  In machine translation, convolution is a techn...   \n",
       "23176  One of the key features of ChatGPT is its abil...   \n",
       "23177  Given a neural network model for Covid-19 clas...   \n",
       "23178  My question is more related to the fundamental...   \n",
       "\n",
       "                                                    Tags  \\\n",
       "0      <neural-networks><backpropagation><terminology...   \n",
       "1      <neural-networks><machine-learning><statistica...   \n",
       "2                                                    NaN   \n",
       "3      <neural-networks><hyperparameter-optimization>...   \n",
       "4           <philosophy><definitions><intelligent-agent>   \n",
       "...                                                  ...   \n",
       "23174                                                NaN   \n",
       "23175                                                NaN   \n",
       "23176                                                NaN   \n",
       "23177                        <neural-networks><homework>   \n",
       "23178         <search><constraint-satisfaction-problems>   \n",
       "\n",
       "                                               TagsClean  \n",
       "0      neural networks, backpropagation, terminology,...  \n",
       "1      neural networks, machine learning, statistical...  \n",
       "2                                                    NaN  \n",
       "3      neural networks, hyperparameter optimization, ...  \n",
       "4             philosophy, definitions, intelligent agent  \n",
       "...                                                  ...  \n",
       "23174                                                NaN  \n",
       "23175                                                NaN  \n",
       "23176                                                NaN  \n",
       "23177                          neural networks, homework  \n",
       "23178           search, constraint satisfaction problems  \n",
       "\n",
       "[23179 rows x 4 columns]"
      ]
     },
     "execution_count": 220,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def filter_only_questions_with_accepted_answers(df):\n",
    "    \"\"\"**TODO**\n",
    "    Filter only to Questions with Accepted Answers\n",
    "\n",
    "    Filter dataframe by questions that have accepted answers, should also include\n",
    "    all rows of answers for those questions, even if not accepted.\n",
    "\n",
    "    Parameters:\n",
    "    df (DataFrame): containing a \"AcceptedAnswerId\", \"Id\", and \"ParentId\" columns\n",
    "\n",
    "    Returns:\n",
    "    df (DataFrame): current dataframe with filtered results\n",
    "    \"\"\"\n",
    "    df = df[(df[\"AcceptedAnswerId\"].notnull()) | (df[\"ParentId\"] == df[\"Id\"])]\n",
    "\n",
    "\n",
    "def filter_scores_above(df, question_score_threshold: int = 20, answer_score_threshold: int = 20):\n",
    "    \"\"\"**TODO**\n",
    "    Filter Dataframe by minimum scores\n",
    "\n",
    "    Filter Question and Answer columns by score thresholds to trim lower scoring results\n",
    "\n",
    "    Parameters:\n",
    "    df (DataFrame): containing a \"Score\" column\n",
    "\n",
    "    Returns:\n",
    "    df (DataFrame): current dataframe with filtered results\n",
    "    \"\"\"\n",
    "    df = df[\n",
    "        ((df[\"Score\"] >= question_score_threshold) & (df.PostTypeId == 1))\n",
    "        | ((df[\"Score\"] >= answer_score_threshold) & (df.PostTypeId == 2))\n",
    "    ]\n",
    "\n",
    "\n",
    "def convert_html_to_text(df, column: str = \"Body\"):\n",
    "    \"\"\"\n",
    "    Convert HTML tags to pure text\n",
    "\n",
    "    Feeds HTML text body into BeautifulSoup to parse it to only text. Set aside as\n",
    "    function to provide option to skip\n",
    "\n",
    "    Parameters:\n",
    "    df (DataFrame): containing a \"Body\" column with HTML\n",
    "\n",
    "    Returns:\n",
    "    df (DataFrame): current dataframe with parsed column\n",
    "    \"\"\"\n",
    "    df.dropna(subset=[column], inplace=True)\n",
    "    df[f\"{column}Clean\"] = df[column].apply(lambda row: bs(row, \"html.parser\").text)\n",
    "\n",
    "\n",
    "def clean_tags(df):\n",
    "    \"\"\"\n",
    "    Convert Tags into Comma separated\n",
    "\n",
    "    Converts Tag slugs into commas separated tags\n",
    "\n",
    "    Parameters:\n",
    "    df (DataFrame): containing a \"Tags\" column with slugs\n",
    "\n",
    "    Returns:\n",
    "    df (DataFrame): current dataframe with parsed column\n",
    "    \"\"\"\n",
    "    df[\"TagsClean\"] = df[\"Tags\"].str.replace(\"-\", \" \").str.replace(\"><\", \", \").str.replace(\"<\", \"\").str.replace(\">\", \"\")\n",
    "\n",
    "\n",
    "# filter_only_questions_with_accepted_answers(df)\n",
    "# filter_scores_above(df)\n",
    "convert_html_to_text(df)\n",
    "clean_tags(df)\n",
    "\n",
    "df[[\"Body\", \"BodyClean\", \"Tags\", \"TagsClean\"]]\n",
    "# print(df.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "C09Bwdw-44PZ"
   },
   "source": [
    "This groups questions with answers so that a row with a question also has a column with an answer. It then creates an AcceptedAnswerFlag column that is True if the answer was accepted by the person who asked the question. Changing the `number_of_results` variable will limit the number of answers you want to keep."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 337
    },
    "id": "Bgz2fZ9k43Ab",
    "outputId": "28896d69-03cd-4877-fdfb-ae48dafa4ff3"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "  <div id=\"df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5\">\n",
       "    <div class=\"colab-df-container\">\n",
       "      <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id_q</th>\n",
       "      <th>Question</th>\n",
       "      <th>QuestionScore</th>\n",
       "      <th>QuestionTags</th>\n",
       "      <th>Id_a</th>\n",
       "      <th>Answer</th>\n",
       "      <th>AnswerScore</th>\n",
       "      <th>AcceptedAnswerFlag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1768</td>\n",
       "      <td>In Portal 2 we see that AI's can be \"killed\" b...</td>\n",
       "      <td>175</td>\n",
       "      <td>philosophy, decision theory, mythology of ai, ...</td>\n",
       "      <td>1769.0</td>\n",
       "      <td>This classic problem exhibits a basic misunder...</td>\n",
       "      <td>146.0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10623</td>\n",
       "      <td>What is self-supervised learning in machine le...</td>\n",
       "      <td>91</td>\n",
       "      <td>machine learning, comparison, supervised learn...</td>\n",
       "      <td>10624.0</td>\n",
       "      <td>Introduction\\nThe term self-supervised learnin...</td>\n",
       "      <td>90.0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>111</td>\n",
       "      <td>Obviously, self-driving cars aren't perfect, s...</td>\n",
       "      <td>100</td>\n",
       "      <td>philosophy, ethics, autonomous vehicles, decis...</td>\n",
       "      <td>1790.0</td>\n",
       "      <td>\\nHow could self-driving cars make ethical dec...</td>\n",
       "      <td>76.0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>14224</td>\n",
       "      <td>If the original purpose for developing AI was ...</td>\n",
       "      <td>69</td>\n",
       "      <td>philosophy, social, explainable ai</td>\n",
       "      <td>14247.0</td>\n",
       "      <td>As argued by Selvaraju et al., there are three...</td>\n",
       "      <td>75.0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1479</td>\n",
       "      <td>Do scientists or research experts know from th...</td>\n",
       "      <td>94</td>\n",
       "      <td>neural networks, deep learning, convolutional ...</td>\n",
       "      <td>4044.0</td>\n",
       "      <td>There are many approaches that aim to make a t...</td>\n",
       "      <td>69.0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5')\"\n",
       "              title=\"Convert this dataframe to an interactive table.\"\n",
       "              style=\"display:none;\">\n",
       "        \n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "       width=\"24px\">\n",
       "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
       "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
       "  </svg>\n",
       "      </button>\n",
       "      \n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      flex-wrap:wrap;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "      <script>\n",
       "        const buttonEl =\n",
       "          document.querySelector('#df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5 button.colab-df-convert');\n",
       "        buttonEl.style.display =\n",
       "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "        async function convertToInteractive(key) {\n",
       "          const element = document.querySelector('#df-8ac2298f-ac6d-46f5-aa1d-41dec7fe27b5');\n",
       "          const dataTable =\n",
       "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                     [key], {});\n",
       "          if (!dataTable) return;\n",
       "\n",
       "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "            + ' to learn more about interactive tables.';\n",
       "          element.innerHTML = '';\n",
       "          dataTable['output_type'] = 'display_data';\n",
       "          await google.colab.output.renderOutput(dataTable, element);\n",
       "          const docLink = document.createElement('div');\n",
       "          docLink.innerHTML = docLinkHtml;\n",
       "          element.appendChild(docLink);\n",
       "        }\n",
       "      </script>\n",
       "    </div>\n",
       "  </div>\n",
       "  "
      ],
      "text/plain": [
       "    Id_q                                           Question  QuestionScore  \\\n",
       "0   1768  In Portal 2 we see that AI's can be \"killed\" b...            175   \n",
       "1  10623  What is self-supervised learning in machine le...             91   \n",
       "2    111  Obviously, self-driving cars aren't perfect, s...            100   \n",
       "3  14224  If the original purpose for developing AI was ...             69   \n",
       "4   1479  Do scientists or research experts know from th...             94   \n",
       "\n",
       "                                        QuestionTags     Id_a  \\\n",
       "0  philosophy, decision theory, mythology of ai, ...   1769.0   \n",
       "1  machine learning, comparison, supervised learn...  10624.0   \n",
       "2  philosophy, ethics, autonomous vehicles, decis...   1790.0   \n",
       "3                 philosophy, social, explainable ai  14247.0   \n",
       "4  neural networks, deep learning, convolutional ...   4044.0   \n",
       "\n",
       "                                              Answer  AnswerScore  \\\n",
       "0  This classic problem exhibits a basic misunder...        146.0   \n",
       "1  Introduction\\nThe term self-supervised learnin...         90.0   \n",
       "2  \\nHow could self-driving cars make ethical dec...         76.0   \n",
       "3  As argued by Selvaraju et al., there are three...         75.0   \n",
       "4  There are many approaches that aim to make a t...         69.0   \n",
       "\n",
       "   AcceptedAnswerFlag  \n",
       "0                True  \n",
       "1                True  \n",
       "2                True  \n",
       "3                True  \n",
       "4                True  "
      ]
     },
     "execution_count": 221,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "questions = df[df.PostTypeId == 1]\n",
    "answers = df[df.PostTypeId == 2]\n",
    "\n",
    "df = pd.merge(\n",
    "    questions,\n",
    "    answers[\n",
    "        [\n",
    "            \"Id\",\n",
    "            \"CreationDate\",\n",
    "            \"Score\",\n",
    "            \"ViewCount\",\n",
    "            \"CommentCount\",\n",
    "            \"ContentLicense\",\n",
    "            \"TagsClean\",\n",
    "            \"BodyClean\",\n",
    "            \"ParentId\",\n",
    "        ]\n",
    "    ],\n",
    "    left_on=\"Id\",\n",
    "    right_on=\"ParentId\",\n",
    "    suffixes=(\"_q\", \"_a\"),\n",
    "    how=\"left\",\n",
    ")\n",
    "\n",
    "df[\"AcceptedAnswerFlag\"] = df.apply(lambda row: row[\"Id_a\"] == row[\"AcceptedAnswerId\"], axis=1)\n",
    "\n",
    "df = df.rename(\n",
    "    columns={\n",
    "        \"BodyClean_q\": \"Question\",\n",
    "        \"Score_q\": \"QuestionScore\",\n",
    "        \"TagsClean_q\": \"QuestionTags\",\n",
    "        \"BodyClean_a\": \"Answer\",\n",
    "        \"Score_a\": \"AnswerScore\",\n",
    "        \"ContentLicense_q\": \"QuestionContentLicense\",\n",
    "        \"ContentLicense_a\": \"AnswerContentLicense\",\n",
    "        \"CreationDate_q\": \"CreationDate\",\n",
    "    }\n",
    ")\n",
    "\n",
    "## Set the number of results to a lower number to only return top N rated Answers.\n",
    "number_of_results = 25\n",
    "df = (\n",
    "    df.sort_values(by=[\"AcceptedAnswerFlag\", \"AnswerScore\"], ascending=[False, False])\n",
    "    .groupby(\"Question\")\n",
    "    .head(number_of_results)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "df[[\"Id_q\", \"Question\", \"QuestionScore\", \"QuestionTags\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 645
    },
    "id": "eds1K8WL9QPo",
    "outputId": "bc526503-d6dd-428f-fa98-ad419d26a7dc"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "  <div id=\"df-16d171db-e359-46f3-a969-510a35cee78f\">\n",
       "    <div class=\"colab-df-container\">\n",
       "      <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id_q</th>\n",
       "      <th>Question</th>\n",
       "      <th>ParentId_a</th>\n",
       "      <th>AcceptedAnswerId</th>\n",
       "      <th>Id_a</th>\n",
       "      <th>Answer</th>\n",
       "      <th>AnswerScore</th>\n",
       "      <th>AcceptedAnswerFlag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15744.0</td>\n",
       "      <td>I think this is a fairly common misconception ...</td>\n",
       "      <td>62.0</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3662</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15753.0</td>\n",
       "      <td>I think your premise is flawed.\\nYou seem to a...</td>\n",
       "      <td>19.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3713</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15747.0</td>\n",
       "      <td>TL;DR: The subtleties of infinity are made app...</td>\n",
       "      <td>12.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3788</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15756.0</td>\n",
       "      <td>In Haskell, you can type:\\nprint [1..]\\nand it...</td>\n",
       "      <td>9.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3821</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15758.0</td>\n",
       "      <td>I believe humans can be said to understand inf...</td>\n",
       "      <td>8.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3882</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15762.0</td>\n",
       "      <td>(There's a summary at the bottom for those who...</td>\n",
       "      <td>7.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4389</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15783.0</td>\n",
       "      <td>Then premise assumes that humans \"understand\" ...</td>\n",
       "      <td>4.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4849</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15740.0</td>\n",
       "      <td>By adding some rules for infinity in arithmeti...</td>\n",
       "      <td>3.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4850</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15803.0</td>\n",
       "      <td>I think the concept that is missing in the dis...</td>\n",
       "      <td>3.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5763</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15768.0</td>\n",
       "      <td>Computers don't understand \"infinity\" or even ...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5764</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15810.0</td>\n",
       "      <td>The Questions That Computers Can Never Answer ...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5765</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15943.0</td>\n",
       "      <td>John Doucette's answer covers my thoughts on t...</td>\n",
       "      <td>2.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7462</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15779.0</td>\n",
       "      <td>I would think that a computer couldn’t underst...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7463</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15787.0</td>\n",
       "      <td>The \"concept\" of infinity is 1 thing to unders...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7464</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15801.0</td>\n",
       "      <td>Just food for thought: how about if we try to ...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7465</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15930.0</td>\n",
       "      <td>Its arguable if we humans understand infinity....</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7466</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15934.0</td>\n",
       "      <td>Well -- just to touch on the question of peopl...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7467</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15938.0</td>\n",
       "      <td>Humans certainly don't understand infinity. Cu...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9481</th>\n",
       "      <td>15730</td>\n",
       "      <td>As a human being, we can think infinity. In pr...</td>\n",
       "      <td>15730.0</td>\n",
       "      <td>15744</td>\n",
       "      <td>15931.0</td>\n",
       "      <td>I think the property humans have which compute...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-16d171db-e359-46f3-a969-510a35cee78f')\"\n",
       "              title=\"Convert this dataframe to an interactive table.\"\n",
       "              style=\"display:none;\">\n",
       "        \n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "       width=\"24px\">\n",
       "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
       "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
       "  </svg>\n",
       "      </button>\n",
       "      \n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      flex-wrap:wrap;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "      <script>\n",
       "        const buttonEl =\n",
       "          document.querySelector('#df-16d171db-e359-46f3-a969-510a35cee78f button.colab-df-convert');\n",
       "        buttonEl.style.display =\n",
       "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "        async function convertToInteractive(key) {\n",
       "          const element = document.querySelector('#df-16d171db-e359-46f3-a969-510a35cee78f');\n",
       "          const dataTable =\n",
       "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                     [key], {});\n",
       "          if (!dataTable) return;\n",
       "\n",
       "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "            + ' to learn more about interactive tables.';\n",
       "          element.innerHTML = '';\n",
       "          dataTable['output_type'] = 'display_data';\n",
       "          await google.colab.output.renderOutput(dataTable, element);\n",
       "          const docLink = document.createElement('div');\n",
       "          docLink.innerHTML = docLinkHtml;\n",
       "          element.appendChild(docLink);\n",
       "        }\n",
       "      </script>\n",
       "    </div>\n",
       "  </div>\n",
       "  "
      ],
      "text/plain": [
       "       Id_q                                           Question  ParentId_a  \\\n",
       "7     15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "3662  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "3713  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "3788  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "3821  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "3882  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "4389  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "4849  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "4850  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "5763  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "5764  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "5765  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "7462  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "7463  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "7464  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "7465  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "7466  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "7467  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "9481  15730  As a human being, we can think infinity. In pr...     15730.0   \n",
       "\n",
       "      AcceptedAnswerId     Id_a  \\\n",
       "7                15744  15744.0   \n",
       "3662             15744  15753.0   \n",
       "3713             15744  15747.0   \n",
       "3788             15744  15756.0   \n",
       "3821             15744  15758.0   \n",
       "3882             15744  15762.0   \n",
       "4389             15744  15783.0   \n",
       "4849             15744  15740.0   \n",
       "4850             15744  15803.0   \n",
       "5763             15744  15768.0   \n",
       "5764             15744  15810.0   \n",
       "5765             15744  15943.0   \n",
       "7462             15744  15779.0   \n",
       "7463             15744  15787.0   \n",
       "7464             15744  15801.0   \n",
       "7465             15744  15930.0   \n",
       "7466             15744  15934.0   \n",
       "7467             15744  15938.0   \n",
       "9481             15744  15931.0   \n",
       "\n",
       "                                                 Answer  AnswerScore  \\\n",
       "7     I think this is a fairly common misconception ...         62.0   \n",
       "3662  I think your premise is flawed.\\nYou seem to a...         19.0   \n",
       "3713  TL;DR: The subtleties of infinity are made app...         12.0   \n",
       "3788  In Haskell, you can type:\\nprint [1..]\\nand it...          9.0   \n",
       "3821  I believe humans can be said to understand inf...          8.0   \n",
       "3882  (There's a summary at the bottom for those who...          7.0   \n",
       "4389  Then premise assumes that humans \"understand\" ...          4.0   \n",
       "4849  By adding some rules for infinity in arithmeti...          3.0   \n",
       "4850  I think the concept that is missing in the dis...          3.0   \n",
       "5763  Computers don't understand \"infinity\" or even ...          2.0   \n",
       "5764  The Questions That Computers Can Never Answer ...          2.0   \n",
       "5765  John Doucette's answer covers my thoughts on t...          2.0   \n",
       "7462  I would think that a computer couldn’t underst...          1.0   \n",
       "7463  The \"concept\" of infinity is 1 thing to unders...          1.0   \n",
       "7464  Just food for thought: how about if we try to ...          1.0   \n",
       "7465  Its arguable if we humans understand infinity....          1.0   \n",
       "7466  Well -- just to touch on the question of peopl...          1.0   \n",
       "7467  Humans certainly don't understand infinity. Cu...          1.0   \n",
       "9481  I think the property humans have which compute...          0.0   \n",
       "\n",
       "      AcceptedAnswerFlag  \n",
       "7                   True  \n",
       "3662               False  \n",
       "3713               False  \n",
       "3788               False  \n",
       "3821               False  \n",
       "3882               False  \n",
       "4389               False  \n",
       "4849               False  \n",
       "4850               False  \n",
       "5763               False  \n",
       "5764               False  \n",
       "5765               False  \n",
       "7462               False  \n",
       "7463               False  \n",
       "7464               False  \n",
       "7465               False  \n",
       "7466               False  \n",
       "7467               False  \n",
       "9481               False  "
      ]
     },
     "execution_count": 222,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the question/answer pairing on the most frequent question id:\n",
    "# show only the columns needed to compare each answer with its question.\n",
    "testing_id = df.Id_q.mode()[0]\n",
    "df.loc[\n",
    "    (df.Id_q == testing_id) | (df.ParentId_a == testing_id),\n",
    "    [\"Id_q\", \"Question\", \"ParentId_a\", \"AcceptedAnswerId\", \"Id_a\", \"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"],\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gXgpXEO7DCbj"
   },
   "source": [
    "# Create JSONL version of Dataframe\n",
    "This groups the dataframe by its question-level columns and builds a nested list of answers for each group. The result is a list of JSON objects, one per question; each object has an `Answers` key holding a list of dictionaries, one per answer to that question."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "OBR58MSRzAMP",
    "outputId": "c7da1e6c-3a97-465d-c9ba-7e055cb0d751"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"Title\": \"1 hidden layer with 1000 neurons vs. 10 hidden layers with 100 neurons\",\n",
      "    \"Question\": \"These types of questions may be problem-dependent, but I have tried to find research that addresses the question whether the number of hidden layers and their size (number of neurons in each layer) really matter or not.\\nSo my question is, does it really matter if we for example have 1 large hidden layer of 1000 neurons vs. 10 hidden layers with 100 neurons each?\\n\",\n",
      "    \"QuestionScore\": 16,\n",
      "    \"QuestionTags\": \"neural networks\",\n",
      "    \"QuestionContentLicense\": \"CC BY-SA 3.0\",\n",
      "    \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n",
      "    \"CreationDate\": \"2017-05-04T13:06:37.990\",\n",
      "    \"Answers\": [\n",
      "        {\n",
      "            \"Answer\": \"Basically, having multiple layers (aka a deep network) makes your network more eager to recognize certain aspects of input data. For example, if you have the details of a house (size, lawn size, location etc.) as input and want to predict the price. The first layer may predict:\\n\\nBig area, higher price\\nSmall amount of bedrooms, lower price\\n\\nThe second layer might conclude:\\n\\nBig area + small amount of bedrooms = large bedrooms = +- effect\\n\\nYes, one layer can also 'detect' the stats, however it will require more neurons as it cannot rely on other neurons to do 'parts' of the total calculation required to detect that stat.\\nCheck out this answer\\n\",\n",
      "            \"AnswerScore\": 13.0,\n",
      "            \"AcceptedAnswerFlag\": true\n",
      "        },\n",
      "        {\n",
      "            \"Answer\": \"There are so many aspects.\\n1. Training:\\nTraining deep nets is a hard job due to the vanishing (rearly exploding) gradient problem. So building a 10x100 neural-net is not recommended.\\n2. Trained network performance:\\n\\nInformation loss:\\nThe classical usage of neural nets is the classification problem. Which means we want to get some well defined information from the data. (Ex. Is there a face in the picture or not.)\\nSo usually classification problem has a lot of input, and few output, whats more the size of the hidden layers are descend from input to output.\\nHowever, we loss information using less neurons layer by layer. (Ie. We cannot reproduce the original image based on the fact that is there a face on it or no.) So you must know that you loss information using 100 neurons if the size of the input is (lets say) 1000.\\nInformation complexity: However the deeper nets (as Tomas W mentioned) can fetch more complex information from the input data. Inspite of this its not recommended to use 10 fully connected layers. Its recommended to use convolutional/relu/maxpooling or other type of layers. Firest layers can compress the some essential part of the inputs. (Ex is there any line in a specific part of the picture) Second layers can say: There is a specific shape in this place in the picture. Etc etc.\\n\\nSo deeper nets are more \\\"clever\\\" but 10x100 net structure is a good choice.\\n\",\n",
      "            \"AnswerScore\": 4.0,\n",
      "            \"AcceptedAnswerFlag\": false\n",
      "        },\n",
      "        {\n",
      "            \"Answer\": \"If the problem you are solving is linearly separable, one layer of 1000 neurons can do better job than 10 layers with each of 100 neurons.\\nIf the problem is non linear and not convex, then you need deep neural nets. \\n\",\n",
      "            \"AnswerScore\": 1.0,\n",
      "            \"AcceptedAnswerFlag\": false\n",
      "        },\n",
      "        {\n",
      "            \"Answer\": \"\\nI think you have a confusion in the basics of the neural networks.\\n  Every layer has a separate activation function and input/output\\n  connection weights.\\n\\nThe output of the first hidden layer will be multiplied by a weight, processed by an activation function in the next layer and so on.\\nSingle layer neural networks are very limited for simple tasks, deeper NN can perform far better than a single layer. \\nHowever, do not use more than layer if your application is not fairly complex. In conclusion, 100 neurons layer does not mean better neural network than 10 layers x 10 neurons but 10 layers are something imaginary unless you are doing deep learning. start with 10 neurons in the hidden layer and try to add layers or add more neurons to the same layer to see the difference. learning with more layers will be easier but more training time is required.\\n\",\n",
      "            \"AnswerScore\": 0.0,\n",
      "            \"AcceptedAnswerFlag\": false\n",
      "        }\n",
      "    ]\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Collapse the flat question/answer rows into one record per question whose\n",
    "# \"Answers\" field holds the list of that question's answers, serialized as JSON.\n",
    "j = (\n",
    "    df.groupby(\n",
    "        by=[\n",
    "            \"Title\",\n",
    "            \"Question\",\n",
    "            \"QuestionScore\",\n",
    "            \"QuestionTags\",\n",
    "            \"QuestionContentLicense\",\n",
    "            \"DataSource\",\n",
    "            \"CreationDate\",\n",
    "        ]\n",
    "    )\n",
    "    .apply(lambda answers: answers[[\"Answer\", \"AnswerScore\", \"AcceptedAnswerFlag\"]].to_dict(orient=\"records\"))\n",
    "    .reset_index()\n",
    "    .rename(columns={0: \"Answers\"})\n",
    "    .to_json(orient=\"records\")\n",
    ")\n",
    "\n",
    "# Parse the JSON string back into plain Python lists and dictionaries.\n",
    "data = json.loads(j)\n",
    "\n",
    "# Print the first question that has at least four answers as a sample.\n",
    "for post in data:\n",
    "    if len(post.get(\"Answers\")) < 4:\n",
    "        continue\n",
    "    print(json.dumps(post, indent=4))\n",
    "    break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "PlNjrpXaDm1_"
   },
   "source": [
    "# Save file\n",
    "\n",
    "Files can be saved as JSON, JSONL, CSV, or Parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "CU0gWRGQDqIs",
    "outputId": "9646e475-cedd-46f1-f9b8-7eb1fbc703c7"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\n"
     ]
    }
   ],
   "source": [
    "# Base name (no extension) used for the output files written by this notebook;\n",
    "# dataset_name is defined outside this cell.\n",
    "file_name = dataset_name\n",
    "\n",
    "\n",
    "def save_data(\n",
    "    data: \"list | pd.DataFrame\",\n",
    "    file_name: str,\n",
    "    file_type: str = \"jsonl\",\n",
    "    output_dir: str = \"/content\",\n",
    "):\n",
    "    \"\"\"\n",
    "    Save data to a file as CSV, JSON, JSONL, or Parquet\n",
    "\n",
    "    JSON and JSONL require `data` to be a list of dictionaries; CSV and Parquet\n",
    "    require a Pandas DataFrame. For any other combination (including an\n",
    "    unsupported `file_type`) a message is printed and nothing is written.\n",
    "\n",
    "    Parameters:\n",
    "    data (list or pd.DataFrame): records to save\n",
    "    file_name (str): name of file (no extension)\n",
    "    file_type (str): \"csv\", \"json\", \"jsonl\" or \"parquet\" (case-insensitive)\n",
    "    output_dir (str): directory the file is written to (defaults to Colab's /content)\n",
    "    \"\"\"\n",
    "    # The old default was the list of valid choices, which crashed on .lower();\n",
    "    # any non-string value now falls through to the message in the else branch.\n",
    "    file_type = file_type.lower() if isinstance(file_type, str) else \"\"\n",
    "    path = f\"{output_dir}/{file_name}.{file_type}\"\n",
    "\n",
    "    if file_type == \"csv\" and isinstance(data, pd.DataFrame):\n",
    "        data.to_csv(path, index=False)\n",
    "\n",
    "    elif file_type == \"json\" and isinstance(data, list):\n",
    "        # Context manager guarantees the handle is flushed and closed.\n",
    "        with open(path, \"w\") as out_file:\n",
    "            print(json.dumps(data, indent=4), file=out_file)\n",
    "\n",
    "    elif file_type == \"jsonl\" and isinstance(data, list):\n",
    "        # Open once instead of once per record. Append mode is kept, so calling\n",
    "        # this again with the same file_name adds to the existing file.\n",
    "        with open(path, \"a\") as out_file:\n",
    "            for item in data:\n",
    "                print(json.dumps(item), file=out_file)\n",
    "\n",
    "    elif file_type == \"parquet\" and isinstance(data, pd.DataFrame):\n",
    "        data.to_parquet(path, index=False)\n",
    "\n",
    "    else:\n",
    "        print(\"Data should be either of List type for JSON and JSONL, or Pandas Dataframes for CSV and Parquet\")\n",
    "\n",
    "\n",
    "# save_data(data=data, file_name=file_name, file_type='jsonl')\n",
    "# save_data(data=df, file_name=file_name, file_type='parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BdN3hKxtgH7f"
   },
   "source": [
    "# Open-Assistant Data Schema\n",
    "\n",
    "Test converting the data into the Open-Assistant data schema:\n",
    "\n",
    "https://github.com/LAION-AI/Open-Assistant/blob/main/docs/data_schemas.md"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "n8ubYQxegNSY"
   },
   "outputs": [],
   "source": [
    "from typing import TypeVar, List, Dict, Any, Literal\n",
    "from json import JSONEncoder\n",
    "\n",
    "# Retained for backward compatibility. The annotations below use a plain forward\n",
    "# reference instead, because a TypeVar is unbound inside a non-generic class.\n",
    "T = TypeVar(\"T\", bound=\"ConversationTreeNode\")\n",
    "\n",
    "\n",
    "class ConversationTreeNode:\n",
    "    \"\"\"A single message in a conversation tree, with its replies stored as child nodes.\"\"\"\n",
    "\n",
    "    text: str  # The text of the node\n",
    "    role: Literal[\"prompter\", \"assistant\"]  # Whether the node is a user prompt/follow-up or an assistant response\n",
    "    # The children of the node (if you have a linear conversation, this will be of length 0 or 1)\n",
    "    children: List[\"ConversationTreeNode\"]\n",
    "    metadata: Dict[str, Any]  # Node metadata\n",
    "\n",
    "    def __init__(\n",
    "        self,\n",
    "        text: str,\n",
    "        role: Literal[\"prompter\", \"assistant\"],\n",
    "        children: List[\"ConversationTreeNode\"],\n",
    "        metadata: Dict[str, Any],\n",
    "    ) -> None:\n",
    "        \"\"\"Store the given values as attributes; `children` is kept by reference, not copied.\"\"\"\n",
    "        self.text = text\n",
    "        self.role = role\n",
    "        self.children = children\n",
    "        self.metadata = metadata\n",
    "\n",
    "\n",
    "class ConversationTree:\n",
    "    \"\"\"Pairs the root node of a conversation with metadata about the whole tree.\"\"\"\n",
    "\n",
    "    # The node containing the initial prompt\n",
    "    root: ConversationTreeNode\n",
    "    # Tree metadata, different from root node metadata.\n",
    "    metadata: Dict[str, Any]\n",
    "\n",
    "    def __init__(self, root: ConversationTreeNode, metadata: Dict[str, Any]) -> None:\n",
    "        # root is assigned first so the attribute order in __dict__ stays root, metadata.\n",
    "        self.root, self.metadata = root, metadata\n",
    "\n",
    "\n",
    "# JSONEncoder subclass that serializes objects through their attribute dictionary\n",
    "class TreeEncoder(JSONEncoder):\n",
    "    \"\"\"JSON encoder that serializes otherwise unsupported objects via their instance attributes.\"\"\"\n",
    "\n",
    "    def default(self, o):\n",
    "        \"\"\"Return the attribute dictionary of `o`, deferring to JSONEncoder when it has none.\"\"\"\n",
    "        try:\n",
    "            return vars(o)\n",
    "        except TypeError:\n",
    "            # No __dict__ (e.g. a set): let the base class raise its standard TypeError\n",
    "            # instead of leaking an AttributeError from the encoder.\n",
    "            return super().default(o)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "eE0fkytExSGl",
    "outputId": "594632d6-f98c-49b8-af86-25f7f5e2ce06"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    \"root\": {\n",
      "        \"text\": \"Science Fiction has frequently shown AI to be a threat to the very existence of mankind. AI systems have often been the antagonists in many works of fiction, from 2001: A Space Odyssey through to The Terminator and beyond.\\nThe Media seems to buy into this trope as well.  And in recent years we have had people like Elon Musk warn us of the dangers of an impending AI revolution, stating that AI is more dangerous than nukes.\\nAnd, apparently, experts think that we will be seeing this AI revolution in the next 100 years.\\nHowever, from my (albeit limited) study of AI, I get the impression that they are all wrong. I am going to outline my understanding below, please correct me if I am wrong:\\n\\nFirstly, all of these things seem to be confusing Artificial Intelligence with Artificial Consciousness.  AI is essentially a system to make intelligent decisions, whereas AC is more like the \\\"self-aware\\\" systems that are shown in science fiction.\\n\\nNot AI itself, but intelligence and intelligent decision-making algorithms are something we've been working with and enhancing since before computers have been around.  Moving this over to an artificial framework is fairly easy.  However, consciousness is still something we are learning about.  
My guess is we won't be able to re-create something artificially if we barely understand how it works in the real world.\\n\\nSo, my conclusion is that no AI system will be able to learn enough to start thinking for itself, and that all our warnings of AI are completely unjustified.\\n\\nThe real danger comes from AC, which we are a long, long way from realizing because we are still a long way off from defining exactly what consciousness is, let alone understanding it.\\n\\n\\n\\nSo, my question is, assuming that my understanding is correct, are any efforts are being made by companies or organizations that work with AI to correct these popular misunderstandings in sci-fi, the media, and/or the public?\\nOr are the proponents of AI ambivalent towards this public fear-mongering?\\nI understand that the fear mongering is going to remain popular for some time, as bad news sells better than good news. I am just wondering if the general attitude from AI organizations is to ignore this popular misconception, or whether a concerted effort is being made to fight against these AI myths (but unfortunately nobody in the media is listening or cares).\\n\",\n",
      "        \"role\": \"prompter\",\n",
      "        \"children\": [\n",
      "            {\n",
      "                \"text\": \"Nothing.  \\nIts in almost everyone's favor for it to stay that way financially. Having non-technical individuals associate AI with terminators makes a perception that the field has greater capabilities than it does $\\\\rightarrow$ this leads to grants, funding, etc...  \\nIs there any negative? Yes. Misconceptions always have drawbacks. We see the creation of dumb ethics boards and such cough cough Elon Musk.\\nBut if history has anything to say about this, as the field gains popularity (which it is dnagerously quick), information will spread by definition, and eventually misconceptions will be laid to rest.\\nNote that this answer is biased and based upon my own opinions\\n\",\n",
      "                \"role\": \"assistant\",\n",
      "                \"children\": [],\n",
      "                \"metadata\": {\n",
      "                    \"AnswerScore\": 2.0,\n",
      "                    \"AcceptedAnswerFlag\": true\n",
      "                }\n",
      "            }\n",
      "        ],\n",
      "        \"metadata\": {\n",
      "            \"QuestionScore\": 5,\n",
      "            \"QuestionTags\": \"social, artificial consciousness\"\n",
      "        }\n",
      "    },\n",
      "    \"metadata\": {\n",
      "        \"Title\": \"\\\"AI will kill us all! The machines will rise up!\\\" - what is being done to dispel such myths?\",\n",
      "        \"QuestionContentLicense\": \"CC BY-SA 4.0\",\n",
      "        \"DataSource\": \"https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/ai.stackexchange.com.7z&file=Posts.xml\",\n",
      "        \"CreationDate\": \"2019-10-16T13:57:37.143\"\n",
      "    }\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "conversation_forest = []\n",
    "\n",
    "# Only the keys of these maps are used below: they select which fields of a record\n",
    "# become tree-level, question-node, and answer-node metadata respectively.\n",
    "tree_metadata_map = {\"Title\": str, \"QuestionContentLicense\": str, \"DataSource\": str, \"CreationDate\": str}\n",
    "question_metadata_map = {\"QuestionScore\": int, \"QuestionTags\": str}\n",
    "answer_metadata_map = {\"AnswerScore\": int, \"AcceptedAnswerFlag\": bool}\n",
    "\n",
    "\n",
    "for item in data:\n",
    "    # The question text is the root \"prompter\" node of the tree.\n",
    "    prompt = item.get(\"Question\")\n",
    "    metadata = {k: v for k, v in item.items() if k in question_metadata_map}\n",
    "    root = ConversationTreeNode(text=prompt, role=\"prompter\", children=[], metadata=metadata)\n",
    "\n",
    "    # Every answer becomes a childless \"assistant\" node directly under the question.\n",
    "    for answer in item.get(\"Answers\"):\n",
    "        response = answer.get(\"Answer\")\n",
    "        metadata = {k: v for k, v in answer.items() if k in answer_metadata_map}\n",
    "        child = ConversationTreeNode(text=response, role=\"assistant\", children=[], metadata=metadata)\n",
    "        root.children.append(child)\n",
    "\n",
    "    metadata = {k: v for k, v in item.items() if k in tree_metadata_map}\n",
    "    conversation_tree = ConversationTree(root=root, metadata=metadata)\n",
    "    conversation_forest.append(conversation_tree)\n",
    "\n",
    "\n",
    "# Round-trip through TreeEncoder to turn the tree objects into plain dicts and lists.\n",
    "conversation_forest_json = [\n",
    "    json.loads(TreeEncoder().encode(conversation_tree)) for conversation_tree in conversation_forest\n",
    "]\n",
    "\n",
    "\n",
    "# print(json.dumps(conversation_forest_json[0], indent=4))\n",
    "\n",
    "\n",
    "# Write through a context manager so the file is flushed and closed; previously\n",
    "# the handle passed to print() was never closed.\n",
    "with open(f\"/content/{file_name}.json\", \"w\") as forest_file:\n",
    "    print(json.dumps(conversation_forest_json, indent=4), file=forest_file)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]"
  },
  "vscode": {
   "interpreter": {
    "hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
