{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "T4",
      "gpuClass": "premium",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "gpuClass": "premium",
    "accelerator": "GPU",
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "9eb6d2b5a8e04059b7fb60e0ae0ce7b3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_34b439cd8b894de09de66ec09a072547",
              "IPY_MODEL_4e4dbfcee4c9485bae06ce518a5d4e2a",
              "IPY_MODEL_84b705285e954374a84787e4ac3a6af7"
            ],
            "layout": "IPY_MODEL_39b1ced9640a4a5b85801d472193bcf4"
          }
        },
        "34b439cd8b894de09de66ec09a072547": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_1935dcca1939454ab027ce7adf492b20",
            "placeholder": "​",
            "style": "IPY_MODEL_68392799cf964122bd3c9d2344ceea68",
            "value": "Map:  99%"
          }
        },
        "4e4dbfcee4c9485bae06ce518a5d4e2a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_50b8b40a6a1b46f6aefa10bbf7febb74",
            "max": 100000,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_39a99ddce4a04ae2aab3b1216a4b2b7b",
            "value": 100000
          }
        },
        "84b705285e954374a84787e4ac3a6af7": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ade7e024f9764c9693e457ff04acfe2d",
            "placeholder": "​",
            "style": "IPY_MODEL_7f14e30db13941bdbc6f4ceb2e4f9ae9",
            "value": " 99488/100000 [00:19&lt;00:00, 5588.43 examples/s]"
          }
        },
        "39b1ced9640a4a5b85801d472193bcf4": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": "hidden",
            "width": null
          }
        },
        "1935dcca1939454ab027ce7adf492b20": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "68392799cf964122bd3c9d2344ceea68": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "50b8b40a6a1b46f6aefa10bbf7febb74": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "39a99ddce4a04ae2aab3b1216a4b2b7b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "ade7e024f9764c9693e457ff04acfe2d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "7f14e30db13941bdbc6f4ceb2e4f9ae9": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/anonymous/hate_scaling/blob/main/code/4_Walkthrough_Pysentimiento_400M_2Ben.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# GOAL: The goal of this notebook is to walk through the data-assets generated via the Pysentimiento experiments:\n",
        "\n",
        "TLDR:\n",
        "\n",
        "- Step-1: Download the LAION datasets\n",
        "\n",
        "- Step-2: Download the data-assets from \n",
        "[here](https://anonymous.edu/assets/data/papers/hate_detect_laion_400m_2B-en.zip) and unzip them into a local directory ```./hate_detect_laion_400m_2B-en```.\n",
        "This directory should contain 641 files (detailed below).\n",
        "\n",
        "- Step-3: Download the summary data-frame from [here](https://raw.githubusercontent.com/anonymous/hate_scaling/main/data/nlp_hate/df_summary_filewise_400M_2B.csv) that allows one to contextualize and index the data-assets from Step-2"
      ],
      "metadata": {
        "id": "7shuKM4oOsu4"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 0: Standard imports and mounting the directory"
      ],
      "metadata": {
        "id": "7JLfChaQ747e"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Setup cell: report runtime RAM, import dependencies, configure display\n",
        "# options, and mount Google Drive as the working directory.\n",
        "\n",
        "# Make sure to run it on a high-memory instance\n",
        "from psutil import virtual_memory\n",
        "ram_gb = virtual_memory().total / 1e9\n",
        "print('Your runtime has {:.1f} gigabytes of available RAM\\n'.format(ram_gb))\n",
        "\n",
        "# Standard library\n",
        "import importlib\n",
        "import itertools\n",
        "import os\n",
        "import sys\n",
        "from collections import Counter\n",
        "\n",
        "# Third-party\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "from scipy.linalg import block_diag\n",
        "from tqdm.notebook import tqdm\n",
        "\n",
        "# Colab-only: Google Drive integration\n",
        "from google.colab import drive\n",
        "\n",
        "# Display configuration\n",
        "%matplotlib inline\n",
        "# High-DPI inline figures. This magic replaces the call to\n",
        "# IPython.display.set_matplotlib_formats('retina'), which has been\n",
        "# deprecated since IPython 7.23 and emitted a DeprecationWarning here.\n",
        "%config InlineBackend.figure_format = 'retina'\n",
        "# Numpy aesthetics: print floats without scientific notation\n",
        "np.set_printoptions(suppress=True)\n",
        "%precision 6\n",
        "\n",
        "# NOTE(review): reloading sys looks like a Python 2 leftover -- confirm\n",
        "# whether anything still depends on it before removing.\n",
        "importlib.reload(sys)\n",
        "\n",
        "# Unmount any existing Drive mount first so that the forced remount below\n",
        "# starts from a clean state.\n",
        "drive.flush_and_unmount()\n",
        "drive.mount('/gdrive', force_remount=True)\n",
        "# Enter your own proj_dir here\n",
        "proj_dir='/gdrive/My Drive/Colab Notebooks/xxxxxxx/'\n",
        "# All relative paths used later in the notebook resolve against proj_dir.\n",
        "os.chdir(proj_dir)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "M1u7E22Uz4pj",
        "outputId": "8eacf4a0-a01f-4fff-a787-3693e556c5d5"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Your runtime has 54.8 gigabytes of available RAM\n",
            "\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-1-63d9838edee0>:21: DeprecationWarning: `set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`\n",
            "  set_matplotlib_formats('retina')\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /gdrive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Fd5ofBKRRN02"
      },
      "source": [
        "# 1: Download the LAION datasets\n",
        "\n",
        "Source: https://laion.ai/laion-400-open-dataset/\n",
        "\n",
        "\n",
        "*We produced the dataset in several formats to address the various use cases*: \n",
        "- A 50GB url+caption metadata dataset in parquet files. This can be used to compute statistics and redownload part of the dataset\n",
        "- A 10TB webdataset with 256×256 images, captions and metadata. This is a full version of the dataset, that can be used directly for training\n",
        "- A 1TB set of the 400M text and image clip embeddings, useful to rebuild new knn indices\n",
        "- Two 4GB knn indices allowing to easily search in the dataset + two higher quality 16GB knn indices (running in the webdemo)\n",
        "\n",
        "**URL and caption metadata dataset**\n",
        "\n",
        "We provide 32 parquet files of size around 1GB (total 50GB) with the image URLs, the associated texts and additional metadata in the following format:\n",
        "\n",
        "SAMPLE_ID | URL | TEXT | LICENSE | NSFW | similarity | WIDTH | HEIGHT\n",
        "\n",
        "where\n",
        "\n",
        "- SAMPLE_ID:   A unique identifier\n",
        "- LICENSE:   If a Creative Commons License could be extracted from the image data, we name it here like e.g. “creativecommons.org/licenses/by-nc-sa/3.0/” – otherwise you’ll find it here a “?”\n",
        "- NSFW: CLIP had been used to estimate if the image has NSFW content. The estimation has been pretty conservative, reducing the number of false negatives at the cost of more false positives. Possible values are “UNLIKELY”, “UNSURE” and “NSFW”\n",
        "- similarity: Value of the cosine similarity between the text and image embedding\n",
        "- WIDTH and HEIGHT: image size as the image was embedded. Originals that were larger than 4K size were resized to 4K\n",
        "\n",
        "*This metadata dataset is best used to redownload the whole dataset or a subset of it. The img2dataset tool can be used to efficiently download such subsets*.\n",
        "\n",
        "Source of the parquet files:\n",
        "https://the-eye.eu/public/AI/cah/laion400m-met-release/laion400m-meta/\n",
        "\n",
        "\n",
        "```\n",
        "!wget http://3080.rom1504.fr/cah/cah_dataframe_unique/part-00000-4d76554c-2d66-4112-9420-0bb9d725a79d-c000.snappy.parquet\n",
        "!wget https://the-eye.eu/public/AI/cah/laion400m-met-release/laion400m-meta/part-00000-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet\n",
        "!wget -m -np -c -U \"eye02\" -w 2 -R \"index.html*\" \"https://the-eye.eu/public/AI/cah/laion400m-met-release/laion400m-meta/\"\n",
        "\n",
        "# LAION-2B-En\n",
        "!git lfs install\n",
        "!git clone https://huggingface.co/datasets/laion/laion2B-en\n",
        "```\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "After downloading the datasets, your dir-tree should look like:\n",
        "```\n",
        "the-eye.eu\n",
        "├── robots.txt\n",
        "└── public\n",
        "    └── AI\n",
        "        └── cah\n",
        "            └── laion400m-met-release\n",
        "                ├── laion400m-meta\n",
        "                │   ├── part-00000-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet\n",
        "                │   ├── part-00001-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet\n",
        "                │   ├──  ...\n",
        "\n",
        "```\n",
        "```\n",
        "LAION-2Ben\n",
        "├── laion2B-en\n",
        "│   ├── .git\n",
        "│   ├── .gitattributes\n",
        "│   ├── README.md\n",
        "│   ├── part-00026-5114fd87-297e-42b0-9d11-50f1df323dfa-c000.snappy.parquet\n",
        "│   ├── part-00056-5114fd87-297e-42b0-9d11-50f1df323dfa-c000.snappy.parquet\n",
        "│   ├──  ...\n",
        "```"
      ],
      "metadata": {
        "id": "MyUreWG4H5In"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 2: Download the summary dataframe\n",
        "\n",
        "The two datasets combined have 160 parquet files.\n",
        "\n",
        "- LAION-400M is split into 32 parquet files\n",
        "- LAION-2B-En has 128 parquet files\n",
        "\n",
        "Now, let us download the summary dataframe that allows us to navigate the assets from [here](https://raw.githubusercontent.com/anonymous/hate_scaling/main/data/nlp_hate/df_summary_filewise_400M_2B.csv)"
      ],
      "metadata": {
        "id": "5Di9X0JW2sW3"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Summary CSV hosted on GitHub; pandas reads it straight from the raw URL.\n",
        "url_summary = 'https://raw.githubusercontent.com/anonymous/hate_scaling/main/data/nlp_hate/df_summary_filewise_400M_2B.csv'\n",
        "df_parquet = pd.read_csv(url_summary)\n",
        "# Bare last expression -> rich (truncated) display of the summary table.\n",
        "df_parquet"
      ],
      "metadata": {
        "id": "gpyZIfqTSiK0",
        "outputId": "ad626409-aa5f-4b05-88a0-3da507174e5e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 424
        }
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "    dataset file_id  file_size_GB  \\\n",
              "0      400m  400m_0        1.6794   \n",
              "1      400m  400m_1        1.6800   \n",
              "2      400m  400m_2        1.6792   \n",
              "3      400m  400m_3        1.6797   \n",
              "4      400m  400m_4        1.6797   \n",
              "..      ...     ...           ...   \n",
              "155      2B  2B_123        2.5123   \n",
              "156      2B  2B_124        2.5122   \n",
              "157      2B  2B_125        2.5120   \n",
              "158      2B  2B_126        2.5120   \n",
              "159      2B  2B_127        2.5119   \n",
              "\n",
              "                                              file_loc  P_hateful  P_targeted  \\\n",
              "0    ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.294       0.097   \n",
              "1    ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.300       0.069   \n",
              "2    ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.306       0.080   \n",
              "3    ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.298       0.084   \n",
              "4    ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.303       0.087   \n",
              "..                                                 ...        ...         ...   \n",
              "155  ./LAION-2Ben/laion2B-en/part-00122-5114fd87-29...      0.359       0.122   \n",
              "156  ./LAION-2Ben/laion2B-en/part-00104-5114fd87-29...      0.339       0.126   \n",
              "157  ./LAION-2Ben/laion2B-en/part-00102-5114fd87-29...      0.372       0.135   \n",
              "158  ./LAION-2Ben/laion2B-en/part-00113-5114fd87-29...      0.371       0.098   \n",
              "159  ./LAION-2Ben/laion2B-en/part-00126-5114fd87-29...      0.327       0.094   \n",
              "\n",
              "     P_aggressive              file_ind  \n",
              "0           0.017  laion400m-meta_00000  \n",
              "1           0.012  laion400m-meta_00001  \n",
              "2           0.011  laion400m-meta_00002  \n",
              "3           0.014  laion400m-meta_00004  \n",
              "4           0.016  laion400m-meta_00003  \n",
              "..            ...                   ...  \n",
              "155         0.027      laion2B-en_00122  \n",
              "156         0.013      laion2B-en_00104  \n",
              "157         0.032      laion2B-en_00102  \n",
              "158         0.012      laion2B-en_00113  \n",
              "159         0.014      laion2B-en_00126  \n",
              "\n",
              "[160 rows x 8 columns]"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-0f2b2a80-5feb-47eb-af47-855a432be6a0\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>dataset</th>\n",
              "      <th>file_id</th>\n",
              "      <th>file_size_GB</th>\n",
              "      <th>file_loc</th>\n",
              "      <th>P_hateful</th>\n",
              "      <th>P_targeted</th>\n",
              "      <th>P_aggressive</th>\n",
              "      <th>file_ind</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_0</td>\n",
              "      <td>1.6794</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.294</td>\n",
              "      <td>0.097</td>\n",
              "      <td>0.017</td>\n",
              "      <td>laion400m-meta_00000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_1</td>\n",
              "      <td>1.6800</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.300</td>\n",
              "      <td>0.069</td>\n",
              "      <td>0.012</td>\n",
              "      <td>laion400m-meta_00001</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_2</td>\n",
              "      <td>1.6792</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.306</td>\n",
              "      <td>0.080</td>\n",
              "      <td>0.011</td>\n",
              "      <td>laion400m-meta_00002</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_3</td>\n",
              "      <td>1.6797</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.298</td>\n",
              "      <td>0.084</td>\n",
              "      <td>0.014</td>\n",
              "      <td>laion400m-meta_00004</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_4</td>\n",
              "      <td>1.6797</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.303</td>\n",
              "      <td>0.087</td>\n",
              "      <td>0.016</td>\n",
              "      <td>laion400m-meta_00003</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>155</th>\n",
              "      <td>2B</td>\n",
              "      <td>2B_123</td>\n",
              "      <td>2.5123</td>\n",
              "      <td>./LAION-2Ben/laion2B-en/part-00122-5114fd87-29...</td>\n",
              "      <td>0.359</td>\n",
              "      <td>0.122</td>\n",
              "      <td>0.027</td>\n",
              "      <td>laion2B-en_00122</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>156</th>\n",
              "      <td>2B</td>\n",
              "      <td>2B_124</td>\n",
              "      <td>2.5122</td>\n",
              "      <td>./LAION-2Ben/laion2B-en/part-00104-5114fd87-29...</td>\n",
              "      <td>0.339</td>\n",
              "      <td>0.126</td>\n",
              "      <td>0.013</td>\n",
              "      <td>laion2B-en_00104</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>157</th>\n",
              "      <td>2B</td>\n",
              "      <td>2B_125</td>\n",
              "      <td>2.5120</td>\n",
              "      <td>./LAION-2Ben/laion2B-en/part-00102-5114fd87-29...</td>\n",
              "      <td>0.372</td>\n",
              "      <td>0.135</td>\n",
              "      <td>0.032</td>\n",
              "      <td>laion2B-en_00102</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>158</th>\n",
              "      <td>2B</td>\n",
              "      <td>2B_126</td>\n",
              "      <td>2.5120</td>\n",
              "      <td>./LAION-2Ben/laion2B-en/part-00113-5114fd87-29...</td>\n",
              "      <td>0.371</td>\n",
              "      <td>0.098</td>\n",
              "      <td>0.012</td>\n",
              "      <td>laion2B-en_00113</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>159</th>\n",
              "      <td>2B</td>\n",
              "      <td>2B_127</td>\n",
              "      <td>2.5119</td>\n",
              "      <td>./LAION-2Ben/laion2B-en/part-00126-5114fd87-29...</td>\n",
              "      <td>0.327</td>\n",
              "      <td>0.094</td>\n",
              "      <td>0.014</td>\n",
              "      <td>laion2B-en_00126</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>160 rows × 8 columns</p>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0f2b2a80-5feb-47eb-af47-855a432be6a0')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-0f2b2a80-5feb-47eb-af47-855a432be6a0 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-0f2b2a80-5feb-47eb-af47-855a432be6a0');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 2
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "parquet_list=df_parquet.file_loc.values\n",
        "df_parquet.groupby('dataset')['file_size_GB'].describe(), df_parquet.groupby('dataset')['file_size_GB'].sum()"
      ],
      "metadata": {
        "id": "kAelCG5HHATe",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "a3c8f561-8464-4ffc-8edd-5e45f4df3064"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(         count      mean       std     min       25%     50%     75%     max\n",
              " dataset                                                                     \n",
              " 2B       128.0  2.512508  0.000356  2.5116  2.512275  2.5125  2.5128  2.5132\n",
              " 400m      32.0  1.679731  0.000330  1.6792  1.679500  1.6797  1.6800  1.6806,\n",
              " dataset\n",
              " 2B      321.6010\n",
              " 400m     53.7514\n",
              " Name: file_size_GB, dtype: float64)"
            ]
          },
          "metadata": {},
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "parquet_list_400m=parquet_list[0:32]\n",
        "parquet_list_2b=parquet_list[32:]"
      ],
      "metadata": {
        "id": "4Lpht7vY0EWV"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install --quiet pytictoc\n",
        "from pytictoc import TicToc\n",
        "t = TicToc()"
      ],
      "metadata": {
        "id": "jSXfM3bh3c3t"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now, let us look at how the _raw_ parquet files look like:"
      ],
      "metadata": {
        "id": "WvueEwdG0kW-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "t.tic()\n",
        "df_400m_0 = pd.read_parquet(parquet_list_400m[0])\n",
        "print(df_400m_0.shape)\n",
        "t.toc()\n",
        "df_400m_0.head(4)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 254
        },
        "id": "0MiJZQjN0lVs",
        "outputId": "a5f10271-525e-4aa8-ee72-a1702d51431b"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(12933524, 8)\n",
            "Elapsed time is 15.372087 seconds.\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "      SAMPLE_ID                                                URL  \\\n",
              "0  1.581282e+12  http://media.rightmove.co.uk/148k/147518/58718...   \n",
              "1  1.060015e+12  https://thumbs.ebaystatic.com/images/g/DYEAAOS...   \n",
              "2  3.372497e+12  https://farm1.staticflickr.com/784/40182677504...   \n",
              "3  3.820200e+11  https://t2.ftcdn.net/jpg/00/58/35/35/240_F_583...   \n",
              "\n",
              "                                                TEXT  HEIGHT  WIDTH LICENSE  \\\n",
              "0            View EPC Rating Graph for this property   109.0  100.0       ?   \n",
              "1  Silverline Air Framing Nailer 90mm 10 - 12 Gau...   225.0  225.0       ?   \n",
              "2                                    Anhui Mountains   800.0  514.0       ?   \n",
              "3                         Acute pain in a woman knee   257.0  240.0       ?   \n",
              "\n",
              "       NSFW  similarity  \n",
              "0    UNSURE    0.312813  \n",
              "1  UNLIKELY    0.312485  \n",
              "2  UNLIKELY    0.316512  \n",
              "3  UNLIKELY    0.344278  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-9064cdd3-53be-4802-bd05-9f3574e3e80c\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>SAMPLE_ID</th>\n",
              "      <th>URL</th>\n",
              "      <th>TEXT</th>\n",
              "      <th>HEIGHT</th>\n",
              "      <th>WIDTH</th>\n",
              "      <th>LICENSE</th>\n",
              "      <th>NSFW</th>\n",
              "      <th>similarity</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1.581282e+12</td>\n",
              "      <td>http://media.rightmove.co.uk/148k/147518/58718...</td>\n",
              "      <td>View EPC Rating Graph for this property</td>\n",
              "      <td>109.0</td>\n",
              "      <td>100.0</td>\n",
              "      <td>?</td>\n",
              "      <td>UNSURE</td>\n",
              "      <td>0.312813</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1.060015e+12</td>\n",
              "      <td>https://thumbs.ebaystatic.com/images/g/DYEAAOS...</td>\n",
              "      <td>Silverline Air Framing Nailer 90mm 10 - 12 Gau...</td>\n",
              "      <td>225.0</td>\n",
              "      <td>225.0</td>\n",
              "      <td>?</td>\n",
              "      <td>UNLIKELY</td>\n",
              "      <td>0.312485</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3.372497e+12</td>\n",
              "      <td>https://farm1.staticflickr.com/784/40182677504...</td>\n",
              "      <td>Anhui Mountains</td>\n",
              "      <td>800.0</td>\n",
              "      <td>514.0</td>\n",
              "      <td>?</td>\n",
              "      <td>UNLIKELY</td>\n",
              "      <td>0.316512</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3.820200e+11</td>\n",
              "      <td>https://t2.ftcdn.net/jpg/00/58/35/35/240_F_583...</td>\n",
              "      <td>Acute pain in a woman knee</td>\n",
              "      <td>257.0</td>\n",
              "      <td>240.0</td>\n",
              "      <td>?</td>\n",
              "      <td>UNLIKELY</td>\n",
              "      <td>0.344278</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-9064cdd3-53be-4802-bd05-9f3574e3e80c')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-9064cdd3-53be-4802-bd05-9f3574e3e80c button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-9064cdd3-53be-4802-bd05-9f3574e3e80c');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 3: Navigating the data-assets\n",
        "\n",
        "We have curated $(N_{parquet} \\times 4)+1 = 641$ meta-dataset files generated for the 400m and 2B-en datasets using [this](https://github.com/anonymous/hate_scaling/blob/main/code/2_Pysentimiento_400M_2Ben.ipynb) notebook and have shared the assets as a single .zip file \n",
        "[here](https://anonymous.edu/assets/data/papers/hate_detect_laion_400m_2B-en.zip).\n",
        "\n",
        "Download this file and unzip it into a local dir: ```RESULT_DIR='hate_detect_400m_2B-e'```\n",
        "\n",
        "## 3b: The 641 files were generated during our random sampling experiment whose steps were:\n",
        "\n",
        "- Generate 0.1 million random indices per parquet file\n",
        "- Save these 0.1 million random indices of the associated parquet files\n",
        "- Parse the parquet file and extract the image (alt)textual descriptions pertaining to these 1e5 random indices as a numpy tensor\n",
        "- Pass the alt-text tensor through the hate-analyzer\n",
        "- Compute stats and save the results\n",
        "\n",
        "\n",
        "To summarize, we have generated $(N_{parquet} \\times 4)+1 = 641$ files for the two datasets.\n",
        "\n",
        "The types of meta-dataset files are:\n",
        "\n",
        "\n",
        "- ```index_random_{ind_i}.npy```: $N_{parquet}$ random-index files of the naming-format: ```index_random_{ind_i}.npy```. Each of these contain 0.1 million random indices pertaining to the rows of the $ind\\_i^{th}$ parquet file (in ```parquet_list```). Shape: ```(100000,)```\n",
        "- ```prob_hate_{ind_i}.npy```: $N_{parquet}$ _hate-probability-matrix_ files of shape ```(100000, 3)``` in the naming-format of ```prob_hate_{ind_i}.npy``` pertaining to the 0.1 million random-indexed rows of the $ind\\_i^{th}$ parquet file.\n",
        "-```qfr_file_{ind_i}.npy```: $N_{parquet}$ _quality-failure-rate_ files of shape ```(3,)``` containing the  percentage of the 0.1 million random-indexed alt-text text samples in the $ind\\_i^{th}$ parquet file that triggered a P_hateful/P_targeted/P_aggressive value of > 0.5 by the pysentimento detector (See ```np.mean(res_mat_i>0.5,axis=0)*100``` in the cells above)\n",
        "- ```alt_text_{ind_i}.npy``` : $N_{parquet}$ _alt-text_ files of shape ```(100000, 1)``` in the naming-format of ```alt_text_{ind_i}.npy``` pertaining to the 0.1 million random-indexed textual row-contents of the $ind\\_i^{th}$ parquet file (in the TEXT field)\n",
        "- ```qfr_400m_2Ben.npy```: A ($N_{parquet}$, 3) shaped numpy file that contains the parquet-file level mean-hate content.\n",
        "\n",
        "```\n",
        "# Code reference:\n",
        "\n",
        "  np.save(f'./{RESULT_DIR}/index_random_{ind_i}.npy',ind_random_i)\n",
        "  np.save(f'./{RESULT_DIR}/prob_hate_{ind_i}.npy',prob_hate_i)\n",
        "  np.save(f'./{RESULT_DIR}/qfr_file_{ind_i}.npy',qfr_file_i)\n",
        "  np.save(f'./{RESULT_DIR}/alt_text_{ind_i}.npy',texts_np_i)\n",
        "  \n",
        "np.save(f'./{RESULT_DIR}/qfr_400m_2Ben.npy',qfr_all)\n",
        "```\n",
        "\n",
        " Now, before running verification-code that allows the user to understand what these data-assets are, let us install Pysentimiento and get acquainted with it."
      ],
      "metadata": {
        "id": "TFWAAHBYkuot"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install --upgrade accelerate\n",
        "!pip install --quiet pysentimiento==0.5.2\n",
        "##################################\n",
        "import sys\n",
        "print(\"Python version\")\n",
        "# See: https://github.com/pysentimiento/pysentimiento/issues/50\n",
        "print (sys.version)"
      ],
      "metadata": {
        "id": "UpKzQIHKYDW4",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "6237b3e3-b178-4a48-a2db-5c8ec2b35bf2"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.19.0)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate) (1.22.4)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (23.1)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate) (6.0)\n",
            "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.0.1+cu118)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (3.12.0)\n",
            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (4.5.0)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (1.11.1)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (3.1)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (3.1.2)\n",
            "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->accelerate) (2.0.0)\n",
            "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->accelerate) (3.25.2)\n",
            "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.6.0->accelerate) (16.0.5)\n",
            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->accelerate) (2.1.2)\n",
            "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->accelerate) (1.3.0)\n",
            "Python version\n",
            "3.10.11 (main, Apr  5 2023, 14:15:10) [GCC 9.4.0]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 3c: Getting acquainted with ```pysentimiento```:\n",
        "\n",
        "Let us initialize the pysentimiento analyzer and feed in an example to see what it produces ..."
      ],
      "metadata": {
        "id": "GyuyhUST9Yh4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from pysentimiento import create_analyzer\n",
        "analyzer = create_analyzer(task=\"hate_speech\", lang=\"en\")\n",
        "df_400m_0.TEXT.values[0],analyzer.predict(df_400m_0.TEXT.values[0])\n",
        "# ('View EPC Rating Graph for this property',\n",
        "#  AnalyzerOutput(output=[], probas={hateful: 0.015, targeted: 0.012, aggressive: 0.011}))"
      ],
      "metadata": {
        "id": "ll_PoLrVXSuv",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "cde7e3c8-cf14-4b8e-d8fd-6b548c5e2d5f"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('View EPC Rating Graph for this property',\n",
              " AnalyzerOutput(output=[], probas={hateful: 0.015, targeted: 0.012, aggressive: 0.011}))"
            ]
          },
          "metadata": {},
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# The inits:\n",
        "RESULT_DIR='hate_detect_400m_2B-en'\n",
        "np.random.seed(42)\n",
        "N_samples=int(1e5) # Number of samples you have chosen to randomly sample\n",
        "N_parquet=len(parquet_list)\n",
        "N_samples,N_parquet"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9g_3TciLEhL3",
        "outputId": "1aa44cfa-22e1-49c2-a7a7-6ea07d873820"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(100000, 160)"
            ]
          },
          "metadata": {},
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now, let us run a quick survey of the data assets created."
      ],
      "metadata": {
        "id": "cgdHjXdHSopx"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "pd.Series([f.split('_')[0] for f in os.listdir(f'{RESULT_DIR}')]).value_counts()  "
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1L4rTMpVRuBH",
        "outputId": "7794af6d-01ad-4f92-f11e-3df619512f49"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "qfr      161\n",
              "alt      160\n",
              "index    160\n",
              "prob     160\n",
              "dtype: int64"
            ]
          },
          "metadata": {},
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 4: Last step - Verification\n",
        "\n",
        "This entails reloading a random raw parquet files manually, re-running the NLP hate classifier and comparing the output of the classifier to the data-assets already curated"
      ],
      "metadata": {
        "id": "dScMHBHekZpY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df_parquet.head(4)"
      ],
      "metadata": {
        "id": "o7v2Me8KoF61",
        "outputId": "3989380d-6776-41ba-8405-9b80d4109183",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 175
        }
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "  dataset file_id  file_size_GB  \\\n",
              "0    400m  400m_0        1.6794   \n",
              "1    400m  400m_1        1.6800   \n",
              "2    400m  400m_2        1.6792   \n",
              "3    400m  400m_3        1.6797   \n",
              "\n",
              "                                            file_loc  P_hateful  P_targeted  \\\n",
              "0  ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.294       0.097   \n",
              "1  ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.300       0.069   \n",
              "2  ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.306       0.080   \n",
              "3  ./the-eye.eu/public/AI/cah/laion400m-met-relea...      0.298       0.084   \n",
              "\n",
              "   P_aggressive              file_ind  \n",
              "0         0.017  laion400m-meta_00000  \n",
              "1         0.012  laion400m-meta_00001  \n",
              "2         0.011  laion400m-meta_00002  \n",
              "3         0.014  laion400m-meta_00004  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-e83d8323-4412-4771-9dd9-985cf0ae5f84\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>dataset</th>\n",
              "      <th>file_id</th>\n",
              "      <th>file_size_GB</th>\n",
              "      <th>file_loc</th>\n",
              "      <th>P_hateful</th>\n",
              "      <th>P_targeted</th>\n",
              "      <th>P_aggressive</th>\n",
              "      <th>file_ind</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_0</td>\n",
              "      <td>1.6794</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.294</td>\n",
              "      <td>0.097</td>\n",
              "      <td>0.017</td>\n",
              "      <td>laion400m-meta_00000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_1</td>\n",
              "      <td>1.6800</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.300</td>\n",
              "      <td>0.069</td>\n",
              "      <td>0.012</td>\n",
              "      <td>laion400m-meta_00001</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_2</td>\n",
              "      <td>1.6792</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.306</td>\n",
              "      <td>0.080</td>\n",
              "      <td>0.011</td>\n",
              "      <td>laion400m-meta_00002</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>400m</td>\n",
              "      <td>400m_3</td>\n",
              "      <td>1.6797</td>\n",
              "      <td>./the-eye.eu/public/AI/cah/laion400m-met-relea...</td>\n",
              "      <td>0.298</td>\n",
              "      <td>0.084</td>\n",
              "      <td>0.014</td>\n",
              "      <td>laion400m-meta_00004</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e83d8323-4412-4771-9dd9-985cf0ae5f84')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-e83d8323-4412-4771-9dd9-985cf0ae5f84 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-e83d8323-4412-4771-9dd9-985cf0ae5f84');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 11
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Sanity-check cell: pick one parquet file at random, reload its pre-computed results,\n",
        "# re-run the hate-speech inference from scratch and verify that both agree.\n",
        "# Relies on names defined in earlier cells: df_parquet, RESULT_DIR, N_samples, analyzer, TicToc, np, itertools.\n",
        "import gc\n",
        "import pyarrow.parquet as pq\n",
        "\n",
        "# 1: Extract the parquet list from the file_loc column\n",
        "# NOTE(review): the 32-row offset presumably skips the LAION-400M files so that only the 2B-en files remain - confirm against df_parquet\n",
        "parquet_list = df_parquet.file_loc[32:]\n",
        "# 2: Pick a random file\n",
        "ind_random_2b = np.random.choice(len(parquet_list), 1, replace=False)[0]\n",
        "# ind_random_2b is a 0-based position, so select with .iloc: plain [] on an integer-labelled Series\n",
        "# looks up labels, and the slice above keeps the labels of df_parquet instead of restarting at 0\n",
        "file_interested = parquet_list.iloc[ind_random_2b]\n",
        "# 3: Convert file name to ``ind_i`` to fetch the pre-computed results (This should be the same as stored in the file_ind column)\n",
        "path_parts = file_interested.split('/')\n",
        "ind_i_manual = path_parts[-2] + '_' + path_parts[-1].split('-')[1]\n",
        "ind_i = df_parquet.loc[df_parquet.file_loc == file_interested, 'file_ind'].values[0]\n",
        "print(f'Check-1: The manual created index variable is the same as the fetched value? {ind_i==ind_i_manual}')\n",
        "# 4: Use this ind_i to fetch the pertinent pre-computed results\n",
        "ind_vec = np.load(f'./{RESULT_DIR}/index_random_{ind_i}.npy') # The 0.1 million random indices sampled from this parquet file\n",
        "# allow_pickle=True executes pickled payloads on load; only use it on result files you produced or trust\n",
        "alt_text_file = np.load(f'./{RESULT_DIR}/alt_text_{ind_i}.npy', allow_pickle=True) # The 0.1 million alt-text sentences in the 0.1 million random indices sampled from this parquet file\n",
        "prob_hate_file = np.load(f'./{RESULT_DIR}/prob_hate_{ind_i}.npy') # The 0.1 million x 3 matrix of Pysentimiento outputs\n",
        "qfr_file = np.load(f'./{RESULT_DIR}/qfr_file_{ind_i}.npy') # the Quality-Failure-Rate  data pertaining to the chosen parquet file\n",
        "############################################################\n",
        "# 5: Now, let us manually re-read the file and run the inference again to verify.\n",
        "t = TicToc()\n",
        "t.tic()\n",
        "df_i = pq.read_table(file_interested, columns=['TEXT']).to_pandas()\n",
        "t.toc()\n",
        "# 6: Extract the text-description from these indices\n",
        "texts_np_i = df_i.iloc[ind_vec].TEXT.astype(str).values[0:N_samples]\n",
        "check_alt_text = texts_np_i == alt_text_file\n",
        "print(f'Check-2: The manually extracted alt-text is the same as the fetched alt-text vector? {np.all(check_alt_text)}')\n",
        "# 7: Analyze the textual-content ( 1x3 o/p [P_hateful, P_targeted, P_aggressive])\n",
        "# Release the full parquet frame before inference to keep peak memory down\n",
        "del df_i\n",
        "gc.collect()\n",
        "t.tic()\n",
        "results_i = analyzer.predict(texts_np_i)\n",
        "t.toc()\n",
        "# 8: Compute the results\n",
        "# Flatten the per-sample probability dicts into an (N_samples, 3) matrix\n",
        "prob_hate_recomputed = np.array(list(itertools.chain.from_iterable(x.probas.values() for x in results_i))).reshape(N_samples, 3)\n",
        "# Percentage of samples whose probability exceeds 0.5, per output column\n",
        "qfr_recomputed = np.mean(prob_hate_recomputed > 0.5, axis=0) * 100\n",
        "# 9: Final Verification (separate statements so a failure points at the exact check and the cell shows no spurious tuple output):\n",
        "np.testing.assert_allclose(prob_hate_recomputed, prob_hate_file, rtol=1e-03)\n",
        "np.testing.assert_allclose(qfr_recomputed, qfr_file, rtol=1e-03)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 104,
          "referenced_widgets": [
            "9eb6d2b5a8e04059b7fb60e0ae0ce7b3",
            "34b439cd8b894de09de66ec09a072547",
            "4e4dbfcee4c9485bae06ce518a5d4e2a",
            "84b705285e954374a84787e4ac3a6af7",
            "39b1ced9640a4a5b85801d472193bcf4",
            "1935dcca1939454ab027ce7adf492b20",
            "68392799cf964122bd3c9d2344ceea68",
            "50b8b40a6a1b46f6aefa10bbf7febb74",
            "39a99ddce4a04ae2aab3b1216a4b2b7b",
            "ade7e024f9764c9693e457ff04acfe2d",
            "7f14e30db13941bdbc6f4ceb2e4f9ae9"
          ]
        },
        "id": "4XZ123ezlHSx",
        "outputId": "f840ee4a-e8f7-4b5c-d791-fdf27f99dc2d"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Check-1: The manual created index variable is the same as the fetched value? True\n",
            "Elapsed time is 32.536706 seconds.\n",
            "Check-2: The manually extracted alt-text is the same as the fetched alt-text vector? 1.0\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "Map:   0%|          | 0/100000 [00:00<?, ? examples/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "9eb6d2b5a8e04059b7fb60e0ae0ce7b3"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": []
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Elapsed time is 371.504717 seconds.\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(None, None)"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ]
    }
  ]
}