{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Getting started with OpenAssistant OASST1 data\n",
    "\n",
    "- https://huggingface.co/datasets/OpenAssistant/oasst1"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/master/notebooks/openassistant-oasst1/getting-started.ipynb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If running in Colab, uncomment and run the line below to install the dependencies.\n",
    "# (%pip installs into the environment of the running kernel.)\n",
    "#%pip install datasets pandas treelib"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\andre\\Documents\\repos\\Open-Assistant\\venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "from treelib import Tree\n",
    "\n",
    "# set some pandas options to make the output more readable\n",
    "pd.set_option(\"display.max_rows\", 500)\n",
    "pd.set_option(\"display.max_columns\", 500)\n",
    "pd.set_option(\"display.width\", 1000)\n",
    "\n",
    "\n",
    "def add_tree_level(df):\n",
    "    \"\"\"helper function to add tree level to a df\"\"\"\n",
    "\n",
    "    # if tree level already exists, return df\n",
    "    if \"tree_level\" in df.columns:\n",
    "        return df\n",
    "\n",
    "    else:\n",
    "        tree_level_map = {}\n",
    "\n",
    "        # iterate over rows in df\n",
    "        for i, row in df.iterrows():\n",
    "            message_id = row[\"message_id\"]\n",
    "            parent_id = row[\"parent_id\"]\n",
    "\n",
    "            # if parent_id is None, then it is a root message\n",
    "            if parent_id is None:\n",
    "                tree_level_map[message_id] = 0\n",
    "            # if parent_id is the same as message_tree_id, then it is a direct reply to the root message\n",
    "            elif parent_id == row[\"message_tree_id\"]:\n",
    "                tree_level_map[message_id] = 1\n",
    "            # else just look up the tree level of the parent_id and add 1\n",
    "            else:\n",
    "                tree_level_map[message_id] = tree_level_map[parent_id] + 1\n",
    "\n",
    "        # create a df from the tree_level_map and merge it with the original df\n",
    "        df_tree_level_map = (\n",
    "            pd.DataFrame.from_dict(tree_level_map, orient=\"index\", columns=[\"tree_level\"])\n",
    "            .reset_index()\n",
    "            .rename(columns={\"index\": \"message_id\"})\n",
    "        )\n",
    "\n",
    "        return df.merge(df_tree_level_map, on=\"message_id\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset parquet (C:/Users/andre/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-ea605663b798f601/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n",
      "100%|██████████| 2/2 [00:00<00:00, 95.13it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],\n",
      "        num_rows: 84437\n",
      "    })\n",
      "    validation: Dataset({\n",
      "        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],\n",
      "        num_rows: 4401\n",
      "    })\n",
      "})\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# load the OASST1 dataset from Hugging Face datasets (a locally cached copy is reused when one is found)\n",
    "ds = load_dataset(\"OpenAssistant/oasst1\")\n",
    "# print the available splits with their features and row counts\n",
    "print(ds)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Pandas Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# let's convert the train split to a pandas DataFrame\n",
    "df = ds[\"train\"].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 84437 entries, 0 to 84436\n",
      "Data columns (total 18 columns):\n",
      " #   Column           Non-Null Count  Dtype  \n",
      "---  ------           --------------  -----  \n",
      " 0   message_id       84437 non-null  object \n",
      " 1   parent_id        74591 non-null  object \n",
      " 2   user_id          84437 non-null  object \n",
      " 3   created_date     84437 non-null  object \n",
      " 4   text             84437 non-null  object \n",
      " 5   role             84437 non-null  object \n",
      " 6   lang             84437 non-null  object \n",
      " 7   review_count     84437 non-null  int32  \n",
      " 8   review_result    83732 non-null  object \n",
      " 9   deleted          84437 non-null  bool   \n",
      " 10  rank             48730 non-null  float64\n",
      " 11  synthetic        84437 non-null  bool   \n",
      " 12  model_name       0 non-null      object \n",
      " 13  detoxify         72297 non-null  object \n",
      " 14  message_tree_id  84437 non-null  object \n",
      " 15  tree_state       84437 non-null  object \n",
      " 16  emojis           71496 non-null  object \n",
      " 17  labels           84199 non-null  object \n",
      "dtypes: bool(2), float64(1), int32(1), object(14)\n",
      "memory usage: 10.1+ MB\n"
     ]
    }
   ],
   "source": [
    "# summarize the df: per-column non-null counts and dtypes, plus overall memory usage\n",
    "df.info(verbose=True, memory_usage=True, show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{47284: {'message_id': '9303a17a-a84a-42a9-87b3-5a9daa7c6827',\n",
       "  'parent_id': '1b70b9ff-bc39-4264-a96d-ae186a993917',\n",
       "  'user_id': 'e2abe732-48a7-48d3-9de8-5af8cb96d029',\n",
       "  'created_date': '2023-02-09T19:27:48.959399+00:00',\n",
       "  'text': 'Una paraula amb les 5 vocals, que no comenci amb \"a\" i que no sigui un verb és, per exemple, minotaure.',\n",
       "  'role': 'assistant',\n",
       "  'lang': 'ca',\n",
       "  'review_count': 3,\n",
       "  'review_result': True,\n",
       "  'deleted': False,\n",
       "  'rank': 1.0,\n",
       "  'synthetic': False,\n",
       "  'model_name': None,\n",
       "  'detoxify': None,\n",
       "  'message_tree_id': '3ac3157d-5057-43be-b975-2f7665063ca8',\n",
       "  'tree_state': 'ready_for_export',\n",
       "  'emojis': {'name': array(['+1'], dtype=object), 'count': array([2])},\n",
       "  'labels': {'name': array(['spam', 'fails_task', 'lang_mismatch', 'pii', 'not_appropriate',\n",
       "          'hate_speech', 'sexual_content', 'quality', 'toxicity', 'humor',\n",
       "          'helpfulness', 'creativity', 'violence'], dtype=object),\n",
       "   'value': array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.75, 0.  , 1.  , 1.  ,\n",
       "          1.  , 0.  ]),\n",
       "   'count': array([3, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1])}}}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# look at one randomly sampled row as a dict of {row index: {column: value}} so every field is readable\n",
    "# (no random_state is set, so a different row is shown on each run)\n",
    "df.sample(1).transpose().to_dict()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random Message Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1ba945f1-1d58-4815-b9f8-dacaca15180a\n"
     ]
    }
   ],
   "source": [
    "# let's grab a random message tree: sample one row and take its message_tree_id\n",
    "# (unseeded, so the tree changes between runs; rows are sampled, so trees with more messages are picked more often)\n",
    "message_tree_id = df[\"message_tree_id\"].sample(1).values[0]\n",
    "print(message_tree_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>message_id</th>\n",
       "      <th>parent_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>created_date</th>\n",
       "      <th>text</th>\n",
       "      <th>role</th>\n",
       "      <th>lang</th>\n",
       "      <th>review_count</th>\n",
       "      <th>review_result</th>\n",
       "      <th>deleted</th>\n",
       "      <th>rank</th>\n",
       "      <th>synthetic</th>\n",
       "      <th>model_name</th>\n",
       "      <th>detoxify</th>\n",
       "      <th>message_tree_id</th>\n",
       "      <th>tree_state</th>\n",
       "      <th>emojis</th>\n",
       "      <th>labels</th>\n",
       "      <th>tree_level</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>None</td>\n",
       "      <td>77161841-4be2-4f79-8f08-b457b4177ff4</td>\n",
       "      <td>2023-02-04T19:20:58.346053+00:00</td>\n",
       "      <td>Write a ballad on why internet privacy and ope...</td>\n",
       "      <td>prompter</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0006916996790096164, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1', '_skip_reply', '_skip_ranking'...</td>\n",
       "      <td>{'name': ['spam', 'lang_mismatch', 'pii', 'not...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7f5b4e4d-ae41-43b7-a4d4-458cfcb51630</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>e8453d07-9240-49da-8cb5-2f8d79d37e51</td>\n",
       "      <td>2023-02-05T16:42:21.553867+00:00</td>\n",
       "      <td>Verse 1\\nOpen source and privacy,\\nWe need to ...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.00038280943408608437, 'severe_t...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1', '_skip_labeling'], 'count': [2...</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>781e7fe4-c6b3-4119-a585-2646b0684722</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>2fb3c88d-3e2f-4bd2-a643-331dbae8af42</td>\n",
       "      <td>2023-02-05T16:55:57.495596+00:00</td>\n",
       "      <td>There once waw a computer,\\na friendly old com...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>2.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0007086674449965358, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1', '-1'], 'count': [3, 6]}</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>fa911681-4848-4efc-9bfa-2b52dc7372ea</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>490cef32-f0cb-4f01-9270-525e24d561d9</td>\n",
       "      <td>2023-02-05T16:57:07.002312+00:00</td>\n",
       "      <td>Verse 1:\\nIn a world of endless clicks and end...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.00046211303560994565, 'severe_t...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1'], 'count': [10]}</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6852b915-5a2c-4558-ae9c-2c7f5566d8e1</td>\n",
       "      <td>fa911681-4848-4efc-9bfa-2b52dc7372ea</td>\n",
       "      <td>9109eaad-017a-46f9-91c5-231cd95d73d0</td>\n",
       "      <td>2023-02-05T17:28:38.315985+00:00</td>\n",
       "      <td>Change the chorus</td>\n",
       "      <td>prompter</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0005257376469671726, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['_skip_reply', '_skip_ranking'], 'co...</td>\n",
       "      <td>{'name': ['spam', 'lang_mismatch', 'pii', 'not...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>377d805a-e680-4054-919a-270f95e5ec52</td>\n",
       "      <td>fa911681-4848-4efc-9bfa-2b52dc7372ea</td>\n",
       "      <td>828e7359-02f0-4b9a-a60b-94568279436f</td>\n",
       "      <td>2023-02-05T17:31:19.683738+00:00</td>\n",
       "      <td>Can you modify it to carry an opposite message?</td>\n",
       "      <td>prompter</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0003970751422457397, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1'], 'count': [2]}</td>\n",
       "      <td>{'name': ['spam', 'lang_mismatch', 'pii', 'not...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3e03cedc-42ac-4d60-a37d-287666a15fe8</td>\n",
       "      <td>7f5b4e4d-ae41-43b7-a4d4-458cfcb51630</td>\n",
       "      <td>c6546bf5-460a-4f99-b387-f3fe7bddd2f5</td>\n",
       "      <td>2023-02-05T18:19:50.000999+00:00</td>\n",
       "      <td>Looks good but can you explain who takes away ...</td>\n",
       "      <td>prompter</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.00044449075357988477, 'severe_t...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>None</td>\n",
       "      <td>{'name': ['spam', 'lang_mismatch', 'pii', 'not...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1f9ba095-3937-492b-bc65-db313efd6f67</td>\n",
       "      <td>781e7fe4-c6b3-4119-a585-2646b0684722</td>\n",
       "      <td>c6546bf5-460a-4f99-b387-f3fe7bddd2f5</td>\n",
       "      <td>2023-02-05T18:21:20.321209+00:00</td>\n",
       "      <td>You said waw instead of was. Not sure the gram...</td>\n",
       "      <td>prompter</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0006272009341046214, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1', '_skip_ranking'], 'count': [6,...</td>\n",
       "      <td>{'name': ['spam', 'lang_mismatch', 'pii', 'not...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>ebaed177-20ed-4e4b-8d50-b4337c10cc24</td>\n",
       "      <td>6852b915-5a2c-4558-ae9c-2c7f5566d8e1</td>\n",
       "      <td>0b8c1657-1a06-47e4-9b5a-e196729795f3</td>\n",
       "      <td>2023-02-05T19:10:55.869172+00:00</td>\n",
       "      <td>Chorus:\\nOh internet, let freedom be our guide...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>2.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0003795399097725749, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>None</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>43c405d6-437b-4189-8ac5-72eb48357105</td>\n",
       "      <td>6852b915-5a2c-4558-ae9c-2c7f5566d8e1</td>\n",
       "      <td>0dcecbad-be37-4ead-a2e7-af5ac7aedfe2</td>\n",
       "      <td>2023-02-05T19:12:45.246748+00:00</td>\n",
       "      <td>New Chorus:\\nOh internet, where knowledge and ...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.00036214423016645014, 'severe_t...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>None</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>049dfba4-2010-4fb9-8899-86d1985bb31a</td>\n",
       "      <td>6852b915-5a2c-4558-ae9c-2c7f5566d8e1</td>\n",
       "      <td>6f136680-a2fa-4b27-8524-8c061a66c224</td>\n",
       "      <td>2023-02-05T19:13:07.700076+00:00</td>\n",
       "      <td>Chorus:\\nOh internet, where information flows ...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0005320637137629092, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1'], 'count': [3]}</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>cbaf3202-d894-4b4e-994d-721e8cd8b9fb</td>\n",
       "      <td>1f9ba095-3937-492b-bc65-db313efd6f67</td>\n",
       "      <td>cf4470fc-58f0-44ac-ac9b-43a5bfd672f2</td>\n",
       "      <td>2023-02-05T19:51:18.161677+00:00</td>\n",
       "      <td>Sorry, I did indeed misspell \"was\".\\n\\nhere is...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0005069204489700496, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>{'name': ['+1'], 'count': [2]}</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>11dd753c-d1de-4d6b-a223-7fe5bb74d084</td>\n",
       "      <td>1f9ba095-3937-492b-bc65-db313efd6f67</td>\n",
       "      <td>2c96e467-66f0-4be7-9693-bda51356a424</td>\n",
       "      <td>2023-02-05T19:52:47.283553+00:00</td>\n",
       "      <td>There once was a computer,\\na friendly old mac...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>2.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.0005644694319926202, 'severe_to...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>None</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>6a39fde1-daf5-4426-8047-09c531a7689b</td>\n",
       "      <td>1f9ba095-3937-492b-bc65-db313efd6f67</td>\n",
       "      <td>687b1065-fc42-41d5-a38d-2049579de6c1</td>\n",
       "      <td>2023-02-05T20:08:10.060958+00:00</td>\n",
       "      <td>There once was a computer, friendly and old,\\n...</td>\n",
       "      <td>assistant</td>\n",
       "      <td>en</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>1.0</td>\n",
       "      <td>False</td>\n",
       "      <td>None</td>\n",
       "      <td>{'toxicity': 0.000547583622392267, 'severe_tox...</td>\n",
       "      <td>1ba945f1-1d58-4815-b9f8-dacaca15180a</td>\n",
       "      <td>ready_for_export</td>\n",
       "      <td>None</td>\n",
       "      <td>{'name': ['spam', 'fails_task', 'lang_mismatch...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              message_id                             parent_id                               user_id                      created_date                                               text       role lang  review_count review_result  deleted  rank  synthetic model_name                                           detoxify                       message_tree_id        tree_state                                             emojis                                             labels  tree_level\n",
       "0   1ba945f1-1d58-4815-b9f8-dacaca15180a                                  None  77161841-4be2-4f79-8f08-b457b4177ff4  2023-02-04T19:20:58.346053+00:00  Write a ballad on why internet privacy and ope...   prompter   en             3          True    False   NaN      False       None  {'toxicity': 0.0006916996790096164, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export  {'name': ['+1', '_skip_reply', '_skip_ranking'...  {'name': ['spam', 'lang_mismatch', 'pii', 'not...           0\n",
       "1   7f5b4e4d-ae41-43b7-a4d4-458cfcb51630  1ba945f1-1d58-4815-b9f8-dacaca15180a  e8453d07-9240-49da-8cb5-2f8d79d37e51  2023-02-05T16:42:21.553867+00:00  Verse 1\\nOpen source and privacy,\\nWe need to ...  assistant   en             3          True    False   0.0      False       None  {'toxicity': 0.00038280943408608437, 'severe_t...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export  {'name': ['+1', '_skip_labeling'], 'count': [2...  {'name': ['spam', 'fails_task', 'lang_mismatch...           1\n",
       "2   781e7fe4-c6b3-4119-a585-2646b0684722  1ba945f1-1d58-4815-b9f8-dacaca15180a  2fb3c88d-3e2f-4bd2-a643-331dbae8af42  2023-02-05T16:55:57.495596+00:00  There once waw a computer,\\na friendly old com...  assistant   en             3          True    False   2.0      False       None  {'toxicity': 0.0007086674449965358, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export            {'name': ['+1', '-1'], 'count': [3, 6]}  {'name': ['spam', 'fails_task', 'lang_mismatch...           1\n",
       "3   fa911681-4848-4efc-9bfa-2b52dc7372ea  1ba945f1-1d58-4815-b9f8-dacaca15180a  490cef32-f0cb-4f01-9270-525e24d561d9  2023-02-05T16:57:07.002312+00:00  Verse 1:\\nIn a world of endless clicks and end...  assistant   en             3          True    False   1.0      False       None  {'toxicity': 0.00046211303560994565, 'severe_t...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                    {'name': ['+1'], 'count': [10]}  {'name': ['spam', 'fails_task', 'lang_mismatch...           1\n",
       "4   6852b915-5a2c-4558-ae9c-2c7f5566d8e1  fa911681-4848-4efc-9bfa-2b52dc7372ea  9109eaad-017a-46f9-91c5-231cd95d73d0  2023-02-05T17:28:38.315985+00:00                                  Change the chorus   prompter   en             3          True    False   NaN      False       None  {'toxicity': 0.0005257376469671726, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export  {'name': ['_skip_reply', '_skip_ranking'], 'co...  {'name': ['spam', 'lang_mismatch', 'pii', 'not...           2\n",
       "5   377d805a-e680-4054-919a-270f95e5ec52  fa911681-4848-4efc-9bfa-2b52dc7372ea  828e7359-02f0-4b9a-a60b-94568279436f  2023-02-05T17:31:19.683738+00:00    Can you modify it to carry an opposite message?   prompter   en             3          True    False   NaN      False       None  {'toxicity': 0.0003970751422457397, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                     {'name': ['+1'], 'count': [2]}  {'name': ['spam', 'lang_mismatch', 'pii', 'not...           2\n",
       "6   3e03cedc-42ac-4d60-a37d-287666a15fe8  7f5b4e4d-ae41-43b7-a4d4-458cfcb51630  c6546bf5-460a-4f99-b387-f3fe7bddd2f5  2023-02-05T18:19:50.000999+00:00  Looks good but can you explain who takes away ...   prompter   en             3          True    False   NaN      False       None  {'toxicity': 0.00044449075357988477, 'severe_t...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                                               None  {'name': ['spam', 'lang_mismatch', 'pii', 'not...           2\n",
       "7   1f9ba095-3937-492b-bc65-db313efd6f67  781e7fe4-c6b3-4119-a585-2646b0684722  c6546bf5-460a-4f99-b387-f3fe7bddd2f5  2023-02-05T18:21:20.321209+00:00  You said waw instead of was. Not sure the gram...   prompter   en             3          True    False   NaN      False       None  {'toxicity': 0.0006272009341046214, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export  {'name': ['+1', '_skip_ranking'], 'count': [6,...  {'name': ['spam', 'lang_mismatch', 'pii', 'not...           2\n",
       "8   ebaed177-20ed-4e4b-8d50-b4337c10cc24  6852b915-5a2c-4558-ae9c-2c7f5566d8e1  0b8c1657-1a06-47e4-9b5a-e196729795f3  2023-02-05T19:10:55.869172+00:00  Chorus:\\nOh internet, let freedom be our guide...  assistant   en             3          True    False   2.0      False       None  {'toxicity': 0.0003795399097725749, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                                               None  {'name': ['spam', 'fails_task', 'lang_mismatch...           3\n",
       "9   43c405d6-437b-4189-8ac5-72eb48357105  6852b915-5a2c-4558-ae9c-2c7f5566d8e1  0dcecbad-be37-4ead-a2e7-af5ac7aedfe2  2023-02-05T19:12:45.246748+00:00  New Chorus:\\nOh internet, where knowledge and ...  assistant   en             3          True    False   1.0      False       None  {'toxicity': 0.00036214423016645014, 'severe_t...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                                               None  {'name': ['spam', 'fails_task', 'lang_mismatch...           3\n",
       "10  049dfba4-2010-4fb9-8899-86d1985bb31a  6852b915-5a2c-4558-ae9c-2c7f5566d8e1  6f136680-a2fa-4b27-8524-8c061a66c224  2023-02-05T19:13:07.700076+00:00  Chorus:\\nOh internet, where information flows ...  assistant   en             3          True    False   0.0      False       None  {'toxicity': 0.0005320637137629092, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                     {'name': ['+1'], 'count': [3]}  {'name': ['spam', 'fails_task', 'lang_mismatch...           3\n",
       "11  cbaf3202-d894-4b4e-994d-721e8cd8b9fb  1f9ba095-3937-492b-bc65-db313efd6f67  cf4470fc-58f0-44ac-ac9b-43a5bfd672f2  2023-02-05T19:51:18.161677+00:00  Sorry, I did indeed misspell \"was\".\\n\\nhere is...  assistant   en             3          True    False   0.0      False       None  {'toxicity': 0.0005069204489700496, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                     {'name': ['+1'], 'count': [2]}  {'name': ['spam', 'fails_task', 'lang_mismatch...           3\n",
       "12  11dd753c-d1de-4d6b-a223-7fe5bb74d084  1f9ba095-3937-492b-bc65-db313efd6f67  2c96e467-66f0-4be7-9693-bda51356a424  2023-02-05T19:52:47.283553+00:00  There once was a computer,\\na friendly old mac...  assistant   en             3          True    False   2.0      False       None  {'toxicity': 0.0005644694319926202, 'severe_to...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                                               None  {'name': ['spam', 'fails_task', 'lang_mismatch...           3\n",
       "13  6a39fde1-daf5-4426-8047-09c531a7689b  1f9ba095-3937-492b-bc65-db313efd6f67  687b1065-fc42-41d5-a38d-2049579de6c1  2023-02-05T20:08:10.060958+00:00  There once was a computer, friendly and old,\\n...  assistant   en             3          True    False   1.0      False       None  {'toxicity': 0.000547583622392267, 'severe_tox...  1ba945f1-1d58-4815-b9f8-dacaca15180a  ready_for_export                                               None  {'name': ['spam', 'fails_task', 'lang_mismatch...           3"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# look at all data for this message tree\n",
    "df_message_tree = df.query(f\"message_tree_id == '{message_tree_id}'\").sort_values(\"created_date\")\n",
    "\n",
    "# add tree level to df\n",
    "df_message_tree = add_tree_level(df_message_tree)\n",
    "\n",
    "df_message_tree"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Message Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id_tree:\n",
      "1ba945f1-1d58-4815-b9f8-dacaca15180a\n",
      "├── 781e7fe4-c6b3-4119-a585-2646b0684722\n",
      "│   └── 1f9ba095-3937-492b-bc65-db313efd6f67\n",
      "│       ├── 11dd753c-d1de-4d6b-a223-7fe5bb74d084\n",
      "│       ├── 6a39fde1-daf5-4426-8047-09c531a7689b\n",
      "│       └── cbaf3202-d894-4b4e-994d-721e8cd8b9fb\n",
      "├── 7f5b4e4d-ae41-43b7-a4d4-458cfcb51630\n",
      "│   └── 3e03cedc-42ac-4d60-a37d-287666a15fe8\n",
      "└── fa911681-4848-4efc-9bfa-2b52dc7372ea\n",
      "    ├── 377d805a-e680-4054-919a-270f95e5ec52\n",
      "    └── 6852b915-5a2c-4558-ae9c-2c7f5566d8e1\n",
      "        ├── 049dfba4-2010-4fb9-8899-86d1985bb31a\n",
      "        ├── 43c405d6-437b-4189-8ac5-72eb48357105\n",
      "        └── ebaed177-20ed-4e4b-8d50-b4337c10cc24\n",
      "\n",
      "text_tree:\n",
      "Write a ballad on why internet privacy and open-source is cool/important.\n",
      "├── There once waw a computer, a friendly old computer, that just wanted to do its own thing,  it was ru\n",
      "│   └── You said waw instead of was. Not sure the grammar is right.\n",
      "│       ├── Sorry, I did indeed misspell \"was\".  here is a corrected version:  There once was a computer, a frie\n",
      "│       ├── There once was a computer, a friendly old machine, that just wanted to do its own thing.  It was run\n",
      "│       └── There once was a computer, friendly and old, Its XP operating system, with a story to be told. It he\n",
      "├── Verse 1 Open source and privacy, We need to protect our liberties, These freedoms must be kept, So o\n",
      "│   └── Looks good but can you explain who takes away the freedoms?\n",
      "└── Verse 1: In a world of endless clicks and endless screens Where our secrets and our dreams are share\n",
      "    ├── Can you modify it to carry an opposite message?\n",
      "    └── Change the chorus\n",
      "        ├── Chorus: Oh internet, let freedom be our guide With privacy protected and code open wide We'll naviga\n",
      "        ├── Chorus: Oh internet, where information flows like a sea It's up to us to keep it secure and free Wit\n",
      "        └── New Chorus: Oh internet, where knowledge and freedom meet It's up to us to keep it fair and truly ne\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# lets create a tree of message ids\n",
    "id_tree = Tree()\n",
    "# lets create a tree of message texts\n",
    "text_tree = Tree()\n",
    "# lets set a max char length for the text\n",
    "max_char_len = 100\n",
    "\n",
    "# iterate over rows in df_message_tree\n",
    "for i, row in df_message_tree.iterrows():\n",
    "    # grab the message_id, parent_id, text, and parent text\n",
    "    message_id = row[\"message_id\"]\n",
    "    parent_id = row[\"parent_id\"]\n",
    "    text = row[\"text\"]\n",
    "    text_short = text[:max_char_len] if len(text) > max_char_len else text\n",
    "    text_short = text_short.replace(\"\\n\", \" \")\n",
    "    parent_text = (\n",
    "        df_message_tree.query(f\"message_id == '{parent_id}'\")[\"text\"].values[0] if parent_id is not None else \"ROOT\"\n",
    "    )\n",
    "    parent_text_short = parent_text[:max_char_len] if len(parent_text) > max_char_len else parent_text\n",
    "    parent_text_short = parent_text_short.replace(\"\\n\", \" \")\n",
    "\n",
    "    # create a node in the id_tree and text_tree, add row as data in case want it later\n",
    "    id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())\n",
    "\n",
    "    # if parent_id is None, then it is a root message so dont add parent text as is none\n",
    "    if parent_id is None:\n",
    "        text_tree.create_node(text_short, text_short)\n",
    "    # else use the parent text short as the parent\n",
    "    else:\n",
    "        text_tree.create_node(text_short, text_short, parent=parent_text_short)\n",
    "\n",
    "\n",
    "print(\"id_tree:\")\n",
    "id_tree.show()\n",
    "\n",
    "print(\"text_tree:\")\n",
    "text_tree.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "25d5c2324055587ceaeef27650c79ce8358ea61d7689f2e0b8ada5d53f85bce4"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
