{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Steps\n",
    "# 1. Merge the two parts                90664 sessions\n",
    "#     run the merge part\n",
    "# 2. Remove HTML                        76964 sessions\n",
    "#     run remove_html.py\n",
    "# 3. Remove non-English sessions        41304 sessions\n",
    "#     run the clean multi-lang part\n",
    "# 4. Split long conversations           76082 segments   41308 sessions\n",
    "#     run split_long_conversations.py\n",
    "# 5. Remove single-utterance sessions   74898 segments   40994 sessions\n",
    "#     run the clear len(session) == 1 part\n",
    "# 6. Remove complete duplicates         34435 segments   19357 sessions\n",
    "#     run the clean completed duplicates part"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "from tqdm import tqdm\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_json_file(file_path):\n",
    "    \"\"\"Return the object parsed from the UTF-8 encoded JSON file at `file_path`.\"\"\"\n",
    "    with open(file_path, mode='r', encoding=\"utf-8\") as handle:\n",
    "        return json.load(handle)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## merge two parts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load both raw parts; each file presumably holds a JSON list of session dicts (confirm).\n",
    "removed_html_part1 = read_json_file('../sg_90k_part1.json')\n",
    "removed_html_part2 = read_json_file('../sg_90k_part2.json')\n",
    "print(len(removed_html_part1))\n",
    "print(len(removed_html_part2))\n",
    "\n",
    "# Part 1 followed by part 2. Iterating part 1 twice here would duplicate it and\n",
    "# silently drop every session of part 2.\n",
    "merged_html_list = list(removed_html_part1) + list(removed_html_part2)\n",
    "assert len(merged_html_list) == len(removed_html_part1) + len(removed_html_part2)\n",
    "print(len(merged_html_list))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## clean multi-lang"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing per sessions: 100%|██████████| 76964/76964 [00:04<00:00, 16780.20it/s]\n"
     ]
    }
   ],
   "source": [
    "# Keep sessions without any character in U+0080-U+FFFF, plus every session in which a human\n",
    "# turn contains a word starting with 'translat'/'Translat'.\n",
    "# NOTE(review): `test` is not assigned in any cell of this notebook as saved; it must be loaded\n",
    "# beforehand (presumably the output of remove_html.py - confirm).\n",
    "translation_pattern = re.compile(r'\\b[Tt]ranslat\\w+')\n",
    "non_ascii_pattern = re.compile('[\\u0080-\\uFFFF]')\n",
    "\n",
    "remove_multi_lang = []  # despite the name, this collects the sessions that are KEPT\n",
    "cnt_translation_task = 0\n",
    "cnt_non_english = 0\n",
    "for session_dict in tqdm(test, desc=\"Processing per sessions\"):\n",
    "    turns = session_dict['conversations']\n",
    "    # Translation tasks are kept even when they contain non-ASCII text.\n",
    "    if any(translation_pattern.search(turn['value']) for turn in turns if turn['from'] == 'human'):\n",
    "        remove_multi_lang.append(session_dict)\n",
    "        cnt_translation_task += 1\n",
    "        continue\n",
    "    # A character in U+0080-U+FFFF in any turn (human or not) marks the session as non-English.\n",
    "    if any(non_ascii_pattern.search(turn['value']) for turn in turns):\n",
    "        cnt_non_english += 1\n",
    "    else:\n",
    "        remove_multi_lang.append(session_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Remains 41304 sessions.\n",
      "Remove 35660 sessions\n",
      "Remains 1584 translation tasks.\n",
      "Remove 35660 non-english sessions.\n"
     ]
    }
   ],
   "source": [
    "# Summary of the multi-lang filter. Every input session is either kept in `remove_multi_lang`\n",
    "# or counted in `cnt_non_english`, so the input size is derived from those two values instead\n",
    "# of hard-coding the 76964 sessions of one particular run.\n",
    "total_sessions = len(remove_multi_lang) + cnt_non_english\n",
    "print('Remains', len(remove_multi_lang), 'sessions.')\n",
    "print('Remove', str(total_sessions - len(remove_multi_lang)), 'sessions')\n",
    "print('Remains', cnt_translation_task, 'translation tasks.')\n",
    "print('Remove', cnt_non_english, 'non-english sessions.' )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## clear len(session) == 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only sessions whose 'conversations' list has more than one entry.\n",
    "# NOTE(review): `test` is not assigned in any cell of this notebook as saved; it must be loaded\n",
    "# beforehand (presumably the output of split_long_conversations.py - confirm).\n",
    "clear_one = [session_dict for session_dict in test if len(session_dict['conversations']) > 1]\n",
    "print(len(clear_one))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## clean completed duplicates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing per sessions: 100%|██████████| 41304/41304 [00:05<00:00, 7280.75it/s] \n"
     ]
    }
   ],
   "source": [
    "# Drop sessions whose concatenated utterance text has already been seen (first occurrence wins).\n",
    "# NOTE(review): `test` is not assigned in any cell of this notebook as saved; it must be loaded\n",
    "# beforehand with the sessions to de-duplicate.\n",
    "# NOTE(review): utterances are joined without a separator and speaker labels are ignored, so two\n",
    "# different sessions whose texts concatenate to the same string count as duplicates - confirm intended.\n",
    "all_rounds = []      # unique conversation texts in first-seen order\n",
    "seen_rounds = set()  # same texts, hashed: constant-time membership instead of scanning the list\n",
    "clean_duplicates = []\n",
    "for session_dict in tqdm(test, desc=\"Processing per sessions\"):\n",
    "    per_session_conv_str = ''.join(utter_dict['value'] for utter_dict in session_dict['conversations'])\n",
    "    if per_session_conv_str not in seen_rounds:\n",
    "        seen_rounds.add(per_session_conv_str)\n",
    "        all_rounds.append(per_session_conv_str)\n",
    "        clean_duplicates.append(session_dict)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write `final` to final.json as 2-space indented UTF-8 JSON, leaving non-ASCII characters unescaped.\n",
    "# NOTE(review): `final` is not assigned in any cell of this notebook as saved; presumably it is the\n",
    "# result of the last cleaning step that was run - confirm before executing.\n",
    "with open('final.json', mode='w', encoding=\"utf-8\") as out_file:\n",
    "    json.dump(final, out_file, indent=2, ensure_ascii=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "inquirygpt-cli",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
