{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_json_file(file_path):\n",
    "    with open(file_path, 'r', encoding=\"utf-8\") as file:\n",
    "        json_data = json.load(file)\n",
    "    return json_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "vcn = read_json_file('\\corpus\\sharegpt\\data\\sg_html_en_long_mine.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "remove_long_conv = []\n",
    "for session_dict in vcn:\n",
    "    if session_dict['id'].endswith('_0'):\n",
    "        remove_long_conv.append(session_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "61616\n"
     ]
    }
   ],
   "source": [
    "print(len(vcn)-len(remove_long_conv))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "52573"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(remove_long_conv)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_seed = 125\n",
    "random.seed(random_seed)\n",
    "random.shuffle(remove_long_conv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': '7T5cv1p_0',\n",
       " 'model': '',\n",
       " 'conversations': [{'from': 'human',\n",
       "   'value': 'create a sequence diagram for change control process using mermaid text'},\n",
       "  {'from': 'gpt',\n",
       "   'value': 'Here is a simple sequence diagram for a change control process using Mermaid text:\\n```makefile\\nsequenceDiagram\\nparticipant Manager\\nparticipant Developer\\nparticipant Tester\\n\\nManager->>Developer: Submit Change Request\\nDeveloper->>Manager: Acknowledge Request\\nDeveloper->>Tester: Perform Impact Analysis\\nTester->>Developer: Report Impact Analysis\\nDeveloper->>Manager: Review Impact Analysis\\nManager->>Developer: Approve/Reject Change\\nDeveloper->>Tester: Implement Approved Changes\\nTester->>Manager: Validate Changes\\nManager->>Developer: Close Change Request\\n```'},\n",
       "  {'from': 'human',\n",
       "   'value': 'change it to include CCB and change requests and change orders'},\n",
       "  {'from': 'gpt',\n",
       "   'value': 'Here is an updated sequence diagram for a change control process that includes a Change Control Board (CCB), change requests, and change orders:\\n```makefile\\nsequenceDiagram\\nparticipant Manager\\nparticipant Developer\\nparticipant Tester\\nparticipant CCB\\n\\nManager->>Developer: Submit Change Request\\nDeveloper->>Manager: Acknowledge Request\\nDeveloper->>Tester: Perform Impact Analysis\\nTester->>Developer: Report Impact Analysis\\nDeveloper->>Manager: Review Impact Analysis\\nManager->>CCB: Review Change Request\\nCCB->>Manager: Approve/Reject Change Request\\nManager->>Developer: Approve/Reject Change Request\\nDeveloper->>Tester: Implement Approved Changes\\nTester->>Manager: Validate Changes\\nManager->>CCB: Review Change Order\\nCCB->>Manager: Approve/Reject Change Order\\nManager->>Developer: Close Change Request\\n```'}]}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check\n",
    "remove_long_conv[0]\n",
    "# {'id': '7T5cv1p_0',"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "vcn_10k = remove_long_conv[:10000]\n",
    "assert len(vcn_10k) == 10000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('vicuna_10k_for_train.json', 'w', encoding=\"utf-8\" ) as file:\n",
    "    json.dump(vcn_10k, file, indent=2, ensure_ascii=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "inquirygpt-cli",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
