{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "82e7c158-c5e5-4cad-a854-8c6979bc68a9",
   "metadata": {},
   "source": [
    "<hr>\n",
    "\n",
    "***Version: 1001.1792024.qut.cs.tnl***\n",
    "\n",
    "***Sk Tanzir Mehedi, PhD Student, QUT***\n",
    "\n",
    "***Supervisory Team: Prof. Raja Jurdak & Dr Chadni Islam***\n",
    "<hr>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5e328f9-9718-4ddc-bda4-45e475fdc622",
   "metadata": {},
   "source": [
    "**----Start of Step 5 Analysis Part 1----**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f5dc51da-7e1b-4a46-8cf2-7b6c2d7fb153",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['SimilarityAlgorithmsOutput']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "file_path = \"D:/Final Version/Step 4 SimilarityAlgorithms-SearchBenignPackages/SimilarityAlgorithms-SearchBenignPackagesOutput.xlsx\"\n",
    "xls = pd.ExcelFile(file_path)\n",
    "\n",
    "xls.sheet_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c22cf6d0-4636-4b97-ad6d-7d979ec3d212",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Malicious Package Name</th>\n",
       "      <th>Malicious Package Version</th>\n",
       "      <th>Benign Package Name SM</th>\n",
       "      <th>Benign Package Name LS</th>\n",
       "      <th>Benign Package Name JS</th>\n",
       "      <th>Benign Package Name CS</th>\n",
       "      <th>Benign Package Version SM</th>\n",
       "      <th>Benign Package Version LS</th>\n",
       "      <th>Benign Package Version JS</th>\n",
       "      <th>Benign Package Version CS</th>\n",
       "      <th>Benign Release Date SM</th>\n",
       "      <th>Benign Release Date LS</th>\n",
       "      <th>Benign Release Date JS</th>\n",
       "      <th>Benign Release Date CS</th>\n",
       "      <th>Similarity Score SM</th>\n",
       "      <th>Similarity Score LS</th>\n",
       "      <th>Similarity Score JS</th>\n",
       "      <th>Similarity Score CS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10Cent10</td>\n",
       "      <td>999.0.4</td>\n",
       "      <td>X11Client</td>\n",
       "      <td>vCenter</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.4</td>\n",
       "      <td>6.7.0.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2019-04-27T12:51:40</td>\n",
       "      <td>2019-05-11T07:33:12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.588235</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10Cent11</td>\n",
       "      <td>999.0.4</td>\n",
       "      <td>X11Client</td>\n",
       "      <td>vCenter</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.4</td>\n",
       "      <td>6.7.0.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2019-04-27T12:51:40</td>\n",
       "      <td>2019-05-11T07:33:12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.588235</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11cent</td>\n",
       "      <td>999.0.0</td>\n",
       "      <td>cent</td>\n",
       "      <td>scent</td>\n",
       "      <td>cent</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11cent</td>\n",
       "      <td>999.0.1</td>\n",
       "      <td>cent</td>\n",
       "      <td>scent</td>\n",
       "      <td>cent</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11cent</td>\n",
       "      <td>999.0.2</td>\n",
       "      <td>cent</td>\n",
       "      <td>scent</td>\n",
       "      <td>cent</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Malicious Package Name Malicious Package Version Benign Package Name SM  \\\n",
       "0               10Cent10                   999.0.4              X11Client   \n",
       "1               10Cent11                   999.0.4              X11Client   \n",
       "2                 11cent                   999.0.0                   cent   \n",
       "3                 11cent                   999.0.1                   cent   \n",
       "4                 11cent                   999.0.2                   cent   \n",
       "\n",
       "  Benign Package Name LS Benign Package Name JS Benign Package Name CS  \\\n",
       "0                vCenter                    NaN                    NaN   \n",
       "1                vCenter                    NaN                    NaN   \n",
       "2                  scent                   cent                    NaN   \n",
       "3                  scent                   cent                    NaN   \n",
       "4                  scent                   cent                    NaN   \n",
       "\n",
       "  Benign Package Version SM Benign Package Version LS  \\\n",
       "0                       1.4                   6.7.0.2   \n",
       "1                       1.4                   6.7.0.2   \n",
       "2                   5.0.0b1                     0.1.0   \n",
       "3                   5.0.0b1                     0.1.0   \n",
       "4                   5.0.0b1                     0.1.0   \n",
       "\n",
       "  Benign Package Version JS Benign Package Version CS Benign Release Date SM  \\\n",
       "0                       NaN                       NaN    2019-04-27T12:51:40   \n",
       "1                       NaN                       NaN    2019-04-27T12:51:40   \n",
       "2                   5.0.0b1                       NaN    2024-03-09T07:40:58   \n",
       "3                   5.0.0b1                       NaN    2024-03-09T07:40:58   \n",
       "4                   5.0.0b1                       NaN    2024-03-09T07:40:58   \n",
       "\n",
       "  Benign Release Date LS Benign Release Date JS Benign Release Date CS  \\\n",
       "0    2019-05-11T07:33:12                    NaN                    NaN   \n",
       "1    2019-05-11T07:33:12                    NaN                    NaN   \n",
       "2                    NaN    2024-03-09T07:40:58                    NaN   \n",
       "3                    NaN    2024-03-09T07:40:58                    NaN   \n",
       "4                    NaN    2024-03-09T07:40:58                    NaN   \n",
       "\n",
       "   Similarity Score SM  Similarity Score LS  Similarity Score JS  \\\n",
       "0             0.588235             0.500000                 0.00   \n",
       "1             0.588235             0.500000                 0.00   \n",
       "2             0.800000             0.666667                 0.25   \n",
       "3             0.800000             0.666667                 0.25   \n",
       "4             0.800000             0.666667                 0.25   \n",
       "\n",
       "   Similarity Score CS  \n",
       "0                  0.0  \n",
       "1                  0.0  \n",
       "2                  0.0  \n",
       "3                  0.0  \n",
       "4                  0.0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel(xls, sheet_name='SimilarityAlgorithmsOutput')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "3b04d7a8-04a8-4a60-903c-a1781f8021e8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ConsiderableBenignPackages.xlsx'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def combine_benign_packages(row):\n",
    "    packages = [\n",
    "        (row['Benign Package Name SM'], row['Benign Package Version SM']),\n",
    "        (row['Benign Package Name LS'], row['Benign Package Version LS']),\n",
    "        (row['Benign Package Name JS'], row['Benign Package Version JS']),\n",
    "        (row['Benign Package Name CS'], row['Benign Package Version CS']),\n",
    "    ]\n",
    "    packages = [f\"{name} ({version})\" for name, version in packages if pd.notna(name) and pd.notna(version)]\n",
    "    \n",
    "    if len(packages) > 0:\n",
    "        return ', '.join(packages)\n",
    "    else:\n",
    "        return None\n",
    "\n",
    "# Create the new \"Combined Benign Packages\" column\n",
    "df['Combined Benign Packages'] = df.apply(combine_benign_packages, axis=1)\n",
    "\n",
    "\n",
    "def combine_benign_packages_unique(row):\n",
    "    packages = [\n",
    "        (row['Benign Package Name SM'], row['Benign Package Version SM']),\n",
    "        (row['Benign Package Name LS'], row['Benign Package Version LS']),\n",
    "        (row['Benign Package Name JS'], row['Benign Package Version JS']),\n",
    "        (row['Benign Package Name CS'], row['Benign Package Version CS']),\n",
    "    ]\n",
    "    packages = [f\"{name} ({version})\" for name, version in packages if pd.notna(name) and pd.notna(version)]\n",
    "    \n",
    "    # Add the newly created 'Combined Benign Packages' to the list\n",
    "    if pd.notna(row['Combined Benign Packages']):\n",
    "        combined_packages = row['Combined Benign Packages'].split(', ')\n",
    "        packages.extend(combined_packages)\n",
    "    \n",
    "    # Ensure unique packages\n",
    "    unique_packages = list(set(packages))\n",
    "    \n",
    "    if len(unique_packages) == 1:\n",
    "        return unique_packages[0]\n",
    "    else:\n",
    "        return ', '.join(unique_packages)\n",
    "\n",
    "# Apply the function to the input DataFrame to get combined unique packages\n",
    "df['Combined Benign Packages Unique'] = df.apply(combine_benign_packages_unique, axis=1)\n",
    "\n",
    "# Reorder the columns to move \"Combined Benign Packages Unique\" to the last position\n",
    "columns = list(df.columns)\n",
    "columns.append(columns.pop(columns.index('Combined Benign Packages Unique')))\n",
    "df = df[columns]\n",
    "\n",
    "# Write the result to the output file path\n",
    "output_file_path = 'ConsiderableBenignPackages.xlsx'\n",
    "df.to_excel(output_file_path, index=False)\n",
    "\n",
    "output_file_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e4629fdf-217b-4ba8-b0c4-3532971532d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Malicious Package Name</th>\n",
       "      <th>Malicious Package Version</th>\n",
       "      <th>Benign Package Name SM</th>\n",
       "      <th>Benign Package Name LS</th>\n",
       "      <th>Benign Package Name JS</th>\n",
       "      <th>Benign Package Name CS</th>\n",
       "      <th>Benign Package Version SM</th>\n",
       "      <th>Benign Package Version LS</th>\n",
       "      <th>Benign Package Version JS</th>\n",
       "      <th>Benign Package Version CS</th>\n",
       "      <th>Benign Release Date SM</th>\n",
       "      <th>Benign Release Date LS</th>\n",
       "      <th>Benign Release Date JS</th>\n",
       "      <th>Benign Release Date CS</th>\n",
       "      <th>Similarity Score SM</th>\n",
       "      <th>Similarity Score LS</th>\n",
       "      <th>Similarity Score JS</th>\n",
       "      <th>Similarity Score CS</th>\n",
       "      <th>Combined Benign Packages</th>\n",
       "      <th>Combined Benign Packages Unique</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10Cent10</td>\n",
       "      <td>999.0.4</td>\n",
       "      <td>X11Client</td>\n",
       "      <td>vCenter</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.4</td>\n",
       "      <td>6.7.0.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2019-04-27T12:51:40</td>\n",
       "      <td>2019-05-11T07:33:12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.588235</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>X11Client (1.4), vCenter (6.7.0.2)</td>\n",
       "      <td>X11Client (1.4), vCenter (6.7.0.2)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10Cent11</td>\n",
       "      <td>999.0.4</td>\n",
       "      <td>X11Client</td>\n",
       "      <td>vCenter</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.4</td>\n",
       "      <td>6.7.0.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2019-04-27T12:51:40</td>\n",
       "      <td>2019-05-11T07:33:12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.588235</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>X11Client (1.4), vCenter (6.7.0.2)</td>\n",
       "      <td>X11Client (1.4), vCenter (6.7.0.2)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>11cent</td>\n",
       "      <td>999.0.0</td>\n",
       "      <td>cent</td>\n",
       "      <td>scent</td>\n",
       "      <td>cent</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>cent (5.0.0b1), scent (0.1.0), cent (5.0.0b1)</td>\n",
       "      <td>scent (0.1.0), cent (5.0.0b1)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11cent</td>\n",
       "      <td>999.0.1</td>\n",
       "      <td>cent</td>\n",
       "      <td>scent</td>\n",
       "      <td>cent</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>cent (5.0.0b1), scent (0.1.0), cent (5.0.0b1)</td>\n",
       "      <td>scent (0.1.0), cent (5.0.0b1)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11cent</td>\n",
       "      <td>999.0.2</td>\n",
       "      <td>cent</td>\n",
       "      <td>scent</td>\n",
       "      <td>cent</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>5.0.0b1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2024-03-09T07:40:58</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>cent (5.0.0b1), scent (0.1.0), cent (5.0.0b1)</td>\n",
       "      <td>scent (0.1.0), cent (5.0.0b1)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7122</th>\n",
       "      <td>zuppa</td>\n",
       "      <td>0.0.1</td>\n",
       "      <td>zupa</td>\n",
       "      <td>zupa</td>\n",
       "      <td>SUPPA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0.2</td>\n",
       "      <td>0.0.2</td>\n",
       "      <td>2.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-07-16T05:59:06</td>\n",
       "      <td>2023-07-16T05:59:06</td>\n",
       "      <td>2018-02-28T17:07:29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>zupa (0.0.2), zupa (0.0.2), SUPPA (2.3)</td>\n",
       "      <td>SUPPA (2.3), zupa (0.0.2)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7123</th>\n",
       "      <td>zuppa</td>\n",
       "      <td>0.0.2</td>\n",
       "      <td>zupa</td>\n",
       "      <td>zupa</td>\n",
       "      <td>SUPPA</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0.2</td>\n",
       "      <td>0.0.2</td>\n",
       "      <td>2.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2023-07-16T05:59:06</td>\n",
       "      <td>2023-07-16T05:59:06</td>\n",
       "      <td>2018-02-28T17:07:29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>zupa (0.0.2), zupa (0.0.2), SUPPA (2.3)</td>\n",
       "      <td>SUPPA (2.3), zupa (0.0.2)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7124</th>\n",
       "      <td>zwhrce</td>\n",
       "      <td>0.0.1</td>\n",
       "      <td>tzwhere</td>\n",
       "      <td>tzwhere</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.0.3</td>\n",
       "      <td>3.0.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2017-08-24T14:18:13</td>\n",
       "      <td>2017-08-24T14:18:13</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.769231</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>tzwhere (3.0.3), tzwhere (3.0.3)</td>\n",
       "      <td>tzwhere (3.0.3)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7125</th>\n",
       "      <td>zydnitro</td>\n",
       "      <td>1.0</td>\n",
       "      <td>pynitro</td>\n",
       "      <td>pynitro</td>\n",
       "      <td>nitro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>0.1.0</td>\n",
       "      <td>0.0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2022-06-24T10:40:35</td>\n",
       "      <td>2022-06-24T10:40:35</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.800000</td>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.0</td>\n",
       "      <td>pynitro (0.1.0), pynitro (0.1.0), nitro (0.0.0)</td>\n",
       "      <td>pynitro (0.1.0), nitro (0.0.0)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7126</th>\n",
       "      <td>zyqnuutupjerllnbxaeq</td>\n",
       "      <td>0.0.1</td>\n",
       "      <td>nutella</td>\n",
       "      <td>pyqt-timer-label</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0.0</td>\n",
       "      <td>0.0.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-03-13T19:29:22</td>\n",
       "      <td>2022-05-12T10:06:02</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.518519</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>nutella (0.0.0), pyqt-timer-label (0.0.1)</td>\n",
       "      <td>pyqt-timer-label (0.0.1), nutella (0.0.0)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7127 rows × 20 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Malicious Package Name Malicious Package Version Benign Package Name SM  \\\n",
       "0                  10Cent10                   999.0.4              X11Client   \n",
       "1                  10Cent11                   999.0.4              X11Client   \n",
       "2                    11cent                   999.0.0                   cent   \n",
       "3                    11cent                   999.0.1                   cent   \n",
       "4                    11cent                   999.0.2                   cent   \n",
       "...                     ...                       ...                    ...   \n",
       "7122                  zuppa                     0.0.1                   zupa   \n",
       "7123                  zuppa                     0.0.2                   zupa   \n",
       "7124                 zwhrce                     0.0.1                tzwhere   \n",
       "7125               zydnitro                       1.0                pynitro   \n",
       "7126   zyqnuutupjerllnbxaeq                     0.0.1                nutella   \n",
       "\n",
       "     Benign Package Name LS Benign Package Name JS Benign Package Name CS  \\\n",
       "0                   vCenter                    NaN                    NaN   \n",
       "1                   vCenter                    NaN                    NaN   \n",
       "2                     scent                   cent                    NaN   \n",
       "3                     scent                   cent                    NaN   \n",
       "4                     scent                   cent                    NaN   \n",
       "...                     ...                    ...                    ...   \n",
       "7122                   zupa                  SUPPA                    NaN   \n",
       "7123                   zupa                  SUPPA                    NaN   \n",
       "7124                tzwhere                    NaN                    NaN   \n",
       "7125                pynitro                  nitro                    NaN   \n",
       "7126       pyqt-timer-label                    NaN                    NaN   \n",
       "\n",
       "     Benign Package Version SM Benign Package Version LS  \\\n",
       "0                          1.4                   6.7.0.2   \n",
       "1                          1.4                   6.7.0.2   \n",
       "2                      5.0.0b1                     0.1.0   \n",
       "3                      5.0.0b1                     0.1.0   \n",
       "4                      5.0.0b1                     0.1.0   \n",
       "...                        ...                       ...   \n",
       "7122                     0.0.2                     0.0.2   \n",
       "7123                     0.0.2                     0.0.2   \n",
       "7124                     3.0.3                     3.0.3   \n",
       "7125                     0.1.0                     0.1.0   \n",
       "7126                     0.0.0                     0.0.1   \n",
       "\n",
       "     Benign Package Version JS Benign Package Version CS  \\\n",
       "0                          NaN                       NaN   \n",
       "1                          NaN                       NaN   \n",
       "2                      5.0.0b1                       NaN   \n",
       "3                      5.0.0b1                       NaN   \n",
       "4                      5.0.0b1                       NaN   \n",
       "...                        ...                       ...   \n",
       "7122                       2.3                       NaN   \n",
       "7123                       2.3                       NaN   \n",
       "7124                       NaN                       NaN   \n",
       "7125                     0.0.0                       NaN   \n",
       "7126                       NaN                       NaN   \n",
       "\n",
       "     Benign Release Date SM Benign Release Date LS Benign Release Date JS  \\\n",
       "0       2019-04-27T12:51:40    2019-05-11T07:33:12                    NaN   \n",
       "1       2019-04-27T12:51:40    2019-05-11T07:33:12                    NaN   \n",
       "2       2024-03-09T07:40:58                    NaN    2024-03-09T07:40:58   \n",
       "3       2024-03-09T07:40:58                    NaN    2024-03-09T07:40:58   \n",
       "4       2024-03-09T07:40:58                    NaN    2024-03-09T07:40:58   \n",
       "...                     ...                    ...                    ...   \n",
       "7122    2023-07-16T05:59:06    2023-07-16T05:59:06    2018-02-28T17:07:29   \n",
       "7123    2023-07-16T05:59:06    2023-07-16T05:59:06    2018-02-28T17:07:29   \n",
       "7124    2017-08-24T14:18:13    2017-08-24T14:18:13                    NaN   \n",
       "7125    2022-06-24T10:40:35    2022-06-24T10:40:35                    NaN   \n",
       "7126    2018-03-13T19:29:22    2022-05-12T10:06:02                    NaN   \n",
       "\n",
       "     Benign Release Date CS  Similarity Score SM  Similarity Score LS  \\\n",
       "0                       NaN             0.588235             0.500000   \n",
       "1                       NaN             0.588235             0.500000   \n",
       "2                       NaN             0.800000             0.666667   \n",
       "3                       NaN             0.800000             0.666667   \n",
       "4                       NaN             0.800000             0.666667   \n",
       "...                     ...                  ...                  ...   \n",
       "7122                    NaN             0.888889             0.800000   \n",
       "7123                    NaN             0.888889             0.800000   \n",
       "7124                    NaN             0.769231             0.571429   \n",
       "7125                    NaN             0.800000             0.750000   \n",
       "7126                    NaN             0.518519             0.400000   \n",
       "\n",
       "      Similarity Score JS  Similarity Score CS  \\\n",
       "0                    0.00                  0.0   \n",
       "1                    0.00                  0.0   \n",
       "2                    0.25                  0.0   \n",
       "3                    0.25                  0.0   \n",
       "4                    0.25                  0.0   \n",
       "...                   ...                  ...   \n",
       "7122                 0.25                  0.0   \n",
       "7123                 0.25                  0.0   \n",
       "7124                 0.00                  0.0   \n",
       "7125                 0.25                  0.0   \n",
       "7126                 0.00                  0.0   \n",
       "\n",
       "                             Combined Benign Packages  \\\n",
       "0                  X11Client (1.4), vCenter (6.7.0.2)   \n",
       "1                  X11Client (1.4), vCenter (6.7.0.2)   \n",
       "2       cent (5.0.0b1), scent (0.1.0), cent (5.0.0b1)   \n",
       "3       cent (5.0.0b1), scent (0.1.0), cent (5.0.0b1)   \n",
       "4       cent (5.0.0b1), scent (0.1.0), cent (5.0.0b1)   \n",
       "...                                               ...   \n",
       "7122          zupa (0.0.2), zupa (0.0.2), SUPPA (2.3)   \n",
       "7123          zupa (0.0.2), zupa (0.0.2), SUPPA (2.3)   \n",
       "7124                 tzwhere (3.0.3), tzwhere (3.0.3)   \n",
       "7125  pynitro (0.1.0), pynitro (0.1.0), nitro (0.0.0)   \n",
       "7126        nutella (0.0.0), pyqt-timer-label (0.0.1)   \n",
       "\n",
       "                Combined Benign Packages Unique  \n",
       "0            X11Client (1.4), vCenter (6.7.0.2)  \n",
       "1            X11Client (1.4), vCenter (6.7.0.2)  \n",
       "2                 scent (0.1.0), cent (5.0.0b1)  \n",
       "3                 scent (0.1.0), cent (5.0.0b1)  \n",
       "4                 scent (0.1.0), cent (5.0.0b1)  \n",
       "...                                         ...  \n",
       "7122                  SUPPA (2.3), zupa (0.0.2)  \n",
       "7123                  SUPPA (2.3), zupa (0.0.2)  \n",
       "7124                            tzwhere (3.0.3)  \n",
       "7125             pynitro (0.1.0), nitro (0.0.0)  \n",
       "7126  pyqt-timer-label (0.0.1), nutella (0.0.0)  \n",
       "\n",
       "[7127 rows x 20 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "211b0546-793c-4711-9882-3686fe33af3c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ConsiderableUniqueBenignPackagesWithVersions.xlsx'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "benign_sm = df[['Benign Package Name SM', 'Benign Package Version SM']].dropna().rename(\n",
    "    columns={'Benign Package Name SM': 'Benign Package Name', 'Benign Package Version SM': 'Benign Package Version'})\n",
    "\n",
    "benign_ls = df[['Benign Package Name LS', 'Benign Package Version LS']].dropna().rename(\n",
    "    columns={'Benign Package Name LS': 'Benign Package Name', 'Benign Package Version LS': 'Benign Package Version'})\n",
    "\n",
    "benign_js = df[['Benign Package Name JS', 'Benign Package Version JS']].dropna().rename(\n",
    "    columns={'Benign Package Name JS': 'Benign Package Name', 'Benign Package Version JS': 'Benign Package Version'})\n",
    "\n",
    "benign_cs = df[['Benign Package Name CS', 'Benign Package Version CS']].dropna().rename(\n",
    "    columns={'Benign Package Name CS': 'Benign Package Name', 'Benign Package Version CS': 'Benign Package Version'})\n",
    "\n",
    "benign_packages_with_versions = pd.concat([benign_sm, benign_ls, benign_js, benign_cs])\n",
    "\n",
    "# Drop duplicates to ensure only unique benign package names and versions\n",
    "benign_packages_with_versions_unique = benign_packages_with_versions.drop_duplicates()\n",
    "\n",
    "# Save the unique benign packages and versions to an Excel file\n",
    "output_benign_packages_file_fixed = 'ConsiderableUniqueBenignPackagesWithVersions.xlsx'\n",
    "benign_packages_with_versions_unique.to_excel(output_benign_packages_file_fixed, index=False)\n",
    "\n",
    "output_benign_packages_file_fixed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c824419e-e031-4df7-a40e-a418b0ed55a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6900"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the number of unique benign packages with their versions\n",
    "unique_benign_packages_count = benign_packages_with_versions_unique.shape[0]\n",
    "unique_benign_packages_count"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1ceb3d5-0fb0-4e2b-9685-842914da74ec",
   "metadata": {},
   "source": [
    "**----End of Step 5 Analysis Part 1----**"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
