{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b4e4a068-c004-4136-b14e-04aba7a5d2ac",
   "metadata": {},
   "source": [
    "## DataPreprocessing-2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c8572d82-c801-4b82-9948-1bbee704cd59",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\N11894571\\AppData\\Local\\Temp\\ipykernel_34804\\3523790256.py:4: DtypeWarning: Columns (154) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_path)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Filename</th>\n",
       "      <th>Package_Name</th>\n",
       "      <th>Total System Calls</th>\n",
       "      <th>Unique System Calls</th>\n",
       "      <th>Unique System Calls List</th>\n",
       "      <th>Total Errors</th>\n",
       "      <th>Unique Errors</th>\n",
       "      <th>Unique Errors List</th>\n",
       "      <th>File Operations</th>\n",
       "      <th>Unique File Operations</th>\n",
       "      <th>...</th>\n",
       "      <th>Pattern 2</th>\n",
       "      <th>Pattern 3</th>\n",
       "      <th>Pattern 4</th>\n",
       "      <th>Pattern 5</th>\n",
       "      <th>Advanced Pattern 1</th>\n",
       "      <th>Advanced Pattern 2</th>\n",
       "      <th>Advanced Pattern 3</th>\n",
       "      <th>Advanced Pattern 4</th>\n",
       "      <th>Advanced Pattern 5</th>\n",
       "      <th>Level</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1337x_strace_output_24584.24584</td>\n",
       "      <td>1337x</td>\n",
       "      <td>28644</td>\n",
       "      <td>47</td>\n",
       "      <td>newfstatat, openat, fstat, write, close, renam...</td>\n",
       "      <td>3846</td>\n",
       "      <td>8</td>\n",
       "      <td>ENOTTY, ENOENT, else, ESPIPE, ENETUNREACH, EIN...</td>\n",
       "      <td>25079</td>\n",
       "      <td>14</td>\n",
       "      <td>...</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat</td>\n",
       "      <td>lseek -&gt; lseek -&gt; lseek</td>\n",
       "      <td>fstat -&gt; ioctl -&gt; lseek</td>\n",
       "      <td>openat -&gt; fstat -&gt; ioctl</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; newfstatat -&gt; no-e...</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; openat -&gt; no-error...</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat -&gt; no-error -&gt; n...</td>\n",
       "      <td>lseek -&gt; lseek -&gt; lseek -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>openat -&gt; fstat -&gt; ioctl -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3m_strace_output_11938.11938</td>\n",
       "      <td>3m</td>\n",
       "      <td>65205</td>\n",
       "      <td>49</td>\n",
       "      <td>restart_syscall, read, newfstatat, futex, open...</td>\n",
       "      <td>5970</td>\n",
       "      <td>7</td>\n",
       "      <td>ENOENT, EAGAIN, ENOTTY, ENETUNREACH, EINPROGRE...</td>\n",
       "      <td>52667</td>\n",
       "      <td>12</td>\n",
       "      <td>...</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat</td>\n",
       "      <td>geteuid -&gt; chown -&gt; utimensat</td>\n",
       "      <td>fstat -&gt; ioctl -&gt; lseek</td>\n",
       "      <td>openat -&gt; fstat -&gt; ioctl</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; newfstatat -&gt; no-e...</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat -&gt; no-error -&gt; n...</td>\n",
       "      <td>geteuid -&gt; chown -&gt; utimensat -&gt; no-error -&gt; n...</td>\n",
       "      <td>fstat -&gt; ioctl -&gt; lseek -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>openat -&gt; fstat -&gt; ioctl -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3-py_strace_output_52126.52126</td>\n",
       "      <td>3-py</td>\n",
       "      <td>11800</td>\n",
       "      <td>51</td>\n",
       "      <td>fstat, brk, getdents64, close, newfstatat, ope...</td>\n",
       "      <td>1034</td>\n",
       "      <td>7</td>\n",
       "      <td>ENOTTY, ENOENT, ESPIPE, ENETUNREACH, EINPROGRE...</td>\n",
       "      <td>9689</td>\n",
       "      <td>16</td>\n",
       "      <td>...</td>\n",
       "      <td>read -&gt; read -&gt; read</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; openat</td>\n",
       "      <td>fstat -&gt; ioctl -&gt; lseek</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; newfstatat -&gt; no-e...</td>\n",
       "      <td>read -&gt; read -&gt; read -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat -&gt; no-error -&gt; n...</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; openat -&gt; no-error...</td>\n",
       "      <td>openat -&gt; fstat -&gt; ioctl -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5345345345345345_strace_output_53409.53409</td>\n",
       "      <td>5345345345345345</td>\n",
       "      <td>10582</td>\n",
       "      <td>48</td>\n",
       "      <td>newfstatat, openat, fstat, write, close, renam...</td>\n",
       "      <td>1002</td>\n",
       "      <td>7</td>\n",
       "      <td>ENOENT, ENOTTY, else, ESPIPE, ENETUNREACH, EIN...</td>\n",
       "      <td>9410</td>\n",
       "      <td>15</td>\n",
       "      <td>...</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; openat</td>\n",
       "      <td>read -&gt; read -&gt; read</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat</td>\n",
       "      <td>close -&gt; newfstatat -&gt; newfstatat</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; newfstatat -&gt; no-e...</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; openat -&gt; no-error...</td>\n",
       "      <td>read -&gt; read -&gt; read -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat -&gt; no-error -&gt; n...</td>\n",
       "      <td>close -&gt; newfstatat -&gt; newfstatat -&gt; no-error ...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>aadhaar_strace_output_199588.199588</td>\n",
       "      <td>aadhaar</td>\n",
       "      <td>2825</td>\n",
       "      <td>45</td>\n",
       "      <td>restart_syscall, read, newfstatat, openat, fst...</td>\n",
       "      <td>206</td>\n",
       "      <td>7</td>\n",
       "      <td>ENOENT, EINPROGRESS, EAGAIN, ENOTTY, ENETUNREA...</td>\n",
       "      <td>2110</td>\n",
       "      <td>13</td>\n",
       "      <td>...</td>\n",
       "      <td>read -&gt; read -&gt; read</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat</td>\n",
       "      <td>ioctl -&gt; ioctl -&gt; ioctl</td>\n",
       "      <td>lseek -&gt; lseek -&gt; lseek</td>\n",
       "      <td>newfstatat -&gt; newfstatat -&gt; newfstatat -&gt; no-e...</td>\n",
       "      <td>read -&gt; read -&gt; read -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>newfstatat -&gt; openat -&gt; fstat -&gt; no-error -&gt; n...</td>\n",
       "      <td>lseek -&gt; lseek -&gt; lseek -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>ioctl -&gt; ioctl -&gt; ioctl -&gt; no-error -&gt; no-fd</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 189 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     Filename      Package_Name  \\\n",
       "0             1337x_strace_output_24584.24584             1337x   \n",
       "1                3m_strace_output_11938.11938                3m   \n",
       "2              3-py_strace_output_52126.52126              3-py   \n",
       "3  5345345345345345_strace_output_53409.53409  5345345345345345   \n",
       "4         aadhaar_strace_output_199588.199588           aadhaar   \n",
       "\n",
       "   Total System Calls  Unique System Calls  \\\n",
       "0               28644                   47   \n",
       "1               65205                   49   \n",
       "2               11800                   51   \n",
       "3               10582                   48   \n",
       "4                2825                   45   \n",
       "\n",
       "                            Unique System Calls List  Total Errors  \\\n",
       "0  newfstatat, openat, fstat, write, close, renam...          3846   \n",
       "1  restart_syscall, read, newfstatat, futex, open...          5970   \n",
       "2  fstat, brk, getdents64, close, newfstatat, ope...          1034   \n",
       "3  newfstatat, openat, fstat, write, close, renam...          1002   \n",
       "4  restart_syscall, read, newfstatat, openat, fst...           206   \n",
       "\n",
       "   Unique Errors                                 Unique Errors List  \\\n",
       "0              8  ENOTTY, ENOENT, else, ESPIPE, ENETUNREACH, EIN...   \n",
       "1              7  ENOENT, EAGAIN, ENOTTY, ENETUNREACH, EINPROGRE...   \n",
       "2              7  ENOTTY, ENOENT, ESPIPE, ENETUNREACH, EINPROGRE...   \n",
       "3              7  ENOENT, ENOTTY, else, ESPIPE, ENETUNREACH, EIN...   \n",
       "4              7  ENOENT, EINPROGRESS, EAGAIN, ENOTTY, ENETUNREA...   \n",
       "\n",
       "   File Operations  Unique File Operations  ...  \\\n",
       "0            25079                      14  ...   \n",
       "1            52667                      12  ...   \n",
       "2             9689                      16  ...   \n",
       "3             9410                      15  ...   \n",
       "4             2110                      13  ...   \n",
       "\n",
       "                            Pattern 2                      Pattern 3  \\\n",
       "0       newfstatat -> openat -> fstat        lseek -> lseek -> lseek   \n",
       "1       newfstatat -> openat -> fstat  geteuid -> chown -> utimensat   \n",
       "2                read -> read -> read  newfstatat -> openat -> fstat   \n",
       "3  newfstatat -> newfstatat -> openat           read -> read -> read   \n",
       "4                read -> read -> read  newfstatat -> openat -> fstat   \n",
       "\n",
       "                            Pattern 4                          Pattern 5  \\\n",
       "0             fstat -> ioctl -> lseek           openat -> fstat -> ioctl   \n",
       "1             fstat -> ioctl -> lseek           openat -> fstat -> ioctl   \n",
       "2  newfstatat -> newfstatat -> openat            fstat -> ioctl -> lseek   \n",
       "3       newfstatat -> openat -> fstat  close -> newfstatat -> newfstatat   \n",
       "4             ioctl -> ioctl -> ioctl            lseek -> lseek -> lseek   \n",
       "\n",
       "                                  Advanced Pattern 1  \\\n",
       "0  newfstatat -> newfstatat -> newfstatat -> no-e...   \n",
       "1  newfstatat -> newfstatat -> newfstatat -> no-e...   \n",
       "2  newfstatat -> newfstatat -> newfstatat -> no-e...   \n",
       "3  newfstatat -> newfstatat -> newfstatat -> no-e...   \n",
       "4  newfstatat -> newfstatat -> newfstatat -> no-e...   \n",
       "\n",
       "                                  Advanced Pattern 2  \\\n",
       "0  newfstatat -> newfstatat -> openat -> no-error...   \n",
       "1  newfstatat -> openat -> fstat -> no-error -> n...   \n",
       "2          read -> read -> read -> no-error -> no-fd   \n",
       "3  newfstatat -> newfstatat -> openat -> no-error...   \n",
       "4          read -> read -> read -> no-error -> no-fd   \n",
       "\n",
       "                                  Advanced Pattern 3  \\\n",
       "0  newfstatat -> openat -> fstat -> no-error -> n...   \n",
       "1  geteuid -> chown -> utimensat -> no-error -> n...   \n",
       "2  newfstatat -> openat -> fstat -> no-error -> n...   \n",
       "3          read -> read -> read -> no-error -> no-fd   \n",
       "4  newfstatat -> openat -> fstat -> no-error -> n...   \n",
       "\n",
       "                                  Advanced Pattern 4  \\\n",
       "0       lseek -> lseek -> lseek -> no-error -> no-fd   \n",
       "1       fstat -> ioctl -> lseek -> no-error -> no-fd   \n",
       "2  newfstatat -> newfstatat -> openat -> no-error...   \n",
       "3  newfstatat -> openat -> fstat -> no-error -> n...   \n",
       "4       lseek -> lseek -> lseek -> no-error -> no-fd   \n",
       "\n",
       "                                  Advanced Pattern 5 Level  \n",
       "0      openat -> fstat -> ioctl -> no-error -> no-fd     0  \n",
       "1      openat -> fstat -> ioctl -> no-error -> no-fd     0  \n",
       "2      openat -> fstat -> ioctl -> no-error -> no-fd     0  \n",
       "3  close -> newfstatat -> newfstatat -> no-error ...     0  \n",
       "4       ioctl -> ioctl -> ioctl -> no-error -> no-fd     0  \n",
       "\n",
       "[5 rows x 189 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Absolute, machine-specific location of the combined dataset CSV.\n",
    "# NOTE(review): consider making this relative to a configurable data directory.\n",
    "file_path = 'D:/Final Version/Step 14 CombinedTraces/CombinedBenignMaliciousDataset.csv'\n",
    "\n",
    "# low_memory=False lets pandas infer each column's dtype from the whole file rather than\n",
    "# chunk by chunk, which avoids the DtypeWarning raised for the mixed-type column 154.\n",
    "df = pd.read_csv(file_path, low_memory=False)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8c456482-4c76-4d54-8d99-894eadad1cd6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\N11894571\\AppData\\Local\\Temp\\ipykernel_34804\\600274108.py:2: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Unknown' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.\n",
      "  df.fillna('Unknown', inplace=True)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Filename</th>\n",
       "      <th>Package_Name</th>\n",
       "      <th>Total System Calls</th>\n",
       "      <th>Unique System Calls</th>\n",
       "      <th>Unique System Calls List</th>\n",
       "      <th>Total Errors</th>\n",
       "      <th>Unique Errors</th>\n",
       "      <th>Unique Errors List</th>\n",
       "      <th>File Operations</th>\n",
       "      <th>Unique File Operations</th>\n",
       "      <th>...</th>\n",
       "      <th>Pattern 2</th>\n",
       "      <th>Pattern 3</th>\n",
       "      <th>Pattern 4</th>\n",
       "      <th>Pattern 5</th>\n",
       "      <th>Advanced Pattern 1</th>\n",
       "      <th>Advanced Pattern 2</th>\n",
       "      <th>Advanced Pattern 3</th>\n",
       "      <th>Advanced Pattern 4</th>\n",
       "      <th>Advanced Pattern 5</th>\n",
       "      <th>Level</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1337x_strace_output_24584.24584</td>\n",
       "      <td>1337x</td>\n",
       "      <td>5.860776e-03</td>\n",
       "      <td>47</td>\n",
       "      <td>3937</td>\n",
       "      <td>0.003173</td>\n",
       "      <td>8</td>\n",
       "      <td>660</td>\n",
       "      <td>0.006743</td>\n",
       "      <td>14</td>\n",
       "      <td>...</td>\n",
       "      <td>32</td>\n",
       "      <td>30</td>\n",
       "      <td>20</td>\n",
       "      <td>58</td>\n",
       "      <td>24</td>\n",
       "      <td>42</td>\n",
       "      <td>49</td>\n",
       "      <td>47</td>\n",
       "      <td>75</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3m_strace_output_11938.11938</td>\n",
       "      <td>3m</td>\n",
       "      <td>1.334169e-02</td>\n",
       "      <td>49</td>\n",
       "      <td>6333</td>\n",
       "      <td>0.004925</td>\n",
       "      <td>7</td>\n",
       "      <td>245</td>\n",
       "      <td>0.014160</td>\n",
       "      <td>12</td>\n",
       "      <td>...</td>\n",
       "      <td>32</td>\n",
       "      <td>25</td>\n",
       "      <td>20</td>\n",
       "      <td>58</td>\n",
       "      <td>24</td>\n",
       "      <td>43</td>\n",
       "      <td>27</td>\n",
       "      <td>31</td>\n",
       "      <td>75</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3-py_strace_output_52126.52126</td>\n",
       "      <td>3-py</td>\n",
       "      <td>2.414248e-03</td>\n",
       "      <td>51</td>\n",
       "      <td>876</td>\n",
       "      <td>0.000853</td>\n",
       "      <td>7</td>\n",
       "      <td>591</td>\n",
       "      <td>0.002605</td>\n",
       "      <td>16</td>\n",
       "      <td>...</td>\n",
       "      <td>46</td>\n",
       "      <td>39</td>\n",
       "      <td>44</td>\n",
       "      <td>25</td>\n",
       "      <td>24</td>\n",
       "      <td>65</td>\n",
       "      <td>49</td>\n",
       "      <td>59</td>\n",
       "      <td>75</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5345345345345345_strace_output_53409.53409</td>\n",
       "      <td>5345345345345345</td>\n",
       "      <td>2.165027e-03</td>\n",
       "      <td>48</td>\n",
       "      <td>3847</td>\n",
       "      <td>0.000827</td>\n",
       "      <td>7</td>\n",
       "      <td>470</td>\n",
       "      <td>0.002530</td>\n",
       "      <td>15</td>\n",
       "      <td>...</td>\n",
       "      <td>31</td>\n",
       "      <td>55</td>\n",
       "      <td>45</td>\n",
       "      <td>14</td>\n",
       "      <td>24</td>\n",
       "      <td>42</td>\n",
       "      <td>82</td>\n",
       "      <td>61</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>aadhaar_strace_output_199588.199588</td>\n",
       "      <td>aadhaar</td>\n",
       "      <td>5.778316e-04</td>\n",
       "      <td>45</td>\n",
       "      <td>6471</td>\n",
       "      <td>0.000170</td>\n",
       "      <td>7</td>\n",
       "      <td>286</td>\n",
       "      <td>0.000567</td>\n",
       "      <td>13</td>\n",
       "      <td>...</td>\n",
       "      <td>46</td>\n",
       "      <td>39</td>\n",
       "      <td>27</td>\n",
       "      <td>44</td>\n",
       "      <td>24</td>\n",
       "      <td>65</td>\n",
       "      <td>49</td>\n",
       "      <td>47</td>\n",
       "      <td>43</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14266</th>\n",
       "      <td>zuppa-0.0.1_strace_output_201586.201586</td>\n",
       "      <td>zuppa-0.0.1</td>\n",
       "      <td>9.678270e-05</td>\n",
       "      <td>19</td>\n",
       "      <td>6876</td>\n",
       "      <td>0.000010</td>\n",
       "      <td>2</td>\n",
       "      <td>307</td>\n",
       "      <td>0.000110</td>\n",
       "      <td>9</td>\n",
       "      <td>...</td>\n",
       "      <td>32</td>\n",
       "      <td>38</td>\n",
       "      <td>47</td>\n",
       "      <td>24</td>\n",
       "      <td>24</td>\n",
       "      <td>43</td>\n",
       "      <td>47</td>\n",
       "      <td>63</td>\n",
       "      <td>30</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14267</th>\n",
       "      <td>zuppa-0.0.2_strace_output_198201.198201</td>\n",
       "      <td>zuppa-0.0.2</td>\n",
       "      <td>4.092292e-07</td>\n",
       "      <td>2</td>\n",
       "      <td>1466</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>721</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14268</th>\n",
       "      <td>zwhrce-0.0.1_strace_output_199124.199124</td>\n",
       "      <td>zwhrce-0.0.1</td>\n",
       "      <td>3.218588e-04</td>\n",
       "      <td>39</td>\n",
       "      <td>5224</td>\n",
       "      <td>0.000079</td>\n",
       "      <td>5</td>\n",
       "      <td>560</td>\n",
       "      <td>0.000327</td>\n",
       "      <td>12</td>\n",
       "      <td>...</td>\n",
       "      <td>46</td>\n",
       "      <td>39</td>\n",
       "      <td>32</td>\n",
       "      <td>35</td>\n",
       "      <td>24</td>\n",
       "      <td>65</td>\n",
       "      <td>49</td>\n",
       "      <td>47</td>\n",
       "      <td>22</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14269</th>\n",
       "      <td>zydnitro-1.0_strace_output_199341.199341</td>\n",
       "      <td>zydnitro-1.0</td>\n",
       "      <td>1.381353e-03</td>\n",
       "      <td>47</td>\n",
       "      <td>2750</td>\n",
       "      <td>0.000489</td>\n",
       "      <td>7</td>\n",
       "      <td>581</td>\n",
       "      <td>0.001563</td>\n",
       "      <td>12</td>\n",
       "      <td>...</td>\n",
       "      <td>32</td>\n",
       "      <td>38</td>\n",
       "      <td>63</td>\n",
       "      <td>14</td>\n",
       "      <td>24</td>\n",
       "      <td>43</td>\n",
       "      <td>47</td>\n",
       "      <td>99</td>\n",
       "      <td>21</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14270</th>\n",
       "      <td>zyqnuutupjerllnbxaeq-0.0.1_strace_output_19956...</td>\n",
       "      <td>zyqnuutupjerllnbxaeq-0.0.1</td>\n",
       "      <td>1.418798e-03</td>\n",
       "      <td>43</td>\n",
       "      <td>2683</td>\n",
       "      <td>0.000499</td>\n",
       "      <td>7</td>\n",
       "      <td>581</td>\n",
       "      <td>0.001619</td>\n",
       "      <td>11</td>\n",
       "      <td>...</td>\n",
       "      <td>32</td>\n",
       "      <td>38</td>\n",
       "      <td>11</td>\n",
       "      <td>73</td>\n",
       "      <td>24</td>\n",
       "      <td>43</td>\n",
       "      <td>47</td>\n",
       "      <td>19</td>\n",
       "      <td>75</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>14271 rows × 189 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Filename  \\\n",
       "0                        1337x_strace_output_24584.24584   \n",
       "1                           3m_strace_output_11938.11938   \n",
       "2                         3-py_strace_output_52126.52126   \n",
       "3             5345345345345345_strace_output_53409.53409   \n",
       "4                    aadhaar_strace_output_199588.199588   \n",
       "...                                                  ...   \n",
       "14266            zuppa-0.0.1_strace_output_201586.201586   \n",
       "14267            zuppa-0.0.2_strace_output_198201.198201   \n",
       "14268           zwhrce-0.0.1_strace_output_199124.199124   \n",
       "14269           zydnitro-1.0_strace_output_199341.199341   \n",
       "14270  zyqnuutupjerllnbxaeq-0.0.1_strace_output_19956...   \n",
       "\n",
       "                     Package_Name  Total System Calls  Unique System Calls  \\\n",
       "0                           1337x        5.860776e-03                   47   \n",
       "1                              3m        1.334169e-02                   49   \n",
       "2                            3-py        2.414248e-03                   51   \n",
       "3                5345345345345345        2.165027e-03                   48   \n",
       "4                         aadhaar        5.778316e-04                   45   \n",
       "...                           ...                 ...                  ...   \n",
       "14266                 zuppa-0.0.1        9.678270e-05                   19   \n",
       "14267                 zuppa-0.0.2        4.092292e-07                    2   \n",
       "14268                zwhrce-0.0.1        3.218588e-04                   39   \n",
       "14269                zydnitro-1.0        1.381353e-03                   47   \n",
       "14270  zyqnuutupjerllnbxaeq-0.0.1        1.418798e-03                   43   \n",
       "\n",
       "       Unique System Calls List  Total Errors  Unique Errors  \\\n",
       "0                          3937      0.003173              8   \n",
       "1                          6333      0.004925              7   \n",
       "2                           876      0.000853              7   \n",
       "3                          3847      0.000827              7   \n",
       "4                          6471      0.000170              7   \n",
       "...                         ...           ...            ...   \n",
       "14266                      6876      0.000010              2   \n",
       "14267                      1466      0.000000              0   \n",
       "14268                      5224      0.000079              5   \n",
       "14269                      2750      0.000489              7   \n",
       "14270                      2683      0.000499              7   \n",
       "\n",
       "       Unique Errors List  File Operations  Unique File Operations  ...  \\\n",
       "0                     660         0.006743                      14  ...   \n",
       "1                     245         0.014160                      12  ...   \n",
       "2                     591         0.002605                      16  ...   \n",
       "3                     470         0.002530                      15  ...   \n",
       "4                     286         0.000567                      13  ...   \n",
       "...                   ...              ...                     ...  ...   \n",
       "14266                 307         0.000110                       9  ...   \n",
       "14267                 721         0.000000                       0  ...   \n",
       "14268                 560         0.000327                      12  ...   \n",
       "14269                 581         0.001563                      12  ...   \n",
       "14270                 581         0.001619                      11  ...   \n",
       "\n",
       "       Pattern 2  Pattern 3  Pattern 4 Pattern 5  Advanced Pattern 1  \\\n",
       "0             32         30         20        58                  24   \n",
       "1             32         25         20        58                  24   \n",
       "2             46         39         44        25                  24   \n",
       "3             31         55         45        14                  24   \n",
       "4             46         39         27        44                  24   \n",
       "...          ...        ...        ...       ...                 ...   \n",
       "14266         32         38         47        24                  24   \n",
       "14267          0          0          0         0                   0   \n",
       "14268         46         39         32        35                  24   \n",
       "14269         32         38         63        14                  24   \n",
       "14270         32         38         11        73                  24   \n",
       "\n",
       "       Advanced Pattern 2 Advanced Pattern 3  Advanced Pattern 4  \\\n",
       "0                      42                 49                  47   \n",
       "1                      43                 27                  31   \n",
       "2                      65                 49                  59   \n",
       "3                      42                 82                  61   \n",
       "4                      65                 49                  47   \n",
       "...                   ...                ...                 ...   \n",
       "14266                  43                 47                  63   \n",
       "14267                   0                  0                   0   \n",
       "14268                  65                 49                  47   \n",
       "14269                  43                 47                  99   \n",
       "14270                  43                 47                  19   \n",
       "\n",
       "       Advanced Pattern 5 Level  \n",
       "0                      75     0  \n",
       "1                      75     0  \n",
       "2                      75     0  \n",
       "3                      21     0  \n",
       "4                      43     0  \n",
       "...                   ...   ...  \n",
       "14266                  30     1  \n",
       "14267                   0     1  \n",
       "14268                  22     1  \n",
       "14269                  21     1  \n",
       "14270                  75     1  \n",
       "\n",
       "[14271 rows x 189 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
    "\n",
    "# Handle missing values - fill missing patterns with 'Unknown'.\n",
    "# Only non-numeric columns receive the placeholder: writing the string 'Unknown' into a\n",
    "# float64 column is deprecated in pandas (FutureWarning, later an error) and would turn a\n",
    "# numeric column into a mixed-type one. Missing numeric values are left as NaN.\n",
    "text_columns = df.select_dtypes(exclude='number').columns\n",
    "df[text_columns] = df[text_columns].fillna('Unknown')\n",
    "\n",
    "# Encoding categorical data - Convert patterns and system calls lists to numerical values\n",
    "# using label encoding for now.\n",
    "# List of columns to encode\n",
    "columns_to_encode = ['Unique System Calls List', 'Unique Errors List', 'Unique File Operations List', \n",
    "                     'Pattern 1', 'Pattern 2', 'Pattern 3', 'Pattern 4', 'Pattern 5',\n",
    "                     'Advanced Pattern 1', 'Advanced Pattern 2', 'Advanced Pattern 3', \n",
    "                     'Advanced Pattern 4', 'Advanced Pattern 5']\n",
    "\n",
    "# Apply label encoding with one fitted encoder per column, kept in a dict so that every\n",
    "# column's integer codes can still be mapped back to the original strings later.\n",
    "# After the loop `label_encoder` refers to the encoder fitted on the last column.\n",
    "label_encoders = {}\n",
    "for column in columns_to_encode:\n",
    "    label_encoder = LabelEncoder()\n",
    "    df[column] = label_encoder.fit_transform(df[column])\n",
    "    label_encoders[column] = label_encoder\n",
    "\n",
    "# Normalize the numerical columns\n",
    "scaler = MinMaxScaler()\n",
    "\n",
    "# List of numerical columns to normalize\n",
    "numerical_columns = ['Total System Calls', 'Total Errors', 'File Operations', 'Memory Operations', \n",
    "                     'Network Operations', 'Process Management Operations', 'I/O Operations', \n",
    "                     'Time Operations', 'Signal Operations', 'IPC Operations', 'Thread Operations', \n",
    "                     'Resource Operations', 'Filesystem Operations', 'Security Operations', \n",
    "                     'Miscellaneous Operations', 'Average System Calls per Line']\n",
    "\n",
    "# Apply MinMax normalization (each listed column is rescaled to the [0, 1] range).\n",
    "# NOTE(review): the scaler is fitted on the full frame; if a train/test split happens later,\n",
    "# fitting on the training portion only would avoid leaking test statistics - confirm.\n",
    "df[numerical_columns] = scaler.fit_transform(df[numerical_columns])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6909fe37-d0d9-4e0d-b881-218d163aab1c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 2ms/step - accuracy: 0.7571 - loss: 5.3129 - val_accuracy: 0.9734 - val_loss: 0.1498\n",
      "Epoch 2/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 1ms/step - accuracy: 0.9578 - loss: 0.2698 - val_accuracy: 0.9198 - val_loss: 0.3925\n",
      "Epoch 3/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 1ms/step - accuracy: 0.9675 - loss: 0.2186 - val_accuracy: 0.9870 - val_loss: 0.1055\n",
      "Epoch 4/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 1ms/step - accuracy: 0.9513 - loss: 0.5763 - val_accuracy: 0.9881 - val_loss: 0.1189\n",
      "Epoch 5/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 1ms/step - accuracy: 0.9568 - loss: 0.4834 - val_accuracy: 0.9681 - val_loss: 0.3061\n",
      "Epoch 6/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9686 - loss: 0.4341 - val_accuracy: 0.9905 - val_loss: 0.1330\n",
      "Epoch 7/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9789 - loss: 0.2333 - val_accuracy: 0.9891 - val_loss: 0.1503\n",
      "Epoch 8/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9782 - loss: 0.2706 - val_accuracy: 0.9905 - val_loss: 0.1255\n",
      "Epoch 9/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9841 - loss: 0.1959 - val_accuracy: 0.9860 - val_loss: 0.1597\n",
      "Epoch 10/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9820 - loss: 0.2469 - val_accuracy: 0.9926 - val_loss: 0.1377\n",
      "Epoch 11/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9849 - loss: 0.2310 - val_accuracy: 0.9706 - val_loss: 0.1887\n",
      "Epoch 12/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9755 - loss: 0.3938 - val_accuracy: 0.9422 - val_loss: 0.4710\n",
      "Epoch 13/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9820 - loss: 0.1788 - val_accuracy: 0.9940 - val_loss: 0.0883\n",
      "Epoch 14/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9862 - loss: 0.1610 - val_accuracy: 0.9898 - val_loss: 0.1179\n",
      "Epoch 15/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9748 - loss: 0.2614 - val_accuracy: 0.9846 - val_loss: 0.1369\n",
      "Epoch 16/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9830 - loss: 0.2181 - val_accuracy: 0.9933 - val_loss: 0.0802\n",
      "Epoch 17/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9797 - loss: 0.2436 - val_accuracy: 0.9930 - val_loss: 0.0866\n",
      "Epoch 18/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9863 - loss: 0.1586 - val_accuracy: 0.9961 - val_loss: 0.0629\n",
      "Epoch 19/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9858 - loss: 0.1161 - val_accuracy: 0.9846 - val_loss: 0.1000\n",
      "Epoch 20/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9843 - loss: 0.1697 - val_accuracy: 0.9776 - val_loss: 0.1279\n",
      "Epoch 21/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9807 - loss: 0.2068 - val_accuracy: 0.9965 - val_loss: 0.0533\n",
      "Epoch 22/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9798 - loss: 0.2386 - val_accuracy: 0.9916 - val_loss: 0.0909\n",
      "Epoch 23/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9882 - loss: 0.1059 - val_accuracy: 0.9958 - val_loss: 0.0502\n",
      "Epoch 24/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9913 - loss: 0.0775 - val_accuracy: 0.9979 - val_loss: 0.0378\n",
      "Epoch 25/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9931 - loss: 0.0559 - val_accuracy: 0.9965 - val_loss: 0.0340\n",
      "Epoch 26/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9882 - loss: 0.0909 - val_accuracy: 0.9912 - val_loss: 0.0404\n",
      "Epoch 27/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9905 - loss: 0.1113 - val_accuracy: 0.9853 - val_loss: 0.1209\n",
      "Epoch 28/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9907 - loss: 0.0642 - val_accuracy: 0.9958 - val_loss: 0.0351\n",
      "Epoch 29/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9934 - loss: 0.0479 - val_accuracy: 0.9930 - val_loss: 0.0602\n",
      "Epoch 30/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9901 - loss: 0.0708 - val_accuracy: 0.9849 - val_loss: 0.0701\n",
      "Epoch 31/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9888 - loss: 0.0642 - val_accuracy: 0.9940 - val_loss: 0.0336\n",
      "Epoch 32/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9926 - loss: 0.0474 - val_accuracy: 0.9755 - val_loss: 0.0745\n",
      "Epoch 33/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9869 - loss: 0.0652 - val_accuracy: 0.9947 - val_loss: 0.0166\n",
      "Epoch 34/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9940 - loss: 0.0350 - val_accuracy: 0.9912 - val_loss: 0.0317\n",
      "Epoch 35/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9916 - loss: 0.0433 - val_accuracy: 0.9905 - val_loss: 0.0498\n",
      "Epoch 36/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9916 - loss: 0.0426 - val_accuracy: 0.9961 - val_loss: 0.0204\n",
      "Epoch 37/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9881 - loss: 0.0483 - val_accuracy: 0.9660 - val_loss: 0.0740\n",
      "Epoch 38/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9935 - loss: 0.0276 - val_accuracy: 0.9961 - val_loss: 0.0164\n",
      "Epoch 39/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9917 - loss: 0.0345 - val_accuracy: 0.9951 - val_loss: 0.0188\n",
      "Epoch 40/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9901 - loss: 0.0434 - val_accuracy: 0.9961 - val_loss: 0.0144\n",
      "Epoch 41/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9915 - loss: 0.0351 - val_accuracy: 0.9954 - val_loss: 0.0175\n",
      "Epoch 42/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9947 - loss: 0.0257 - val_accuracy: 0.9954 - val_loss: 0.0210\n",
      "Epoch 43/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9929 - loss: 0.0332 - val_accuracy: 0.9954 - val_loss: 0.0156\n",
      "Epoch 44/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9910 - loss: 0.0343 - val_accuracy: 0.9965 - val_loss: 0.0152\n",
      "Epoch 45/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9922 - loss: 0.0260 - val_accuracy: 0.9930 - val_loss: 0.0268\n",
      "Epoch 46/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9893 - loss: 0.0373 - val_accuracy: 0.9944 - val_loss: 0.0155\n",
      "Epoch 47/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9926 - loss: 0.0264 - val_accuracy: 0.9930 - val_loss: 0.0200\n",
      "Epoch 48/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9922 - loss: 0.0289 - val_accuracy: 0.9905 - val_loss: 0.0292\n",
      "Epoch 49/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9888 - loss: 0.0418 - val_accuracy: 0.9940 - val_loss: 0.0188\n",
      "Epoch 50/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9936 - loss: 0.0234 - val_accuracy: 0.9881 - val_loss: 0.0314\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from tensorflow.keras.layers import Dense, Input, concatenate\n",
    "from tensorflow.keras.models import Model\n",
    "from tensorflow.keras.utils import set_random_seed\n",
    "\n",
    "RANDOM_SEED = 42\n",
    "\n",
    "# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling are repeatable\n",
    "set_random_seed(RANDOM_SEED)\n",
    "\n",
    "# Step 1: Separate features from the target.\n",
    "# 'Filename' is dropped as a non-feature column; 'Level' is the label and must not stay in X.\n",
    "X = df.drop(columns=['Filename', 'Level'])\n",
    "y = df['Level']\n",
    "\n",
    "# Step 2: Split the data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)\n",
    "\n",
    "# Step 3: Define one input per modality (numerical / categorical)\n",
    "input_numerical = Input(shape=(len(numerical_columns),), name='numerical_input')\n",
    "input_categorical = Input(shape=(len(columns_to_encode),), name='categorical_input')\n",
    "\n",
    "# Step 4: Build a small dense sub-network per modality\n",
    "x_num = Dense(128, activation='relu')(input_numerical)\n",
    "x_num = Dense(64, activation='relu')(x_num)\n",
    "\n",
    "# NOTE(review): the categorical branch receives the raw label codes (unscaled integers in an\n",
    "# arbitrary order); consider scaling them or using embeddings.\n",
    "x_cat = Dense(128, activation='relu')(input_categorical)\n",
    "x_cat = Dense(64, activation='relu')(x_cat)\n",
    "\n",
    "# Step 5: Combine the outputs from the two branches\n",
    "combined = concatenate([x_num, x_cat])\n",
    "\n",
    "# Step 6: Final output layer - a single sigmoid unit for binary classification.\n",
    "# For multi-class targets use Dense(n_classes, activation='softmax') with a categorical loss.\n",
    "output = Dense(1, activation='sigmoid')(combined)\n",
    "\n",
    "# Step 7: Define the final model\n",
    "model = Model(inputs=[input_numerical, input_categorical], outputs=output)\n",
    "\n",
    "# Step 8: Compile the model\n",
    "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "# Step 9: Train the model.\n",
    "# NOTE: the held-out test split doubles as validation data, so val_* are test-set metrics.\n",
    "history = model.fit([X_train[numerical_columns], X_train[columns_to_encode]], y_train,\n",
    "                    validation_data=([X_test[numerical_columns], X_test[columns_to_encode]], y_test),\n",
    "                    epochs=50, batch_size=32)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e03f19f2-6846-4d4d-9457-b9349206cf65",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/50\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\N11894571\\AppData\\Local\\Temp\\ipykernel_34804\\3871140484.py:9: DtypeWarning: Columns (154) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_path)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.7674 - loss: 5.9782 - val_accuracy: 0.8553 - val_loss: 1.2699\n",
      "Epoch 2/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9341 - loss: 0.7590 - val_accuracy: 0.8175 - val_loss: 2.8726\n",
      "Epoch 3/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 1ms/step - accuracy: 0.9355 - loss: 0.8277 - val_accuracy: 0.9520 - val_loss: 0.4478\n",
      "Epoch 4/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 1ms/step - accuracy: 0.9652 - loss: 0.3936 - val_accuracy: 0.9615 - val_loss: 0.4153\n",
      "Epoch 5/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 1ms/step - accuracy: 0.9626 - loss: 0.4721 - val_accuracy: 0.9580 - val_loss: 0.3307\n",
      "Epoch 6/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9523 - loss: 0.7290 - val_accuracy: 0.9891 - val_loss: 0.0679\n",
      "Epoch 7/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9765 - loss: 0.2803 - val_accuracy: 0.9909 - val_loss: 0.0889\n",
      "Epoch 8/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9747 - loss: 0.3561 - val_accuracy: 0.9818 - val_loss: 0.1316\n",
      "Epoch 9/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9729 - loss: 0.3145 - val_accuracy: 0.9898 - val_loss: 0.0580\n",
      "Epoch 10/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9786 - loss: 0.2751 - val_accuracy: 0.9881 - val_loss: 0.0681\n",
      "Epoch 11/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9803 - loss: 0.2147 - val_accuracy: 0.9772 - val_loss: 0.1633\n",
      "Epoch 12/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9801 - loss: 0.2152 - val_accuracy: 0.9842 - val_loss: 0.1460\n",
      "Epoch 13/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9744 - loss: 0.3279 - val_accuracy: 0.9877 - val_loss: 0.0778\n",
      "Epoch 14/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9820 - loss: 0.1724 - val_accuracy: 0.9923 - val_loss: 0.0594\n",
      "Epoch 15/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9822 - loss: 0.2254 - val_accuracy: 0.9933 - val_loss: 0.0292\n",
      "Epoch 16/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9711 - loss: 0.4008 - val_accuracy: 0.9863 - val_loss: 0.0678\n",
      "Epoch 17/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9855 - loss: 0.1149 - val_accuracy: 0.9940 - val_loss: 0.0218\n",
      "Epoch 18/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9867 - loss: 0.1138 - val_accuracy: 0.9832 - val_loss: 0.0816\n",
      "Epoch 19/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9873 - loss: 0.1155 - val_accuracy: 0.9944 - val_loss: 0.0225\n",
      "Epoch 20/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9863 - loss: 0.1202 - val_accuracy: 0.9937 - val_loss: 0.0296\n",
      "Epoch 21/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9807 - loss: 0.1688 - val_accuracy: 0.9681 - val_loss: 0.1488\n",
      "Epoch 22/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9835 - loss: 0.1331 - val_accuracy: 0.9972 - val_loss: 0.0136\n",
      "Epoch 23/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9864 - loss: 0.1276 - val_accuracy: 0.9926 - val_loss: 0.0377\n",
      "Epoch 24/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9875 - loss: 0.0863 - val_accuracy: 0.9923 - val_loss: 0.0314\n",
      "Epoch 25/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9896 - loss: 0.0895 - val_accuracy: 0.9860 - val_loss: 0.0796\n",
      "Epoch 26/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9879 - loss: 0.1164 - val_accuracy: 0.9947 - val_loss: 0.0195\n",
      "Epoch 27/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9923 - loss: 0.0481 - val_accuracy: 0.9863 - val_loss: 0.0487\n",
      "Epoch 28/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9880 - loss: 0.0698 - val_accuracy: 0.9965 - val_loss: 0.0178\n",
      "Epoch 29/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9866 - loss: 0.0894 - val_accuracy: 0.9902 - val_loss: 0.0400\n",
      "Epoch 30/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9919 - loss: 0.0391 - val_accuracy: 0.9919 - val_loss: 0.0361\n",
      "Epoch 31/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9869 - loss: 0.0552 - val_accuracy: 0.9972 - val_loss: 0.0160\n",
      "Epoch 32/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9926 - loss: 0.0344 - val_accuracy: 0.9937 - val_loss: 0.0337\n",
      "Epoch 33/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9926 - loss: 0.0393 - val_accuracy: 0.9937 - val_loss: 0.0239\n",
      "Epoch 34/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9921 - loss: 0.0321 - val_accuracy: 0.9786 - val_loss: 0.0751\n",
      "Epoch 35/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9896 - loss: 0.0423 - val_accuracy: 0.9972 - val_loss: 0.0101\n",
      "Epoch 36/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9930 - loss: 0.0282 - val_accuracy: 0.9870 - val_loss: 0.0391\n",
      "Epoch 37/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9866 - loss: 0.0540 - val_accuracy: 0.9692 - val_loss: 0.0841\n",
      "Epoch 38/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9874 - loss: 0.0747 - val_accuracy: 0.9930 - val_loss: 0.0223\n",
      "Epoch 39/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9902 - loss: 0.0390 - val_accuracy: 0.9975 - val_loss: 0.0085\n",
      "Epoch 40/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9940 - loss: 0.0244 - val_accuracy: 0.9930 - val_loss: 0.0206\n",
      "Epoch 41/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9928 - loss: 0.0289 - val_accuracy: 0.9937 - val_loss: 0.0150\n",
      "Epoch 42/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9940 - loss: 0.0305 - val_accuracy: 0.9972 - val_loss: 0.0094\n",
      "Epoch 43/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9910 - loss: 0.0373 - val_accuracy: 0.9958 - val_loss: 0.0094\n",
      "Epoch 44/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9947 - loss: 0.0251 - val_accuracy: 0.9958 - val_loss: 0.0120\n",
      "Epoch 45/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9942 - loss: 0.0250 - val_accuracy: 0.9972 - val_loss: 0.0078\n",
      "Epoch 46/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9927 - loss: 0.0261 - val_accuracy: 0.9961 - val_loss: 0.0153\n",
      "Epoch 47/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9913 - loss: 0.0317 - val_accuracy: 0.9958 - val_loss: 0.0126\n",
      "Epoch 48/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9931 - loss: 0.0252 - val_accuracy: 0.9947 - val_loss: 0.0182\n",
      "Epoch 49/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9950 - loss: 0.0218 - val_accuracy: 0.9954 - val_loss: 0.0161\n",
      "Epoch 50/50\n",
      "\u001b[1m357/357\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 2ms/step - accuracy: 0.9951 - loss: 0.0215 - val_accuracy: 0.9940 - val_loss: 0.0226\n",
      "\u001b[1m90/90\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 1ms/step - accuracy: 0.9916 - loss: 0.0288\n",
      "Test Accuracy: 99.40%\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
    "from tensorflow.keras.layers import Dense, Input, concatenate\n",
    "from tensorflow.keras.models import Model\n",
    "from tensorflow.keras.utils import set_random_seed\n",
    "\n",
    "RANDOM_SEED = 42\n",
    "\n",
    "# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling are repeatable\n",
    "set_random_seed(RANDOM_SEED)\n",
    "\n",
    "# Load the preprocessed CSV file.\n",
    "# low_memory=False parses each column in one pass, which avoids the chunk-wise type guessing\n",
    "# behind the 'Columns (154) have mixed types' DtypeWarning.\n",
    "file_path = 'D:/Final Version/Step 14 CombinedTraces/CombinedBenignMaliciousDataset.csv'\n",
    "df = pd.read_csv(file_path, low_memory=False)\n",
    "\n",
    "# Split the data into features and target.\n",
    "# 'Filename' is dropped as a non-feature column; 'Level' is the label and must not stay in X.\n",
    "X = df.drop(columns=['Filename', 'Level'])\n",
    "y = df['Level']\n",
    "\n",
    "# Numerical and categorical feature groups (one model input each)\n",
    "numerical_columns = ['Total System Calls', 'Total Errors', 'File Operations', 'Memory Operations',\n",
    "                     'Network Operations', 'Process Management Operations', 'I/O Operations',\n",
    "                     'Time Operations', 'Signal Operations', 'IPC Operations', 'Thread Operations',\n",
    "                     'Resource Operations', 'Filesystem Operations', 'Security Operations',\n",
    "                     'Miscellaneous Operations', 'Average System Calls per Line']\n",
    "\n",
    "categorical_columns = ['Unique System Calls List', 'Unique Errors List', 'Unique File Operations List',\n",
    "                       'Pattern 1', 'Pattern 2', 'Pattern 3', 'Pattern 4', 'Pattern 5',\n",
    "                       'Advanced Pattern 1', 'Advanced Pattern 2', 'Advanced Pattern 3',\n",
    "                       'Advanced Pattern 4', 'Advanced Pattern 5']\n",
    "\n",
    "# Split the data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)\n",
    "\n",
    "# Work on explicit copies so the column assignments below never write into a view of X\n",
    "X_train = X_train.copy()\n",
    "X_test = X_test.copy()\n",
    "\n",
    "# Normalize numerical columns: fit on the training split only, then apply to both splits\n",
    "scaler = MinMaxScaler()\n",
    "X_train_numerical = scaler.fit_transform(X_train[numerical_columns])\n",
    "X_test_numerical = scaler.transform(X_test[numerical_columns])\n",
    "\n",
    "# Label-encode each categorical column.\n",
    "# The encoder is fitted on all rows (train + test) so transform never meets an unseen label;\n",
    "# X already holds exactly those rows, so no concatenated copy of the splits is needed.\n",
    "# One fitted encoder is kept per column so codes can be mapped back with inverse_transform.\n",
    "label_encoders = {}\n",
    "for col in categorical_columns:\n",
    "    label_encoder = LabelEncoder().fit(X[col])\n",
    "    X_train[col] = label_encoder.transform(X_train[col])\n",
    "    X_test[col] = label_encoder.transform(X_test[col])\n",
    "    label_encoders[col] = label_encoder\n",
    "\n",
    "# Define the numerical and categorical inputs\n",
    "input_numerical = Input(shape=(len(numerical_columns),), name='numerical_input')\n",
    "input_categorical = Input(shape=(len(categorical_columns),), name='categorical_input')\n",
    "\n",
    "# Numerical processing branch\n",
    "x_num = Dense(128, activation='relu')(input_numerical)\n",
    "x_num = Dense(64, activation='relu')(x_num)\n",
    "\n",
    "# Categorical processing branch.\n",
    "# NOTE(review): this branch receives the raw label codes (unscaled integers in an arbitrary\n",
    "# order); consider scaling them or using embeddings.\n",
    "x_cat = Dense(128, activation='relu')(input_categorical)\n",
    "x_cat = Dense(64, activation='relu')(x_cat)\n",
    "\n",
    "# Combine both branches\n",
    "combined = concatenate([x_num, x_cat])\n",
    "\n",
    "# Final output layer: a single sigmoid unit for binary classification\n",
    "output = Dense(1, activation='sigmoid')(combined)\n",
    "\n",
    "# Define the final model\n",
    "model = Model(inputs=[input_numerical, input_categorical], outputs=output)\n",
    "\n",
    "# Compile the model\n",
    "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "# Train the model.\n",
    "# NOTE: the held-out test split doubles as validation data, so val_* are test-set metrics.\n",
    "history = model.fit([X_train_numerical, X_train[categorical_columns]], y_train,\n",
    "                    validation_data=([X_test_numerical, X_test[categorical_columns]], y_test),\n",
    "                    epochs=50, batch_size=32)\n",
    "\n",
    "# Evaluate the model on the test set\n",
    "loss, accuracy = model.evaluate([X_test_numerical, X_test[categorical_columns]], y_test)\n",
    "print(f\"Test Accuracy: {accuracy * 100:.2f}%\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "188b803a-42e3-4b7f-a859-a265619136b9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
