{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "70115385",
   "metadata": {},
   "source": [
    "## Common malware families across years 2013 - 2024"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6b87bcde",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 'airpush', 1: 'dianjin', 2: 'dnotua', 3: 'ewind', 4: 'fakeapp', 5: 'plankton', 6: 'smsagent', 7: 'smspay', 8: 'smsreg', 9: 'umpay'}\n"
     ]
    }
   ],
   "source": [
    "# Create a map where the index is the key and the family is the value\n",
    "family_map = {index: family for index, family in enumerate(['airpush', 'dianjin', 'dnotua', 'ewind', 'fakeapp',\n",
    "       'plankton', 'smsagent', 'smspay', 'smsreg', 'umpay'])}\n",
    "\n",
    "# Print the map\n",
    "print(family_map)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84d29d0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Location of the per-year CADE reports; edit here when running on another machine\n",
    "REPORTS_DIR = '/home/ihossain/ISMAIL/CADE/reports'\n",
    "# Years in the 2013-2024 range for which no report exists\n",
    "MISSING_YEARS = {2015}\n",
    "\n",
    "# Build the CSV file paths for the years 2013 to 2024, skipping the missing years\n",
    "csv_files = [\n",
    "    f'{REPORTS_DIR}/drebin_{year}/intermediate/mlp_detect_results_all_m10.0_lambda0.1.csv'\n",
    "    for year in range(2013, 2025) if year not in MISSING_YEARS\n",
    "]\n",
    "\n",
    "# Combine all CSVs into a single DataFrame\n",
    "df_list = [pd.read_csv(file) for file in csv_files]\n",
    "df = pd.concat(df_list, ignore_index=True)\n",
    "\n",
    "# Translate numeric labels to family names (labels missing from family_map become NaN)\n",
    "df['Malware Families'] = df['real_label'].map(family_map)\n",
    "\n",
    "# Map is_drift flag for better legend labels\n",
    "df['DriftStatus'] = df['is_drift'].map({'Y': 'Drift', 'N': 'Non-drift'})\n",
    "\n",
    "# Per-family sample counts; the reindex guarantees both columns exist even when\n",
    "# one drift status never occurs in the data\n",
    "counts = (\n",
    "    df.groupby(['Malware Families', 'DriftStatus']).size()\n",
    "    .unstack(fill_value=0)\n",
    "    .reindex(columns=['Drift', 'Non-drift'], fill_value=0)\n",
    ")\n",
    "\n",
    "# Fix the x-axis order explicitly (order of first appearance, NaN removed) and use it\n",
    "# for both the boxes and the count labels. Enumerating unique() directly would count\n",
    "# a NaN family as an x position, while seaborn drops it, shifting every later label.\n",
    "family_order = df['Malware Families'].dropna().unique().tolist()\n",
    "\n",
    "# Plotting\n",
    "fig, ax = plt.subplots(figsize=(7, 5), dpi=300)\n",
    "sns.boxplot(data=df, x='Malware Families', y='min_distance', hue='DriftStatus',\n",
    "            order=family_order,\n",
    "            palette={'Drift': 'red', 'Non-drift': 'green'},\n",
    "            fliersize=2, linewidth=1.5, ax=ax)\n",
    "\n",
    "# Write the per-family counts just above the largest observed distance\n",
    "# (the 0.06 / 0.09 offsets are in data units of min_distance)\n",
    "y_top = df['min_distance'].max()\n",
    "for x_pos, family in enumerate(family_order):\n",
    "    if family not in counts.index:\n",
    "        continue  # no row of this family has a mapped is_drift value\n",
    "    ax.text(x_pos, y_top + 0.06, str(counts.loc[family, 'Non-drift']),\n",
    "            ha='center', va='top', fontsize=8, color='green')\n",
    "    ax.text(x_pos, y_top + 0.09, str(counts.loc[family, 'Drift']),\n",
    "            ha='center', va='top', fontsize=8, color='red')\n",
    "\n",
    "ax.set_ylabel('Dist. to nearest centroid')\n",
    "ax.tick_params(axis='x', labelrotation=45)\n",
    "ax.legend(title='')\n",
    "fig.tight_layout()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
