{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6b90590a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ce1148ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.sparse import load_npz\n",
    "\n",
    "# Load VirusTotal data\n",
    "vt_df = pd.read_csv(\"/home/shared-datasets/Feature_extraction/all_hash_vtdetect_added.csv\")\n",
    "vt_df['vt_detection'] = pd.to_numeric(vt_df['vt_detection'], errors='coerce')\n",
    "vt_df['sha256'] = vt_df['sha256'].str.lower()\n",
    "\n",
    "# Directory containing meta files\n",
    "data_dir = \"/home/shared-datasets/Feature_extraction/npz_yearwise_Final_rerun_0.001\"\n",
    "thresholds = range(0, 100)\n",
    "results = {t: {'benign': 0, 'malware': 0} for t in thresholds}\n",
    "\n",
    "vt_feat_index = {t: {0: [], 1: []} for t in thresholds}\n",
    "vt_features = {t: {0: [], 1: []} for t in thresholds}\n",
    "\n",
    "# Aggregate counts across all years except 2015\n",
    "for year in range(2013, 2026):\n",
    "    if year == 2015:\n",
    "        continue\n",
    "\n",
    "    meta_file1 = os.path.join(data_dir, f\"{year}_meta_train.npz\")\n",
    "    meta_file2 = os.path.join(data_dir, f\"{year}_meta_test.npz\")\n",
    "    if not os.path.exists(meta_file1):\n",
    "        continue\n",
    "\n",
    "    meta1 = np.load(meta_file1, allow_pickle=True)\n",
    "    meta2 = np.load(meta_file2, allow_pickle=True)\n",
    "    # Concatenate meta1 and meta2 hashes and families\n",
    "    # Concatenate all fields from meta1 and meta2 into a single DataFrame\n",
    "    meta1_df = pd.DataFrame({k: meta1[k] for k in meta1.files})\n",
    "    meta2_df = pd.DataFrame({k: meta2[k] for k in meta2.files})\n",
    "    full_meta_df = pd.concat([meta1_df, meta2_df], axis=0, ignore_index=True)\n",
    "    # print(full_meta_df.shape)\n",
    "    # # Rename 'hash' column to 'sha256' if present\n",
    "    # if 'hash' in full_meta_df.columns:\n",
    "    #     full_meta_df = full_meta_df.rename(columns={'hash': 'sha256'})\n",
    "\n",
    "    # merged_df = pd.merge(full_meta_df, vt_df, on='sha256', how='inner')\n",
    "\n",
    "    benign_indices = {t: [] for t in thresholds}\n",
    "    malware_indices = {t: [] for t in thresholds}\n",
    "\n",
    "    for idx, row in full_meta_df.iterrows():\n",
    "        y_val = int(row['y'])\n",
    "        vt_detection = int(row['vt_count'])\n",
    "        # if vt_detection > 11.0:\n",
    "        #     continue\n",
    "        \n",
    "        if y_val == 0:\n",
    "            benign_indices[vt_detection].append(idx)\n",
    "        elif y_val == 1:\n",
    "            malware_indices[vt_detection].append(idx)\n",
    "        # print(f\"Processing {year} {idx} {y_val} {vt_detection}\")\n",
    "        # vt_feat_index[vt_detection][y_val].append(idx)\n",
    "\n",
    "    train_data_dir = f'{data_dir}/{year}_X_train.npz'\n",
    "    test_data_dir = f'{data_dir}/{year}_X_test.npz'\n",
    "    if os.path.exists(train_data_dir) and os.path.exists(test_data_dir):\n",
    "        train_data_X = load_npz(train_data_dir).toarray()\n",
    "        test_data_X = load_npz(test_data_dir).toarray()\n",
    "        data_X = np.concatenate((train_data_X, test_data_X), axis=0)\n",
    "        # print(len(data_X))\n",
    "        # print(len(benign_indices[vt_detection]))\n",
    "        for vts in thresholds:\n",
    "            \n",
    "            if len(benign_indices[vts]) > 0:\n",
    "                vt_features[vts][0].extend(data_X[benign_indices[vts]])\n",
    "                # print(len(benign_indices[vts]))\n",
    "            if len(malware_indices[vts]) > 0:\n",
    "                vt_features[vts][1].extend(data_X[malware_indices[vts]])\n",
    "                # print(len(malware_indices[vts]))\n",
    "            \n",
    "            # Save the merged DataFrame to a CSV file\n",
    "            # merged_df.to_csv(f\"merged_df_{year}_{vts}.csv\", index=False)\n",
    "\n",
    "    # if 'merged_df_all' in locals():\n",
    "    #     merged_df_all = pd.concat([merged_df_all, merged_df], ignore_index=True, axis=0)\n",
    "    # else:\n",
    "    #     merged_df_all = merged_df.copy()\n",
    "    # print(merged_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0db8dc54",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing 0 638475 0\n",
      "Processing 1 0 0\n",
      "Processing 2 0 0\n",
      "Processing 3 0 0\n",
      "Processing 4 0 44979\n",
      "Processing 5 0 43103\n",
      "Processing 6 0 40134\n",
      "Processing 7 0 35046\n",
      "Processing 8 0 28937\n",
      "Processing 9 0 22331\n",
      "Processing 10 0 17335\n",
      "Processing 11 0 14258\n",
      "Processing 12 0 12433\n",
      "Processing 13 0 12876\n",
      "Processing 14 0 12242\n",
      "Processing 15 0 10271\n",
      "Processing 16 0 8813\n",
      "Processing 17 0 7854\n",
      "Processing 18 0 6717\n",
      "Processing 19 0 6143\n",
      "Processing 20 0 5557\n",
      "Processing 21 0 4978\n",
      "Processing 22 0 4409\n",
      "Processing 23 0 3736\n",
      "Processing 24 0 3275\n",
      "Processing 25 0 2835\n",
      "Processing 26 0 2646\n",
      "Processing 27 0 2562\n",
      "Processing 28 0 2089\n",
      "Processing 29 0 2013\n",
      "Processing 30 0 1827\n",
      "Processing 31 0 1786\n",
      "Processing 32 0 1462\n",
      "Processing 33 0 1254\n",
      "Processing 34 0 1060\n",
      "Processing 35 0 897\n",
      "Processing 36 0 856\n",
      "Processing 37 0 762\n",
      "Processing 38 0 691\n",
      "Processing 39 0 562\n",
      "Processing 40 0 378\n",
      "Processing 41 0 316\n",
      "Processing 42 0 182\n",
      "Processing 43 0 120\n",
      "Processing 44 0 64\n",
      "Processing 45 0 39\n",
      "Processing 46 0 30\n",
      "Processing 47 0 20\n",
      "Processing 48 0 6\n",
      "Processing 49 0 9\n",
      "Processing 50 0 4\n",
      "Processing 51 0 5\n",
      "Processing 52 0 2\n",
      "Processing 53 0 1\n",
      "Processing 54 0 0\n",
      "Processing 55 0 1\n",
      "Processing 56 0 0\n",
      "Processing 57 0 0\n",
      "Processing 58 0 0\n",
      "Processing 59 0 0\n",
      "Processing 60 0 0\n",
      "Processing 61 0 0\n",
      "Processing 62 0 0\n",
      "Processing 63 0 0\n",
      "Processing 64 0 0\n",
      "Processing 65 0 0\n",
      "Processing 66 0 0\n",
      "Processing 67 0 0\n",
      "Processing 68 0 0\n",
      "Processing 69 0 0\n",
      "Processing 70 0 0\n",
      "Processing 71 0 0\n",
      "Processing 72 0 0\n",
      "Processing 73 0 0\n",
      "Processing 74 0 0\n",
      "Processing 75 0 0\n",
      "Processing 76 0 0\n",
      "Processing 77 0 0\n",
      "Processing 78 0 0\n",
      "Processing 79 0 0\n",
      "Processing 80 0 0\n",
      "Processing 81 0 0\n",
      "Processing 82 0 0\n",
      "Processing 83 0 0\n",
      "Processing 84 0 0\n",
      "Processing 85 0 0\n",
      "Processing 86 0 0\n",
      "Processing 87 0 0\n",
      "Processing 88 0 0\n",
      "Processing 89 0 0\n",
      "Processing 90 0 0\n",
      "Processing 91 0 0\n",
      "Processing 92 0 0\n",
      "Processing 93 0 0\n",
      "Processing 94 0 0\n",
      "Processing 95 0 0\n",
      "Processing 96 0 0\n",
      "Processing 97 0 0\n",
      "Processing 98 0 0\n",
      "Processing 99 0 0\n"
     ]
    }
   ],
   "source": [
    "for vts in vt_features.keys():\n",
    "    print(f\"Processing {vts} {len(vt_features[vts][0])} {len(vt_features[vts][1])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "70df5e34",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.manifold import TSNE\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Global font settings\n",
    "plt.rcParams.update({\n",
    "    \"font.size\": 14,\n",
    "    \"font.weight\": \"bold\",\n",
    "    \"axes.labelweight\": \"bold\",\n",
    "    \"axes.titlesize\": 16,\n",
    "    \"axes.titleweight\": \"bold\",\n",
    "    \"xtick.labelsize\": 13,\n",
    "    \"ytick.labelsize\": 13,\n",
    "    \"legend.fontsize\": 13,\n",
    "    \"legend.frameon\": False\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9007a11a",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "fig, axes = plt.subplots(2, 5, figsize=(25, 10))\n",
    "axes = axes.flatten()\n",
    "benign_feats = np.array(vt_features[0][0])\n",
    "for i, vts in enumerate(range(4, 14)):\n",
    "    malware_feats = np.array(vt_features[vts][1])\n",
    "    # Only plot if both classes have samples\n",
    "    if len(benign_feats) > 0 and len(malware_feats) > 0:\n",
    "        # Sample up to 500 from each class for visualization\n",
    "        n_benign = len(benign_feats) #min(10000, len(benign_feats))\n",
    "        n_malware = len(malware_feats)#min(10000, len(malware_feats))\n",
    "        X = np.vstack([benign_feats[:n_benign], malware_feats[:n_malware]])\n",
    "        y = np.array([0]*n_benign + [1]*n_malware)\n",
    "        tsne = TSNE(n_components=2, random_state=42, perplexity=30)\n",
    "        X_embedded = tsne.fit_transform(X)\n",
    "        ax = axes[i]\n",
    "        ax.scatter(X_embedded[y==0, 0], X_embedded[y==0, 1], label='Benign', alpha=0.5, s=10)\n",
    "        ax.scatter(X_embedded[y==1, 0], X_embedded[y==1, 1], label='Malware', alpha=0.5, s=10)\n",
    "        ax.set_title(f'vts={vts}')\n",
    "        ax.legend()\n",
    "    else:\n",
    "        axes[i].set_title(f'vts={vts} (insufficient data)')\n",
    "        axes[i].axis('off')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
