{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "13c6fa26",
   "metadata": {},
   "source": [
    "## Get the Lambda Metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb5e457f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.sparse import load_npz\n",
    "import os\n",
    "\n",
    "year_singleton = {}\n",
    "year_families = {}\n",
    "\n",
    "# Label prefixes that mark samples without a usable family name.\n",
    "SINGLETON_PREFIXES = (\"singleton\", \"SINGLETON\")\n",
    "EXCLUDED_PREFIXES = SINGLETON_PREFIXES + (\"UNKNOWN\", \"-\")\n",
    "\n",
    "for year in range(2013, 2026):\n",
    "    data_dir_train = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_train.npz'\n",
    "    data_dir_test = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_test.npz'\n",
    "\n",
    "    # A year is only processed when both the train and the test metadata files exist.\n",
    "    if not (os.path.exists(data_dir_train) and os.path.exists(data_dir_test)):\n",
    "        continue\n",
    "\n",
    "    # np.load returns an open NpzFile; the with-blocks close each archive once the labels are read.\n",
    "    # NOTE: allow_pickle=True can execute arbitrary code while loading -- only use it on trusted files.\n",
    "    with np.load(data_dir_train, allow_pickle=True) as data_train:\n",
    "        train_families = data_train['family']\n",
    "    with np.load(data_dir_test, allow_pickle=True) as data_test:\n",
    "        test_families = data_test['family']\n",
    "\n",
    "    # Only the 'family' labels are needed, so the other metadata arrays are never read or concatenated.\n",
    "    families = np.concatenate((train_families, test_families))\n",
    "\n",
    "    # Keep named families only: drop singleton / UNKNOWN / \"-\" labels and anything containing \"benign\".\n",
    "    filtered_families = [\n",
    "        family for family in families\n",
    "        if not family.startswith(EXCLUDED_PREFIXES) and \"benign\" not in family\n",
    "    ]\n",
    "\n",
    "    # Number of samples whose label starts with \"singleton\" or \"SINGLETON\" (always a plain int).\n",
    "    year_singleton[year] = sum(family.startswith(SINGLETON_PREFIXES) for family in families)\n",
    "\n",
    "    # Distinct remaining family names for the year.\n",
    "    year_families[year] = np.unique(filtered_families)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4091403",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report the number of distinct families per year, then the sum over all years\n",
    "family_counts = {year: len(names) for year, names in year_families.items()}\n",
    "for year, count in family_counts.items():\n",
    "    print(f'year: {year}, unique families: {count}')\n",
    "\n",
    "total_families = sum(family_counts.values())\n",
    "print(\"total families: \", total_families)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8476ea01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Intersect the family sets of the first 11 entries of year_families (in insertion order)\n",
    "all_families = list(year_families.values())[:11]\n",
    "family_sets = [set(yearly_names) for yearly_names in all_families]\n",
    "common_families_all_years = set.intersection(*family_sets)\n",
    "\n",
    "print(f\"Number of common families across all years: {len(common_families_all_years)}\")\n",
    "print(f\"Common families: {common_families_all_years}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1587b04",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Families kept for the analysis: the valid names from the intersection above, excluding 'UNKNOWN'\n",
    "target_families = [\n",
    "    'airpush', 'dianjin', 'dnotua', 'ewind', 'fakeapp',\n",
    "    'plankton', 'smsagent', 'smspay', 'smsreg', 'umpay',\n",
    "]\n",
    "# Position of each family within target_families\n",
    "family_to_index = dict(zip(target_families, range(len(target_families))))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e385efdf",
   "metadata": {},
   "source": [
    "## Get the Lambda Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2daa4555",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.sparse import load_npz\n",
    "import os\n",
    "\n",
    "# family -> year -> row positions of that family's samples in the year's train+test label array\n",
    "family_year_indices = {}\n",
    "\n",
    "for year in range(2013, 2025):\n",
    "    data_dir_train = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_train.npz'\n",
    "    data_dir_test = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_meta_test.npz'\n",
    "\n",
    "    # A year is only processed when both the train and the test metadata files exist.\n",
    "    if not (os.path.exists(data_dir_train) and os.path.exists(data_dir_test)):\n",
    "        continue\n",
    "\n",
    "    # np.load returns an open NpzFile; the with-blocks close each archive once the labels are read.\n",
    "    # NOTE: allow_pickle=True can execute arbitrary code while loading -- only use it on trusted files.\n",
    "    with np.load(data_dir_train, allow_pickle=True) as data_train:\n",
    "        train_families = data_train['family']\n",
    "    with np.load(data_dir_test, allow_pickle=True) as data_test:\n",
    "        test_families = data_test['family']\n",
    "\n",
    "    # Train labels come first, then test labels; only 'family' is read, the other keys are not needed.\n",
    "    families = np.concatenate((train_families, test_families), axis=0)\n",
    "\n",
    "    # Record the position of every sample that belongs to one of the target families.\n",
    "    for i, family in enumerate(families):\n",
    "        if family in target_families:\n",
    "            family_year_indices.setdefault(family, {}).setdefault(year, []).append(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e8b81ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# family -> list with one feature array per available year, in ascending year order\n",
    "family_features = {family: [] for family in target_families}\n",
    "\n",
    "for year in range(2013, 2025):\n",
    "    train_data_dir = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_X_train.npz'\n",
    "    test_data_dir = f'/home/shared-datasets/Feature_extraction/npz_yearwise_Final/{year}_X_test.npz'\n",
    "    if not (os.path.exists(train_data_dir) and os.path.exists(test_data_dir)):\n",
    "        continue\n",
    "\n",
    "    # Load and densify the year's matrices once and reuse them for every family,\n",
    "    # instead of re-reading the same two files once per family.\n",
    "    train_data_X = load_npz(train_data_dir).toarray()\n",
    "    test_data_X = load_npz(test_data_dir).toarray()\n",
    "    # Train rows first, then test rows -- the same order used when the indices were collected.\n",
    "    data_X = np.concatenate((train_data_X, test_data_X), axis=0)\n",
    "\n",
    "    for family in target_families:\n",
    "        # A family without samples in this year contributes an empty (0, n_features) block instead of\n",
    "        # raising KeyError, so position k of every family's list refers to the same year.\n",
    "        indices = family_year_indices.get(family, {}).get(year, [])\n",
    "        family_features[family].append(data_X[np.asarray(indices, dtype=np.intp)])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10a11a3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show how many feature arrays were collected for each family\n",
    "for family, collected_arrays in family_features.items():\n",
    "    print(f'family {family} {len(collected_arrays)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8291517",
   "metadata": {},
   "source": [
    "## Calculate Jaccard similarity score (Stability score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5717ced",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_jaccard_similarity(set1, set2):\n",
    "    \"\"\"\n",
    "    Return the Jaccard similarity of two sets: shared elements divided by all distinct elements.\n",
    "\n",
    "    Returns 0 when the union is empty (both inputs empty).\n",
    "    \"\"\"\n",
    "    combined_size = len(set1.union(set2))\n",
    "    if combined_size == 0:\n",
    "        return 0\n",
    "    shared_size = len(set1.intersection(set2))\n",
    "    return shared_size / combined_size\n",
    "\n",
    "# family -> list of Jaccard scores between each pair of consecutive feature groups\n",
    "jaccard_results = {}\n",
    "\n",
    "for family, feature_groups in family_features.items():\n",
    "    jaccard_scores = []\n",
    "    previous_rows = None\n",
    "    for group in feature_groups:\n",
    "        # Convert each group's rows to a set of tuples exactly once and carry it over to the next\n",
    "        # comparison; rebuilding both sets for every pair would convert each interior group twice.\n",
    "        current_rows = set(map(tuple, group))\n",
    "        if previous_rows is not None:\n",
    "            jaccard_scores.append(calculate_jaccard_similarity(previous_rows, current_rows))\n",
    "        previous_rows = current_rows\n",
    "\n",
    "    # A family with fewer than two groups gets an empty score list.\n",
    "    jaccard_results[family] = jaccard_scores\n",
    "\n",
    "# Display the Jaccard similarity results\n",
    "for family, scores in jaccard_results.items():\n",
    "    print(f\"Jaccard similarity for family '{family}': {scores}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17f15ade",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number the families in the order they appear in jaccard_results: index -> family name\n",
    "jaccard_key_map = dict(enumerate(jaccard_results))\n",
    "\n",
    "# Show the resulting mapping\n",
    "print(jaccard_key_map)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78c42d06",
   "metadata": {},
   "source": [
    "## Plot the Stability Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5778f85",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Draw one line per family: x is the position of the consecutive-group pair, y is its Jaccard score\n",
    "plt.figure(figsize=(7, 5), dpi=500)\n",
    "\n",
    "markers = ['o', 's', 'D', '^', 'v', 'P', '*', 'X']  # Cycled when there are more families than markers\n",
    "for idx, family in jaccard_key_map.items():\n",
    "    y_values = jaccard_results[family]\n",
    "    # Derive the x positions from the actual number of scores: a fixed range(10) makes plt.plot raise\n",
    "    # ValueError whenever a family does not have exactly 10 scores.\n",
    "    x_values = list(range(len(y_values)))\n",
    "    plt.plot(x_values, y_values, marker=markers[idx % len(markers)], linestyle='-', label=f'{family}', linewidth=2)\n",
    "\n",
    "# Add labels, title, and legend\n",
    "# plt.xlabel(\"Group Pair\", fontsize=12)\n",
    "# plt.ylabel(\"Stability score\", fontsize=12)\n",
    "# plt.title(\"Jaccard Similarity Scores Between Consecutive Groups\", fontsize=14)\n",
    "plt.xticks(rotation=45)\n",
    "# plt.grid(True)\n",
    "plt.legend()\n",
    "\n",
    "# Show the plot\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
