{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Dependencies\n",
    "\"\"\"\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import numpy as np\n",
    "import utils\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Data path and global information\n",
    "\n",
    "Expected directory structure:\n",
    "    data/\n",
    "    ├── Koh\n",
    "    │   ├── koh.h5ad\n",
    "    ├── Kumar\n",
    "    │   ├── kumar.h5ad\n",
    "    ├── simkumar\n",
    "    │   ├── simkumar4easy.h5ad\n",
    "    │   ├── simkumar4hard.h5ad\n",
    "    │   ├── simkumar8hard.h5ad\n",
    "    ├── Trapnell\n",
    "    │   ├── trapnell.h5ad\n",
    "    ├── zheng\n",
    "    │   ├── zhengmix4eq.h5ad\n",
    "    │   ├── zhengmix4uneq.h5ad\n",
    "    │   ├── zhengmix8eq.h5ad\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "# dspath, dsname and dsname_abrev are parallel lists: entry i of each one\n",
    "# describes the same dataset, and the other cells index them with a shared i.\n",
    "\n",
    "# Path of every dataset file\n",
    "dspath = [\n",
    "    'data/Koh/koh.h5ad',\n",
    "    'data/Kumar/kumar.h5ad',\n",
    "    'data/simkumar/simkumar4easy.h5ad',\n",
    "    'data/simkumar/simkumar4hard.h5ad',\n",
    "    'data/simkumar/simkumar8hard.h5ad',\n",
    "    'data/Trapnell/trapnell.h5ad',\n",
    "    'data/zheng/zhengmix4eq.h5ad',\n",
    "    'data/zheng/zhengmix4uneq.h5ad',\n",
    "    'data/zheng/zhengmix8eq.h5ad',\n",
    "]\n",
    "\n",
    "# Full dataset names (printed while the experiments run)\n",
    "dsname = [\n",
    "    'Koh',\n",
    "    'Kumar',\n",
    "    'simkumar4easy',\n",
    "    'simkumar4hard',\n",
    "    'simkumar8hard',\n",
    "    'Trapnell',\n",
    "    'Zheng4eq',\n",
    "    'Zheng4uneq',\n",
    "    'Zheng8eq',\n",
    "]\n",
    "\n",
    "# Abbreviated dataset names (handed to the plotting helper)\n",
    "dsname_abrev = [\n",
    "    'Koh',\n",
    "    'Kumar',\n",
    "    'sk4e',\n",
    "    'sk4h',\n",
    "    'sk8h',\n",
    "    'Trap',\n",
    "    'Zh4e',\n",
    "    'Zh4u',\n",
    "    'Zh8e',\n",
    "]\n",
    "\n",
    "# Outlier removal rates to be tested\n",
    "removal_rates = [0.05, 0.1]\n",
    "\n",
    "# Outlier removal algorithms to be tested\n",
    "algos = [\n",
    "    'compression',\n",
    "    'lof',\n",
    "    'lof_pca',\n",
    "    'knn',\n",
    "    'knn_pca',\n",
    "    'isolation_forest',\n",
    "    'isolation_forest_pca',\n",
    "    'ecod',\n",
    "    'ecod_pca',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Calculate and save compression matrices\n",
    "\n",
    "For each dataset two matrices are written next to its .h5ad file:\n",
    "    PREFIX_C_cs-1.npy  (PCA dim = k - 1)\n",
    "    PREFIX_C_2cs.npy   (PCA dim = 2k)\n",
    "where k = len(cs) and PREFIX is the dataset path without '.h5ad'.\n",
    "\"\"\"\n",
    "\n",
    "for i in range(len(dsname)):\n",
    "    data, cs, labels = utils.initiate(dsname, dspath, i)\n",
    "    k = len(cs)\n",
    "\n",
    "    # Drop the 5-character '.h5ad' extension to get the per-dataset file prefix\n",
    "    prefix = dspath[i][:-5]\n",
    "\n",
    "    # PCA dim = k - 1\n",
    "    C = utils.get_compression_matrix(data, k - 1, True)\n",
    "    np.save(prefix + '_C_cs-1.npy', C)\n",
    "\n",
    "    # PCA dim = 2k\n",
    "    C_2k = utils.get_compression_matrix(data, 2 * k, True)\n",
    "    # The '.npy' suffix is spelled out so the name matches the np.load call that\n",
    "    # reads it back; np.save used to append the suffix implicitly, so the file\n",
    "    # written on disk is unchanged.\n",
    "    np.save(prefix + '_C_2cs.npy', C_2k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Core compressibility results\n",
    "\n",
    "Prints, per dataset, the inter- and intra-cluster compression ratios computed\n",
    "from the saved '_C_cs-1.npy' compression matrix, together with their means.\n",
    "\"\"\"\n",
    "\n",
    "for i, ds_label in enumerate(dsname):\n",
    "    data, cs, labels = utils.initiate(dsname, dspath, i)\n",
    "    k = len(cs)\n",
    "    C = np.load(dspath[i][:-5] + '_C_cs-1.npy')\n",
    "\n",
    "    # Cluster-wise average inter- and intra-cluster compression ratios\n",
    "    avg_inter_ratio, avg_intra_ratio = utils.get_average_compression(C, cs, k)\n",
    "\n",
    "    print('Dataset: ', ds_label)\n",
    "    # (heading for the ratios, heading for their mean, ratios), inter first\n",
    "    report_rows = (\n",
    "        ('Average inter-cluster compression ratios: ', 'Average inter-cluster compresion: ', avg_inter_ratio),\n",
    "        ('Average intra-cluster compression ratios: ', 'Average intra-cluster compresion: ', avg_intra_ratio),\n",
    "    )\n",
    "    for ratios_heading, mean_heading, ratios in report_rows:\n",
    "        print(ratios_heading, ratios)\n",
    "        print(mean_heading, np.mean(ratios))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Core compressibility results (dim = 2k)\n",
    "\n",
    "Prints, per dataset, the inter- and intra-cluster compression ratios computed\n",
    "from the saved '_C_2cs.npy' compression matrix, together with their means.\n",
    "\"\"\"\n",
    "\n",
    "for i, ds_label in enumerate(dsname):\n",
    "    data, cs, labels = utils.initiate(dsname, dspath, i)\n",
    "    k = len(cs)\n",
    "    C = np.load(dspath[i][:-5] + '_C_2cs.npy')\n",
    "\n",
    "    # Cluster-wise average inter- and intra-cluster compression ratios\n",
    "    avg_inter_ratio, avg_intra_ratio = utils.get_average_compression(C, cs, k)\n",
    "\n",
    "    print('Dataset: ', ds_label)\n",
    "    # (heading for the ratios, heading for their mean, ratios), inter first\n",
    "    report_rows = (\n",
    "        ('Average inter-cluster compression ratios: ', 'Average inter-cluster compresion: ', avg_inter_ratio),\n",
    "        ('Average intra-cluster compression ratios: ', 'Average intra-cluster compresion: ', avg_intra_ratio),\n",
    "    )\n",
    "    for ratios_heading, mean_heading, ratios in report_rows:\n",
    "        print(ratios_heading, ratios)\n",
    "        print(mean_heading, np.mean(ratios))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "NMI and Purity baselines for PCA dim = k - 1\n",
    "\n",
    "baseline_nmi_k1[i] and baseline_purity_k1[i] hold the mean score of repeated\n",
    "utils.kmeans_nmi_purity runs on dataset i after PCA to k - 1 components.\n",
    "\"\"\"\n",
    "\n",
    "baseline_nmi_k1 = []\n",
    "baseline_purity_k1 = []\n",
    "\n",
    "# Number of repeated runs averaged into each baseline\n",
    "n_baseline_runs = 50\n",
    "\n",
    "for i in range(len(dsname)):\n",
    "    data, cs, labels = utils.initiate(dsname, dspath, i)\n",
    "    k = len(cs)\n",
    "\n",
    "    data_pca = PCA(n_components=k-1, random_state=1).fit_transform(data)\n",
    "    nmi = []\n",
    "    purity = []\n",
    "\n",
    "    # Compute baselines as the mean of the repeated runs. The run counter has\n",
    "    # its own name so it does not overwrite the dataset index i.\n",
    "    for run in range(n_baseline_runs):\n",
    "        nmi_res, purity_res = utils.kmeans_nmi_purity(data_pca, k, labels)\n",
    "        nmi.append(nmi_res)\n",
    "        purity.append(purity_res)\n",
    "\n",
    "    baseline_nmi_k1.append(np.mean(nmi))\n",
    "    baseline_purity_k1.append(np.mean(purity))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "NMI and Purity baselines for PCA dim = 2k\n",
    "\n",
    "baseline_nmi_2k[i] and baseline_purity_2k[i] hold the mean score of repeated\n",
    "utils.kmeans_nmi_purity runs on dataset i after PCA to 2k components.\n",
    "\"\"\"\n",
    "\n",
    "baseline_nmi_2k = []\n",
    "baseline_purity_2k = []\n",
    "\n",
    "# Number of repeated runs averaged into each baseline\n",
    "n_baseline_runs = 50\n",
    "\n",
    "for i in range(len(dsname)):\n",
    "    data, cs, labels = utils.initiate(dsname, dspath, i)\n",
    "    k = len(cs)\n",
    "\n",
    "    data_pca = PCA(n_components=k * 2, random_state=1).fit_transform(data)\n",
    "    nmi = []\n",
    "    purity = []\n",
    "\n",
    "    # Compute baselines as the mean of the repeated runs. The run counter has\n",
    "    # its own name so it does not overwrite the dataset index i.\n",
    "    for run in range(n_baseline_runs):\n",
    "        nmi_res, purity_res = utils.kmeans_nmi_purity(data_pca, k, labels)\n",
    "        nmi.append(nmi_res)\n",
    "        purity.append(purity_res)\n",
    "\n",
    "    baseline_nmi_2k.append(np.mean(nmi))\n",
    "    baseline_purity_2k.append(np.mean(purity))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_dataset(data, cs, labels, dimension, algo, C):\n",
    "    \"\"\"\n",
    "    Run PCA + Kmeans with outlier removal once per rate in removal_rates.\n",
    "\n",
    "    Calls utils.remove_pca_kmeans for every entry of the notebook-level\n",
    "    removal_rates list and prints the mean NMI and mean purity per rate.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data, cs, labels : forwarded unchanged to utils.remove_pca_kmeans\n",
    "    dimension : PCA dimension forwarded to utils.remove_pca_kmeans\n",
    "        (the callers in this notebook pass k - 1 or 2k)\n",
    "    algo : outlier removal algorithm name (callers pass an entry of algos)\n",
    "    C : compression matrix loaded by the caller\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (nmi_rem, purity_rem) : two lists with one entry per removal rate, in the\n",
    "        order of removal_rates. Each entry is whatever\n",
    "        utils.remove_pca_kmeans returned for that rate.\n",
    "        NOTE(review): presumably the scores of several repeated runs -\n",
    "        confirm against utils.\n",
    "    \"\"\"\n",
    "    nmi_rem, purity_rem = [], []\n",
    "\n",
    "    # For each removal rate, compute the performance of PCA Kmeans with removal\n",
    "    for removal in removal_rates:\n",
    "        nmi_res, purity_res = utils.remove_pca_kmeans(data, cs, labels, dimension, removal, C, algo)\n",
    "        nmi_rem.append(nmi_res)\n",
    "        purity_rem.append(purity_res)\n",
    "\n",
    "    # Per-rate means are only used for the progress print; the raw per-rate\n",
    "    # results are what gets returned.\n",
    "    nmi_mean = [np.mean(scores) for scores in nmi_rem]\n",
    "    purity_mean = [np.mean(scores) for scores in purity_rem]\n",
    "    print('PCA + Kmeans with removal (mean): ', nmi_mean, purity_mean)\n",
    "\n",
    "    return nmi_rem, purity_rem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "NMI and Purity improvement for PCA dim = k - 1\n",
    "\"\"\"\n",
    "\n",
    "# results_k1[j] belongs to algos[j]. Index 0 is the NMI improvement and index 1\n",
    "# the Purity improvement; each maps a removal rate to a tuple that receives one\n",
    "# entry per dataset. The container is built from algos, so it always has\n",
    "# exactly one slot per algorithm.\n",
    "results_k1 = [\n",
    "    [\n",
    "        {'5% removal': tuple(), '10% removal': tuple()},  # NMI improvement\n",
    "        {'5% removal': tuple(), '10% removal': tuple()},  # Purity improvement\n",
    "    ]\n",
    "    for algo in algos\n",
    "]\n",
    "\n",
    "for j in range(len(algos)):\n",
    "    nmi_gain, purity_gain = results_k1[j]\n",
    "\n",
    "    for i in range(len(dspath)):\n",
    "        data, cs, labels = utils.initiate(dsname, dspath, i)\n",
    "        k = len(cs)\n",
    "        C = np.load(dspath[i][:-5] + '_C_cs-1.npy')\n",
    "\n",
    "        print('Dataset: ', dsname[i])\n",
    "        nmi_rem, purity_rem = process_dataset(data, cs, labels, k - 1, algos[j], C)\n",
    "        nmi_rem = np.array(nmi_rem)\n",
    "        purity_rem = np.array(purity_rem)\n",
    "\n",
    "        # Compute performance difference against the k - 1 baselines.\n",
    "        # Row 0 / row 1 follow the order of removal_rates (5%, then 10%).\n",
    "        nmi_gain['5% removal'] += (nmi_rem[0] - baseline_nmi_k1[i],)\n",
    "        nmi_gain['10% removal'] += (nmi_rem[1] - baseline_nmi_k1[i],)\n",
    "\n",
    "        purity_gain['5% removal'] += (purity_rem[0] - baseline_purity_k1[i],)\n",
    "        purity_gain['10% removal'] += (purity_rem[1] - baseline_purity_k1[i],)\n",
    "\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display name of each algorithm, listed in the order of the first index of results_k1\n",
    "algo_display_names = [\n",
    "    'Variance of Compression',\n",
    "    'LOF',\n",
    "    'PCA + LOF',\n",
    "    'KNN',\n",
    "    'PCA + KNN',\n",
    "    'Isolation Forest',\n",
    "    'PCA + Isolation Forest',\n",
    "    'ECOD',\n",
    "    'PCA + ECOD',\n",
    "]\n",
    "\n",
    "# Extract and group NMI results (metric index 0) by algorithm\n",
    "five_nmi_k1 = {\n",
    "    display_name: results_k1[j][0]['5% removal']\n",
    "    for j, display_name in enumerate(algo_display_names)\n",
    "}\n",
    "\n",
    "ten_nmi_k1 = {\n",
    "    display_name: results_k1[j][0]['10% removal']\n",
    "    for j, display_name in enumerate(algo_display_names)\n",
    "}\n",
    "\n",
    "# Plot results\n",
    "utils.multi_bar_graph_error_bars(dsname_abrev, five_nmi_k1, \"NMI Improvement from Outlier Removal (dim = k - 1, 5%)\", \"NMI Improvement\", 8, \"outside\", 10, 4)\n",
    "utils.multi_bar_graph_error_bars(dsname_abrev, ten_nmi_k1, \"NMI Improvement from Outlier Removal (dim = k - 1, 10%)\", \"NMI Improvement\", 8, \"outside\", 10, 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display name of each algorithm, listed in the order of the first index of results_k1\n",
    "algo_display_names = [\n",
    "    'Variance of Compression',\n",
    "    'LOF',\n",
    "    'PCA + LOF',\n",
    "    'KNN',\n",
    "    'PCA + KNN',\n",
    "    'Isolation Forest',\n",
    "    'PCA + Isolation Forest',\n",
    "    'ECOD',\n",
    "    'PCA + ECOD',\n",
    "]\n",
    "\n",
    "# Extract and group Purity results (metric index 1) by algorithm\n",
    "five_purity_k1 = {\n",
    "    display_name: results_k1[j][1]['5% removal']\n",
    "    for j, display_name in enumerate(algo_display_names)\n",
    "}\n",
    "\n",
    "ten_purity_k1 = {\n",
    "    display_name: results_k1[j][1]['10% removal']\n",
    "    for j, display_name in enumerate(algo_display_names)\n",
    "}\n",
    "\n",
    "# Plot results\n",
    "utils.multi_bar_graph_error_bars(dsname_abrev, five_purity_k1, \"Purity Improvement from Outlier Removal (dim = k - 1, 5%)\", \"Purity Improvement\", 8, \"outside\", 10, 4)\n",
    "utils.multi_bar_graph_error_bars(dsname_abrev, ten_purity_k1, \"Purity Improvement from Outlier Removal (dim = k - 1, 10%)\", \"Purity Improvement\", 8, \"outside\", 10, 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "NMI and Purity improvement for PCA dim = 2k\n",
    "\"\"\"\n",
    "\n",
    "# results_2k[j] belongs to algos[j]. Index 0 is the NMI improvement and index 1\n",
    "# the Purity improvement; each maps a removal rate to a tuple that receives one\n",
    "# entry per dataset. The container is built from algos, so it always has\n",
    "# exactly one slot per algorithm.\n",
    "results_2k = [\n",
    "    [\n",
    "        {'5% removal': tuple(), '10% removal': tuple()},  # NMI improvement\n",
    "        {'5% removal': tuple(), '10% removal': tuple()},  # Purity improvement\n",
    "    ]\n",
    "    for algo in algos\n",
    "]\n",
    "\n",
    "for j in range(len(algos)):\n",
    "    nmi_gain, purity_gain = results_2k[j]\n",
    "\n",
    "    for i in range(len(dspath)):\n",
    "        data, cs, labels = utils.initiate(dsname, dspath, i)\n",
    "        k = len(cs)\n",
    "        C = np.load(dspath[i][:-5] + '_C_2cs.npy')\n",
    "\n",
    "        print('Dataset: ', dsname[i])\n",
    "        nmi_rem, purity_rem = process_dataset(data, cs, labels, k * 2, algos[j], C)\n",
    "        nmi_rem = np.array(nmi_rem)\n",
    "        purity_rem = np.array(purity_rem)\n",
    "\n",
    "        # Compute performance difference against the 2k baselines.\n",
    "        # Row 0 / row 1 follow the order of removal_rates (5%, then 10%).\n",
    "        nmi_gain['5% removal'] += (nmi_rem[0] - baseline_nmi_2k[i],)\n",
    "        nmi_gain['10% removal'] += (nmi_rem[1] - baseline_nmi_2k[i],)\n",
    "\n",
    "        purity_gain['5% removal'] += (purity_rem[0] - baseline_purity_2k[i],)\n",
    "        purity_gain['10% removal'] += (purity_rem[1] - baseline_purity_2k[i],)\n",
    "\n",
    "    print()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display name of each algorithm, listed in the order of the first index of results_2k\n",
    "algo_display_names = [\n",
    "    'Variance of Compression',\n",
    "    'LOF',\n",
    "    'PCA + LOF',\n",
    "    'KNN',\n",
    "    'PCA + KNN',\n",
    "    'Isolation Forest',\n",
    "    'PCA + Isolation Forest',\n",
    "    'ECOD',\n",
    "    'PCA + ECOD',\n",
    "]\n",
    "\n",
    "# Extract and group NMI results (metric index 0) by algorithm\n",
    "ten_nmi_2k = {\n",
    "    display_name: results_2k[j][0]['10% removal']\n",
    "    for j, display_name in enumerate(algo_display_names)\n",
    "}\n",
    "\n",
    "# Plot results\n",
    "utils.multi_bar_graph_error_bars(dsname_abrev, ten_nmi_2k, \"NMI Improvement from Outlier Removal (dim = 2k, 10%)\", \"NMI Improvement\", 8, \"outside\", 10, 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display name of each algorithm, listed in the order of the first index of results_2k\n",
    "algo_display_names = [\n",
    "    'Variance of Compression',\n",
    "    'LOF',\n",
    "    'PCA + LOF',\n",
    "    'KNN',\n",
    "    'PCA + KNN',\n",
    "    'Isolation Forest',\n",
    "    'PCA + Isolation Forest',\n",
    "    'ECOD',\n",
    "    'PCA + ECOD',\n",
    "]\n",
    "\n",
    "# Extract and group Purity results (metric index 1) by algorithm\n",
    "ten_purity_2k = {\n",
    "    display_name: results_2k[j][1]['10% removal']\n",
    "    for j, display_name in enumerate(algo_display_names)\n",
    "}\n",
    "\n",
    "# Plot results\n",
    "utils.multi_bar_graph_error_bars(dsname_abrev, ten_purity_2k, \"Purity Improvement from Outlier Removal (dim = 2k, 10%)\", \"Purity Improvement\", 8, \"outside\", 10, 4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
