{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#The following document contains code to run IF-PCA+.\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from IFPCA import get_modified_IFPCA_metrics, get_IFPCA_metrics\n",
    "from IFPCAplus import DMF\n",
    "from sklearn.metrics.pairwise import euclidean_distances\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num of clusters: 2\n",
      "data shape: (1502, 5547)\n"
     ]
    }
   ],
   "source": [
    "########### Load denoised single-cell data ###########\n",
    "# Dataset names and the size of each stored p-by-n data matrix:\n",
    "#   camp1     13111-by-777\n",
    "#   camp2     11233-by-734\n",
    "#   darmanis  13400-by-466\n",
    "#   deng      16347-by-268\n",
    "#   goolam    21199-by-124\n",
    "#   grun      5547-by-1502\n",
    "#   li        25369-by-561\n",
    "#   patel     5948-by-430\n",
    "path = \"Dataset/sc_data/\"\n",
    "name = 'grun'\n",
    "\n",
    "# Pre-processed p-by-n data matrix, read as comma-separated values without a header row.\n",
    "Data = pd.read_csv(f\"{path}{name}-x-filter.txt\", sep=\",\", header=None)\n",
    "# True class labels (n-by-1, starting from 0): first column of the label file.\n",
    "y = pd.read_csv(f\"{path}{name}-y.txt\", sep=\" \", header=None).iloc[:, 0]\n",
    "# Number of classes, taken as the span of the label values.\n",
    "K = max(y) - min(y) + 1\n",
    "\n",
    "# Transpose to n-by-p and convert to a NumPy array.\n",
    "X = Data.T.to_numpy()\n",
    "n, param_size = X.shape\n",
    "print(\"num of clusters:\", K)\n",
    "print(\"data shape:\", X.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ########### Load denoised Micro array data  ###########\n",
    "# ### Name set: (load p-by-n data matrix)\n",
    "# # brain, 5597-by-42\n",
    "# # breast, 22215-by-276\n",
    "# # colon, 2000-by-62\n",
    "# # leukemia, 3571-by-72\n",
    "# # lung1, 12533-by-181\n",
    "# # lung2, 12600-by-203\n",
    "# # lymphoma, 4026-by-62\n",
    "# # prostate, 6033-by-102\n",
    "# # srbct, 2308-by-63\n",
    "# # su, 7909-by-174\n",
    "\n",
    "# path = \"Dataset/microarray_data/\" # path of the data sets\n",
    "# name = 'colon'\n",
    "\n",
    "# # First load original dataset\n",
    "# Data = pd.read_csv(path + name + '.x.txt', sep=\",\", header = None) # p-by-n data matrix after the pre-processing\n",
    "# y = pd.read_csv(path + name + '.y.txt', sep=\" \", header=None).T.values[0] \n",
    "# K = max(y) - min(y) + 1 #number of classes\n",
    "# X = Data.T\n",
    "# X= X.to_numpy()\n",
    "# n, param_size = np.shape(X)  \n",
    "# print(\"num of clusters:\",K)\n",
    "# print(\"data shape:\",np.shape(X))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply log(1 + x) element-wise to the data matrix.\n",
    "XX_log = np.log1p(X)\n",
    "# For the Patel or microarray datasets, use the untransformed matrix instead:\n",
    "# XX_log = X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run IF-PCA+"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# K_diff equals K, except that it is fixed at 4 whenever K is below 5.\n",
    "K_diff = 4 if K < 5 else K"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1502, 5547)\n",
      "calculating diffusion_neighborhood (1502, 5547)\n",
      "Proceed the main loop (1502, 5547)\n"
     ]
    }
   ],
   "source": [
    "# First DMF pass on the log-transformed matrix.\n",
    "knn1 = 15\n",
    "# epsilon: 'bgh_generous' is the suggested setting when 'bgh' does not work.\n",
    "log_y22_n_x = DMF(\n",
    "    XX_log,\n",
    "    knn=knn1,\n",
    "    trans=False,\n",
    "    K_diff=K_diff,\n",
    "    epsilon='bgh_generous',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Run double manifold fitting\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'euclidean_distances' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[7], line 7\u001b[0m\n\u001b[0;32m      5\u001b[0m     knn2\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m\n\u001b[0;32m      6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRun double manifold fitting\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 7\u001b[0m epsilon \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mvar(\u001b[43meuclidean_distances\u001b[49m(log_y22_n_x\u001b[38;5;241m.\u001b[39mT,log_y22_n_x\u001b[38;5;241m.\u001b[39mT))\n\u001b[0;32m      8\u001b[0m log_duo_y22_nn_x \u001b[38;5;241m=\u001b[39m DMF(log_y22_n_x\u001b[38;5;241m.\u001b[39mT, knn\u001b[38;5;241m=\u001b[39mknn2, trans\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, K_diff \u001b[38;5;241m=\u001b[39m K_diff, epsilon \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m5\u001b[39m\u001b[38;5;241m*\u001b[39mepsilon) \n\u001b[0;32m      9\u001b[0m y22_duo_acc, y22_duo_ari,y22_duo_nmi,y22_duo_ami \u001b[38;5;241m=\u001b[39m get_modified_IFPCA_metrics(log_duo_y22_nn_x\u001b[38;5;241m.\u001b[39mT, y, K)\n",
      "\u001b[1;31mNameError\u001b[0m: name 'euclidean_distances' is not defined"
     ]
    }
   ],
   "source": [
    "# The second DMF pass runs only when n / K exceeds n0.\n",
    "n0 = 100\n",
    "if n / K <= n0:\n",
    "    # Score the output of the first DMF pass directly.\n",
    "    y22_n_acc, y22_n_ari, y22_n_nmi, y22_n_ami = get_modified_IFPCA_metrics(\n",
    "        log_y22_n_x, y, K\n",
    "    )\n",
    "    print(\"name\", name)\n",
    "    print(\"IFPCA+(DMF-IFPCA): acc, ari, nmi,ami\", y22_n_acc, y22_n_ari, y22_n_nmi, y22_n_ami)\n",
    "else:\n",
    "    # knn for the second pass: 100 when X has more than 10000 columns, otherwise 50.\n",
    "    knn2 = 100 if param_size > 10000 else 50\n",
    "    print(\"Run double manifold fitting\")\n",
    "    # epsilon for the second pass is 5 times the variance of all pairwise\n",
    "    # Euclidean distances between the columns of log_y22_n_x.\n",
    "    epsilon = np.var(euclidean_distances(log_y22_n_x.T, log_y22_n_x.T))\n",
    "    log_duo_y22_nn_x = DMF(\n",
    "        log_y22_n_x.T,\n",
    "        knn=knn2,\n",
    "        trans=False,\n",
    "        K_diff=K_diff,\n",
    "        epsilon=5 * epsilon,\n",
    "    )\n",
    "    # The second pass ran on the transposed matrix, so transpose back before scoring.\n",
    "    y22_duo_acc, y22_duo_ari, y22_duo_nmi, y22_duo_ami = get_modified_IFPCA_metrics(\n",
    "        log_duo_y22_nn_x.T, y, K\n",
    "    )\n",
    "    print(\"name\", name)\n",
    "    print(\"IFPCA+: acc, ari, nmi, ami\", y22_duo_acc, y22_duo_ari, y22_duo_nmi, y22_duo_ami)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Run IF-PCA for one round"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running IF steps\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=6.\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=6.\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=6.\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=6.\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
      "  warnings.warn(\n",
      "c:\\Users\\Yi G\\anaconda3\\envs\\python310\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=6.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "# One round of IF-PCA on XX_log; the four returned metrics are unpacked by name.\n",
    "IF_PCA_acc, IF_PCA_ari, IF_PCA_nmi, IF_PCA_ami = get_IFPCA_metrics(\n",
    "    XX_log,\n",
    "    y,\n",
    "    K,\n",
    "    normalize=False,\n",
    "    top_K=True,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Single cell dataset grun\n",
      "log-IF-PCA normalize = False, top_K = True: acc, ari, nmi, ami 0.6727030625832223 -0.09666208716629758 0.05403782779474527 0.05332742410622796\n"
     ]
    }
   ],
   "source": [
    "# Report the dataset name, then the IF-PCA metrics.\n",
    "print(\"Single cell dataset\", name)\n",
    "print(\n",
    "    \"log-IF-PCA normalize = False, top_K = True: acc, ari, nmi, ami\",\n",
    "    IF_PCA_acc,\n",
    "    IF_PCA_ari,\n",
    "    IF_PCA_nmi,\n",
    "    IF_PCA_ami,\n",
    ")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python310",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
