{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ag16115/anaconda3/lib/python3.8/site-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1)\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of \"\n",
      "[nltk_data] Downloading package wordnet to /home/ag16115/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from scipy import linalg\n",
    "from scipy import sparse\n",
    "from scipy.sparse.linalg import svds\n",
    "import ot\n",
    "\n",
    "\n",
    "from numpy.linalg import matrix_rank\n",
    "import itertools\n",
    "import copy\n",
    "import time as time\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "from matplotlib.pyplot import figure\n",
    "from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet\n",
    "from scipy.spatial. distance import pdist\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import hdbscan\n",
    "from scipy.linalg import orthogonal_procrustes\n",
    "\n",
    "from tqdm import tqdm\n",
    "from sklearn.metrics import pairwise_distances\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.metrics.cluster import adjusted_rand_score, fowlkes_mallows_score\n",
    "from scipy.stats import rankdata,kendalltau,sem\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mtick\n",
    "from sklearn.manifold import TSNE\n",
    "from random import choices\n",
    "\n",
    "from matplotlib.lines import Line2D\n",
    "\n",
    "from textblob import Word\n",
    "from nltk.corpus import stopwords\n",
    "import nltk\n",
    "import re\n",
    "from pprint import pprint\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random \n",
    "random.seed(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Functions "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## General"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def signif(x, p):\n",
    "    x = np.asarray(x)\n",
    "    x_positive = np.where(np.isfinite(x) & (x != 0), np.abs(x), 10**(p-1))\n",
    "    mags = 10 ** (p - 1 - np.floor(np.log10(x_positive)))\n",
    "    return np.round(x * mags) / mags\n",
    "\n",
    "def flatten_list(l):\n",
    "    flat_list = [item for sublist in l for item in sublist]\n",
    "    return flat_list\n",
    "\n",
    "def check_symmetric(a, rtol=1e-05, atol=1e-08):\n",
    "    return np.allclose(a, a.T, rtol=rtol, atol=atol)\n",
    "\n",
    "def is_pos_def(x):\n",
    "    return np.all(np.linalg.eigvals(x) > 0)\n",
    "\n",
    "def reverse_dict(dic):\n",
    "    for r in dic.keys():\n",
    "        if not isinstance(dic[r], list):\n",
    "            dic[r] = [dic[r]]\n",
    "    inverse = { v: k for k, l in dic.items() for v in l }\n",
    "    return inverse\n",
    "\n",
    "def pc_scores(X, r):\n",
    "    U, s, Vh = svds(X,k=r)\n",
    "    idx = s.argsort()[::-1]   \n",
    "    Vh = Vh[idx,:]\n",
    "    Y = X @ Vh.T\n",
    "    return Y\n",
    "\n",
    "def embed_cov(X, r):\n",
    "    if r == X.shape[0]:\n",
    "        U, s, Vh = np.linalg.svd(X, full_matrices=True)\n",
    "    else:\n",
    "        U, s, Vh = svds(X,k=r)\n",
    "    idx = s.argsort()[::-1]   \n",
    "    U = U[:,idx]\n",
    "    s = s[idx]\n",
    "    ## need to take square root as these are eigenvalues\n",
    "    Y = U @ np.diag(np.sqrt(s)) \n",
    "    return Y\n",
    "\n",
    "def find_rows_to_merge(F):\n",
    "    f = copy.deepcopy(F)\n",
    "    np.fill_diagonal(f, -np.inf)\n",
    "    i,j = np.unravel_index(f.argmax(), f.shape)\n",
    "    return [i,j]   \n",
    "\n",
    "def inner_products(X):\n",
    "    return X.dot(X.T)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dimension selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wasserstein_dim_select(Y, split = 0.5, rmin = 1, rmax = 50):\n",
    "    n = Y.shape[0]\n",
    "    train = round(n * split)\n",
    "    rtry = int(np.min((train, rmax)))\n",
    "    if sparse.issparse(Y):\n",
    "        Y = Y.todense()\n",
    "    Ytrain = Y[:train,:]\n",
    "    Ytest = Y[train:n,:]\n",
    "    U, s, Vh = sparse.linalg.svds(Ytrain,k=rtry-1)\n",
    "    idx = s.argsort()[::-1] \n",
    "    s = s[idx]\n",
    "    Vh = Vh[idx,:]\n",
    "    ws = []\n",
    "    for r in tqdm(range(rmin,rtry+1)):\n",
    "        P = Vh.T[:,:r] @ Vh[:r,:]\n",
    "        Yproj = Ytrain @ P.T\n",
    "        n1 = Yproj.shape[0]\n",
    "        n2 = Ytest.shape[0]\n",
    "        M = ot.dist(Yproj,Ytest, metric='euclidean')\n",
    "        W1 = ot.emd2(np.repeat(1/n1,n1),np.repeat(1/n2,n2),M)\n",
    "        ws.append(W1)\n",
    "    return ws"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Document data: text functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def del_email_address(text):\n",
    "    e = '\\S*@\\S*\\s?'\n",
    "    pattern = re.compile(e)\n",
    "    return pattern.sub('', text) \n",
    "\n",
    "def clean_text(text):\n",
    "    return \" \".join([ Word(word).lemmatize() for word in re.sub(\"[^A-Za-z0-9]+\", \" \", text).lower().split() if word not in stopword])  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## IP HC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ip_metric(X,Y):\n",
    "    return  np.sum(X * Y)\n",
    "\n",
    "def ip_affinity(X):\n",
    "    ips = pairwise_distances(X, metric = ip_metric)\n",
    "    return np.max(ips) - ips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clusters_to_labels(clusters):\n",
    "    d = defaultdict(list)\n",
    "    for index, sublist in enumerate(clusters):\n",
    "        for item in sublist:\n",
    "            d[item].append(index)\n",
    "    labels = flatten_list([d[c] for c in range(len(d.keys()))])\n",
    "    return labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ranking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_ancestors(model, target):\n",
    "    n_samples = len(model.labels_)\n",
    "    global ances\n",
    "    for ind, merge in enumerate(model.children_):\n",
    "        if target in merge:\n",
    "            if n_samples+ind in ances:\n",
    "                return [target]+ ances[n_samples+ind]\n",
    "            ances[n_samples+ind] = find_ancestors(model,n_samples+ind)\n",
    "            return [target]+ances[n_samples+ind]\n",
    "    return [ind+n_samples]\n",
    "\n",
    "def find_descendents(model,node):\n",
    "    n_samples = len(model.labels_)\n",
    "    global desc\n",
    "    if node in desc:\n",
    "        return desc[node]\n",
    "    if node < n_samples:\n",
    "        return [node]\n",
    "    pair = model.children_[node-n_samples]\n",
    "    desc[node] = find_descendents(model,pair[0])+find_descendents(model,pair[1])\n",
    "    return desc[node]\n",
    "\n",
    "def get_ranking(model, target):\n",
    "    rank = np.zeros(len(model.labels_))\n",
    "    to_root = [find_descendents(model, cl) for cl in find_ancestors(model, target)]\n",
    "    to_rank = [list(set(to_root[i+1]) - set(to_root[i])) for i in range(len(to_root)-1)]\n",
    "    for i in range(1,len(to_rank)+1):\n",
    "        rank[to_rank[i-1]] = i\n",
    "    return rank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_ancestors_v1(children,n, target):\n",
    "    n_samples = n\n",
    "    global ances\n",
    "    for ind, merge in enumerate(children):\n",
    "        if target in merge:\n",
    "            if n_samples+ind in ances:\n",
    "                return [target]+ ances[n_samples+ind]\n",
    "            ances[n_samples+ind] = find_ancestors_v1(children,n,n_samples+ind)\n",
    "            return [target]+ances[n_samples+ind]\n",
    "    return [ind+n_samples]\n",
    "\n",
    "def find_descendents_v1(children,n ,node):\n",
    "    n_samples = n\n",
    "    global desc\n",
    "    if node in desc:\n",
    "        return desc[node]\n",
    "    if node < n_samples:\n",
    "        return [node]\n",
    "    pair = children[node-n_samples]\n",
    "    desc[node] = find_descendents_v1(children,n,pair[0])+find_ancestors_v1(children,n,pair[1])\n",
    "    return desc[node]\n",
    "\n",
    "def get_ranking_v1(children,n, target):\n",
    "    rank = np.zeros(n)\n",
    "    to_root = [find_descendents_v1(children,n, cl) for cl in find_ancestors_v1(children,n, target)]\n",
    "    to_rank = [list(set(to_root[i+1]) - set(to_root[i])) for i in range(len(to_root)-1)]\n",
    "    for i in range(1,len(to_rank)+1):\n",
    "        rank[to_rank[i-1]] = i\n",
    "    return rank"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # df = pd.read_csv('train_40k.csv')\n",
    "# # df = df[df['Cat3'] != 'unknown']\n",
    "\n",
    "# min_length = 2\n",
    "# lengths = [len(df['Text'].iloc[i].split()) for i in range(len(df))]\n",
    "# length_ids = np.where(np.array(lengths) > min_length)[0]\n",
    "# df = df.iloc[length_ids].reset_index()\n",
    "\n",
    "# # ## use random sample of data \n",
    "# n = 5000\n",
    "# df = df.sample(n=n, replace=False).reset_index(drop=True)\n",
    "# n = len(df)\n",
    "\n",
    "# ## list of common words to delete\n",
    "# stopword = set(nltk.corpus.stopwords.words(\"english\"))\n",
    "# # gets rid of stopwords, symbols, makes lower case and base words\n",
    "# df[\"Text\"] = df.Text.apply(lambda row: clean_text(row))\n",
    "\n",
    "# # min_length = 2\n",
    "# # lengths = [len(df['Text'].iloc[i].split()) for i in range(n)]\n",
    "# # length_ids = np.where(np.array(lengths) > min_length)[0]\n",
    "# # df = df.iloc[length_ids].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "## load sample of review data\n",
    "df = pd.read_csv('sample_df.csv')\n",
    "n = len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "## list of common words to delete\n",
    "stopword = set(nltk.corpus.stopwords.words(\"english\"))\n",
    "df[\"Text\"] = df.Text.apply(lambda row: clean_text(row))\n",
    "\n",
    "## vectorize data\n",
    "vectorizer = TfidfVectorizer(min_df = 3, max_df = n-500)\n",
    "Y = vectorizer.fit_transform(df.Text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5000 5588\n"
     ]
    }
   ],
   "source": [
    "(n,p) = Y.shape\n",
    "print(n,p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create true ranking\n",
    "id_2cats = {i:list(df[['Cat1','Cat2','Cat3']].iloc[i]) for i in range(n)}\n",
    "pairs = np.array(list(itertools.combinations(list(range(n)), 2)))\n",
    "n_inter =  [len(list(set(id_2cats[pairs[i][0]]) & set(id_2cats[pairs[i][1]]))) for i in range(pairs.shape[0])]\n",
    "\n",
    "upper = np.zeros((n, n))\n",
    "upper[np.triu_indices(n, 1)] = np.max(n_inter) - n_inter\n",
    "true_ranking = upper + upper.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "## dimension selection\n",
    "# rmin = 1\n",
    "# rmax = 50\n",
    "# ws = wasserstein_dim_select(Y, rmin = rmin, rmax = rmax)\n",
    "# dim = np.argmin(ws) + rmin\n",
    "# print(f'Dimension selected: {dim}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "dim = 22 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "zeta = p**-.5 * pc_scores(Y, dim)\n",
    "zeta = np.array(zeta)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# method comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = np.asarray(Y.todense())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## dot product"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 17s, sys: 298 ms, total: 1min 18s\n",
      "Wall time: 1min 18s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity=<function ip_affinity at 0x7f2f34407e50>,\n",
       "                        distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)\n",
    "ip_clust.fit(zeta);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8min 32s, sys: 291 ms, total: 8min 32s\n",
      "Wall time: 8min 34s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.14268642895718653"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])\n",
    "\n",
    "ip_kt_z = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]\n",
    "np.mean(ip_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3min 38s, sys: 444 ms, total: 3min 39s\n",
      "Wall time: 3min 39s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity=<function ip_affinity at 0x7f2f34407e50>,\n",
       "                        distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)\n",
    "ip_clust.fit(Y);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "KeyboardInterrupt\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])\n",
    "\n",
    "ip_kt_y = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]\n",
    "np.mean(ip_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ward"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1 s, sys: 52 ms, total: 1.06 s\n",
      "Wall time: 1.05 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, n_clusters=None)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ward = AgglomerativeClustering(linkage=\"ward\", distance_threshold=0, n_clusters = None)\n",
    "ward.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "w_ranking = np.array([get_ranking(ward,t) for t in range(n)])\n",
    "\n",
    "w_kt_z = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]\n",
    "np.mean(w_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ward = AgglomerativeClustering(linkage=\"ward\", distance_threshold=0, n_clusters = None)\n",
    "ward.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "w_ranking = np.array([get_ranking(ward,t) for t in range(n)])\n",
    "\n",
    "w_kt_y = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]\n",
    "np.mean(w_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## UPGMA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "average = AgglomerativeClustering(linkage=\"average\", distance_threshold=0, n_clusters = None)\n",
    "average.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "a_ranking = np.array([get_ranking(average,t) for t in range(n)])\n",
    "\n",
    "a_kt_z = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]\n",
    "np.mean(a_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "average = AgglomerativeClustering(linkage=\"average\", distance_threshold=0, n_clusters = None)\n",
    "average.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "a_ranking = np.array([get_ranking(average,t) for t in range(n)])\n",
    "\n",
    "a_kt_y = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]\n",
    "np.mean(a_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cosine "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)\n",
    "cosine.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])\n",
    "\n",
    "cs_kt_z = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]\n",
    "np.mean(cs_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)\n",
    "cosine.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])\n",
    "\n",
    "cs_kt_y = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]\n",
    "np.mean(cs_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "method_comparison_df['ip_z'] = [np.mean(ip_kt_z),sem(ip_kt_z)]\n",
    "method_comparison_df['ip_y'] = [np.mean(ip_kt_y),sem(ip_kt_y)]\n",
    "\n",
    "method_comparison_df['w_z'] = [np.mean(w_kt_z),sem(w_kt_z)]\n",
    "method_comparison_df['w_y'] = [np.mean(w_kt_y),sem(w_kt_y)]\n",
    "\n",
    "method_comparison_df['a_z'] = [np.mean(a_kt_z),sem(a_kt_z)]\n",
    "method_comparison_df['a_y'] = [np.mean(a_kt_y),sem(a_kt_y)]\n",
    "\n",
    "method_comparison_df['cs_z'] = [np.mean(cs_kt_z),sem(cs_kt_z)]\n",
    "method_comparison_df['cs_y'] = [np.mean(cs_kt_y),sem(cs_kt_y)]\n",
    "\n",
    "# method_comparison_df.to_csv('method_comparison_df.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## hdbscan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# if need to increase recursion limit\n",
    "\n",
    "import sys\n",
    "sys.getrecursionlimit()\n",
    "sys.setrecursionlimit(10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)\n",
    "hdbscan_.fit(zeta)\n",
    "\n",
    "hdbscan_.single_linkage_tree_;\n",
    "hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) \n",
    "\n",
    "hdbscan_kt_z = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]\n",
    "np.mean(hdbscan_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)\n",
    "hdbscan_.fit(Y)\n",
    "\n",
    "hdbscan_.single_linkage_tree_;\n",
    "hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) # range(n)\n",
    "\n",
    "hdbscan_kt_y = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]\n",
    "np.mean(hdbscan_kt_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df['hdbscan_z'] = [np.mean(hdbscan_kt_z),sem(hdbscan_kt_z)]\n",
    "method_comparison_df['hdbscan_y'] = [np.mean(hdbscan_kt_y),sem(hdbscan_kt_y)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# method_comparison_df.to_csv('method_comparison_df.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Appendix method comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "linkage = ['complete','single']\n",
    "metric = ['euclidean', 'cosine']\n",
    "combs = list(itertools.product(linkage, metric))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df_v2 = pd.DataFrame(columns = ['linkage','metric',\n",
    "                                                  'zeta_mean','zeta_se','Y_mean','Y_se'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(combs)):\n",
    "    print(combs[i])\n",
    "    \n",
    "    on_zeta = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)\n",
    "    on_zeta.fit(zeta);\n",
    "    ances = {}; desc = {}\n",
    "    on_zeta_ranking = np.array([get_ranking(on_zeta,t) for t in range(n)])\n",
    "    kt_z = [kendalltau(on_zeta_ranking[i], true_ranking[i]).correlation for i in range(on_zeta_ranking.shape[0])]\n",
    "    \n",
    "    on_Y = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)\n",
    "    on_Y.fit(Y);\n",
    "    ances = {}; desc = {}\n",
    "    on_Y_ranking = np.array([get_ranking(on_Y,t) for t in range(n)])\n",
    "    kt_Y = [kendalltau(on_Y_ranking[i], true_ranking[i]).correlation for i in range(on_Y_ranking.shape[0])]\n",
    "    \n",
    "    \n",
    "    new_row = {'linkage': combs[i][0],'metric':combs[i][1],\n",
    "               'zeta_mean': np.mean(kt_z),'zeta_se': sem(kt_z),\n",
    "               'Y_mean': np.mean(kt_Y),'Y_se': sem(kt_Y)}\n",
    "    method_comparison_df_v2 = pd.concat([method_comparison_df_v2,pd.DataFrame(new_row, index=[0])]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df_v2.to_csv('method_comparison_v2.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
