{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to /Users/manpw/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from scipy import linalg\n",
    "from scipy import sparse\n",
    "from scipy.sparse.linalg import svds\n",
    "#import ot\n",
    "\n",
    "\n",
    "from numpy.linalg import matrix_rank\n",
    "import itertools\n",
    "import copy\n",
    "import time as time\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "from matplotlib.pyplot import figure\n",
    "from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet\n",
    "from scipy.spatial. distance import pdist\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import hdbscan\n",
    "from scipy.linalg import orthogonal_procrustes\n",
    "\n",
    "from tqdm import tqdm\n",
    "from sklearn.metrics import pairwise_distances\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.metrics.cluster import adjusted_rand_score, fowlkes_mallows_score\n",
    "from scipy.stats import rankdata,kendalltau,sem\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mtick\n",
    "from sklearn.manifold import TSNE\n",
    "from random import choices\n",
    "\n",
    "from matplotlib.lines import Line2D\n",
    "\n",
    "from textblob import Word\n",
    "from nltk.corpus import stopwords\n",
    "import nltk\n",
    "import re\n",
    "from pprint import pprint\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random \n",
    "random.seed(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Functions "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## General"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def signif(x, p):\n",
    "    x = np.asarray(x)\n",
    "    x_positive = np.where(np.isfinite(x) & (x != 0), np.abs(x), 10**(p-1))\n",
    "    mags = 10 ** (p - 1 - np.floor(np.log10(x_positive)))\n",
    "    return np.round(x * mags) / mags\n",
    "\n",
    "def flatten_list(l):\n",
    "    flat_list = [item for sublist in l for item in sublist]\n",
    "    return flat_list\n",
    "\n",
    "def check_symmetric(a, rtol=1e-05, atol=1e-08):\n",
    "    return np.allclose(a, a.T, rtol=rtol, atol=atol)\n",
    "\n",
    "def is_pos_def(x):\n",
    "    return np.all(np.linalg.eigvals(x) > 0)\n",
    "\n",
    "def reverse_dict(dic):\n",
    "    for r in dic.keys():\n",
    "        if not isinstance(dic[r], list):\n",
    "            dic[r] = [dic[r]]\n",
    "    inverse = { v: k for k, l in dic.items() for v in l }\n",
    "    return inverse\n",
    "\n",
    "def pc_scores(X, r):\n",
    "    U, s, Vh = svds(X,k=r)\n",
    "    idx = s.argsort()[::-1]   \n",
    "    Vh = Vh[idx,:]\n",
    "    Y = X @ Vh.T\n",
    "    return Y\n",
    "\n",
    "def embed_cov(X, r):\n",
    "    if r == X.shape[0]:\n",
    "        U, s, Vh = np.linalg.svd(X, full_matrices=True)\n",
    "    else:\n",
    "        U, s, Vh = svds(X,k=r)\n",
    "    idx = s.argsort()[::-1]   \n",
    "    U = U[:,idx]\n",
    "    s = s[idx]\n",
    "    ## need to take square root as these are eigenvalues\n",
    "    Y = U @ np.diag(np.sqrt(s)) \n",
    "    return Y\n",
    "\n",
    "def find_rows_to_merge(F):\n",
    "    f = copy.deepcopy(F)\n",
    "    np.fill_diagonal(f, -np.inf)\n",
    "    i,j = np.unravel_index(f.argmax(), f.shape)\n",
    "    return [i,j]   \n",
    "\n",
    "def inner_products(X):\n",
    "    return X.dot(X.T)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dimension selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wasserstein_dim_select(Y, split = 0.5, rmin = 1, rmax = 50):\n",
    "    n = Y.shape[0]\n",
    "    train = round(n * split)\n",
    "    rtry = int(np.min((train, rmax)))\n",
    "    if sparse.issparse(Y):\n",
    "        Y = Y.todense()\n",
    "    Ytrain = Y[:train,:]\n",
    "    Ytest = Y[train:n,:]\n",
    "    U, s, Vh = sparse.linalg.svds(Ytrain,k=rtry-1)\n",
    "    idx = s.argsort()[::-1] \n",
    "    s = s[idx]\n",
    "    Vh = Vh[idx,:]\n",
    "    ws = []\n",
    "    for r in tqdm(range(rmin,rtry+1)):\n",
    "        P = Vh.T[:,:r] @ Vh[:r,:]\n",
    "        Yproj = Ytrain @ P.T\n",
    "        n1 = Yproj.shape[0]\n",
    "        n2 = Ytest.shape[0]\n",
    "        M = ot.dist(Yproj,Ytest, metric='euclidean')\n",
    "        W1 = ot.emd2(np.repeat(1/n1,n1),np.repeat(1/n2,n2),M)\n",
    "        ws.append(W1)\n",
    "    return ws"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Document data: text functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def del_email_address(text):\n",
    "    e = '\\S*@\\S*\\s?'\n",
    "    pattern = re.compile(e)\n",
    "    return pattern.sub('', text) \n",
    "\n",
    "def clean_text(text):\n",
    "    return \" \".join([ Word(word).lemmatize() for word in re.sub(\"[^A-Za-z0-9]+\", \" \", text).lower().split() if word not in stopword])  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## IP HC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ip_metric(X,Y):\n",
    "    return  np.sum(X * Y)\n",
    "\n",
    "def ip_affinity(X):\n",
    "    ips = pairwise_distances(X, metric = ip_metric)\n",
    "    return np.max(ips) - ips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clusters_to_labels(clusters):\n",
    "    d = defaultdict(list)\n",
    "    for index, sublist in enumerate(clusters):\n",
    "        for item in sublist:\n",
    "            d[item].append(index)\n",
    "    labels = flatten_list([d[c] for c in range(len(d.keys()))])\n",
    "    return labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ranking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_ancestors(model, target):\n",
    "    n_samples = len(model.labels_)\n",
    "    global ances\n",
    "    for ind, merge in enumerate(model.children_):\n",
    "        if target in merge:\n",
    "            if n_samples+ind in ances:\n",
    "                return [target]+ ances[n_samples+ind]\n",
    "            ances[n_samples+ind] = find_ancestors(model,n_samples+ind)\n",
    "            return [target]+ances[n_samples+ind]\n",
    "    return [ind+n_samples]\n",
    "\n",
    "def find_descendents(model,node):\n",
    "    n_samples = len(model.labels_)\n",
    "    global desc\n",
    "    if node in desc:\n",
    "        return desc[node]\n",
    "    if node < n_samples:\n",
    "        return [node]\n",
    "    pair = model.children_[node-n_samples]\n",
    "    desc[node] = find_descendents(model,pair[0])+find_descendents(model,pair[1])\n",
    "    return desc[node]\n",
    "\n",
    "def get_ranking(model, target):\n",
    "    rank = np.zeros(len(model.labels_))\n",
    "    to_root = [find_descendents(model, cl) for cl in find_ancestors(model, target)]\n",
    "    to_rank = [list(set(to_root[i+1]) - set(to_root[i])) for i in range(len(to_root)-1)]\n",
    "    for i in range(1,len(to_rank)+1):\n",
    "        rank[to_rank[i-1]] = i\n",
    "    return rank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_ancestors_v1(children,n, target):\n",
    "    n_samples = n\n",
    "    global ances\n",
    "    for ind, merge in enumerate(children):\n",
    "        if target in merge:\n",
    "            if n_samples+ind in ances:\n",
    "                return [target]+ ances[n_samples+ind]\n",
    "            ances[n_samples+ind] = find_ancestors_v1(children,n,n_samples+ind)\n",
    "            return [target]+ances[n_samples+ind]\n",
    "    return [ind+n_samples]\n",
    "\n",
    "def find_descendents_v1(children,n ,node):\n",
    "    n_samples = n\n",
    "    global desc\n",
    "    if node in desc:\n",
    "        return desc[node]\n",
    "    if node < n_samples:\n",
    "        return [node]\n",
    "    pair = children[node-n_samples]\n",
    "    desc[node] = find_descendents_v1(children,n,pair[0])+find_ancestors_v1(children,n,pair[1])\n",
    "    return desc[node]\n",
    "\n",
    "def get_ranking_v1(children,n, target):\n",
    "    rank = np.zeros(n)\n",
    "    to_root = [find_descendents_v1(children,n, cl) for cl in find_ancestors_v1(children,n, target)]\n",
    "    to_rank = [list(set(to_root[i+1]) - set(to_root[i])) for i in range(len(to_root)-1)]\n",
    "    for i in range(1,len(to_rank)+1):\n",
    "        rank[to_rank[i-1]] = i\n",
    "    return rank"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plotting dendrogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_dendrogram(model, rescale = False, size = (10,10), **kwargs):\n",
    "    # Create linkage matrix and then plot the dendrogram\n",
    "\n",
    "    # create the counts of samples under each node\n",
    "    counts = np.zeros(model.children_.shape[0])\n",
    "    n_samples = len(model.labels_)\n",
    "    for i, merge in enumerate(model.children_):\n",
    "        current_count = 0\n",
    "        for child_idx in merge:\n",
    "            if child_idx < n_samples:\n",
    "                current_count += 1  # leaf node\n",
    "            else:\n",
    "                current_count += counts[child_idx - n_samples]\n",
    "        counts[i] = current_count\n",
    "    \n",
    "    if rescale == True:\n",
    "        d_max = np.max(model.distances_)\n",
    "        d_min = np.min(model.distances_)\n",
    "        distances = (model.distances_ - d_min) / (d_max - d_min)\n",
    "    else:\n",
    "        distances = model.distances_\n",
    "\n",
    "    linkage_matrix = np.column_stack(\n",
    "        [model.children_, distances, counts]\n",
    "    ).astype(float)\n",
    "\n",
    "    # Plot the corresponding dendrogram\n",
    "    fig = plt.figure(figsize = size)\n",
    "    dendrogram(linkage_matrix, **kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def linkage_matrix(model, rescale = False):\n",
    "    # Create linkage matrix and then plot the dendrogram\n",
    "\n",
    "    # create the counts of samples under each node\n",
    "    counts = np.zeros(model.children_.shape[0])\n",
    "    n_samples = len(model.labels_)\n",
    "    for i, merge in enumerate(model.children_):\n",
    "        current_count = 0\n",
    "        for child_idx in merge:\n",
    "            if child_idx < n_samples:\n",
    "                current_count += 1  # leaf node\n",
    "            else:\n",
    "                current_count += counts[child_idx - n_samples]\n",
    "        counts[i] = current_count\n",
    "    \n",
    "    if rescale == True:\n",
    "        d_max = np.max(model.distances_)\n",
    "        d_min = np.min(model.distances_)\n",
    "        distances = (model.distances_ - d_min) / (d_max - d_min)\n",
    "    else:\n",
    "        distances = model.distances_\n",
    "\n",
    "    linkage_matrix = np.column_stack(\n",
    "        [model.children_, distances, counts]\n",
    "    ).astype(float)\n",
    "\n",
    "    return linkage_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simulated Data: Simulate tree "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_tree_paths(observed_nodes, tree_dict):\n",
    "    if observed_nodes == 'leaf':\n",
    "        Z = np.array(list(set(tree_dict.keys()) - set(tree_dict.values())))\n",
    "    elif observed_nodes == 'all':\n",
    "        Z = np.array(list(set(list(tree_dict.keys()) + list(tree_dict.values()))))\n",
    "    else:\n",
    "        Z = np.array(observed_nodes)\n",
    "    origin_node = list(set(tree_dict.values()) - set(tree_dict.keys()))[0]\n",
    "    paths  = []\n",
    "    for z in Z:\n",
    "        path = [z]\n",
    "        if z == origin_node:\n",
    "            paths.append([z])\n",
    "        else:\n",
    "            while True:\n",
    "                parent_node =  tree_dict[path[-1]]\n",
    "                path = path + [parent_node]\n",
    "                if parent_node == origin_node:\n",
    "                    path.reverse()\n",
    "                    paths.append(path)\n",
    "                    break\n",
    "    return paths\n",
    "\n",
    "def make_tree_cov(paths,variance_dict, root_mean =0):\n",
    "    path_comb = list(itertools.product(paths, paths))\n",
    "    varis = []\n",
    "    for pair in path_comb:\n",
    "        inter = list(set(pair[0]).intersection(pair[1]))\n",
    "        var = np.sum([variance_dict[i] for i in inter])\n",
    "        varis.append(var)\n",
    "    F = np.array(varis).reshape((int(np.sqrt(len(varis))),int(np.sqrt(len(varis)))))\n",
    "    return F + root_mean**2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_X_by_tree(observed_nodes, tree_dict,root_mean, variance_dict,n,p):\n",
    "    if observed_nodes == 'leaf':\n",
    "        Z = np.array(list(set(tree_dict.keys()) - set(tree_dict.values())))\n",
    "    elif observed_nodes == 'all':\n",
    "        Z = np.array(list(variance_dict.keys()))\n",
    "    else:\n",
    "        Z = np.array(observed_nodes)\n",
    "    zs = np.array(random.choices(Z, k=n))\n",
    "    origin_node = list(set(tree_dict.values()) - set(tree_dict.keys()))[0]\n",
    "    paths  = find_tree_paths(Z, tree_dict)\n",
    "    path_comb = list(itertools.product(paths, paths))\n",
    "    nodes = list(variance_dict.keys())\n",
    "    sns = {no: np.random.normal(0, np.sqrt(variance_dict[no]), p) for no in list(set(nodes) - set([origin_node]))}\n",
    "    sns[origin_node] = np.random.normal(root_mean, np.sqrt(variance_dict[origin_node]), p)\n",
    "    mus = np.array([np.sum([sns[i] for i in path],axis = 0) for path in paths])\n",
    "    X = mus[zs-1]\n",
    "    labels = zs\n",
    "    return X, labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# input data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Input \n",
    "* n x p data matrix: Y \n",
    "* true ranking of data: true_ranking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "sigma = 1\n",
    "root_mean = 0\n",
    "\n",
    "# defines the tree, 1:6 means 6 is a parent of 1 \n",
    "tree_dict = {1:6, 2:6, 3:6, 4:7, 5:7, 6:8, 7:8}\n",
    "\n",
    "# variance of each gaussian increment\n",
    "# 1:1 means the branch length between 1 and it's parent is 1\n",
    "variance_dict = {1: 5, 2: 2, 3: 2, 4: 0.5, 5: 7, 6: 2, 7: 1, 8: 1} # different bls\n",
    "\n",
    "\n",
    "# which vertices we have samples from\n",
    "realised_nodes = 'leaf'\n",
    "\n",
    "n = 1000\n",
    "p = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/_k/160f1xr53jb36kv11_bcb42x_84r1r/T/ipykernel_71507/3200668975.py:2: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
      "  if observed_nodes == 'leaf':\n",
      "/var/folders/_k/160f1xr53jb36kv11_bcb42x_84r1r/T/ipykernel_71507/3200668975.py:4: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
      "  elif observed_nodes == 'all':\n"
     ]
    }
   ],
   "source": [
    "## make data matrix\n",
    "F = make_tree_cov(find_tree_paths(realised_nodes, tree_dict), variance_dict,root_mean)\n",
    "X,labels = make_X_by_tree(realised_nodes, tree_dict,root_mean, variance_dict,n,p)\n",
    "Y = X + np.random.normal(0,sigma,X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "## dimension selection\n",
    "# rmin = 1\n",
    "# rmax = 50\n",
    "# ws = wasserstein_dim_select(Y, rmin = rmin, rmax = rmax)\n",
    "# dim = np.argmin(ws) + rmin\n",
    "# print(f'Dimension selected: {dim}')\n",
    "\n",
    "dim = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "zeta = p**-.5 * pc_scores(Y, dim)\n",
    "zeta = np.array(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "centers = embed_cov(F,F.shape[0])\n",
    "phi = centers[(labels-1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "true_ranking = np.array([rankdata(-(phi @ phi.T)[i], method='dense')-1 for i in range(n)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# method comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## dot product"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.63 s, sys: 74.5 ms, total: 3.7 s\n",
      "Wall time: 4.22 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity=<function ip_affinity at 0x7f8ade022040>,\n",
       "                        distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)\n",
    "ip_clust.fit(zeta);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.87 s, sys: 48.1 ms, total: 4.91 s\n",
      "Wall time: 5.16 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8640724755954285"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])\n",
    "\n",
    "ip_kt_z = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]\n",
    "np.mean(ip_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.27 s, sys: 15.9 ms, total: 3.29 s\n",
      "Wall time: 3.29 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity=<function ip_affinity at 0x7f8ade022040>,\n",
       "                        distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)\n",
    "ip_clust.fit(Y);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.5 s, sys: 14.8 ms, total: 4.52 s\n",
      "Wall time: 4.53 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8640721260020724"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])\n",
    "\n",
    "ip_kt_y = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]\n",
    "np.mean(ip_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ward"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.1 ms, sys: 2.45 ms, total: 27.6 ms\n",
      "Wall time: 26.5 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, n_clusters=None)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ward = AgglomerativeClustering(linkage=\"ward\", distance_threshold=0, n_clusters = None)\n",
    "ward.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.16 s, sys: 12.1 ms, total: 3.17 s\n",
      "Wall time: 3.19 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5199813567674053"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "w_ranking = np.array([get_ranking(ward,t) for t in range(n)])\n",
    "\n",
    "w_kt_z = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]\n",
    "np.mean(w_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 284 ms, sys: 4.25 ms, total: 288 ms\n",
      "Wall time: 294 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, n_clusters=None)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ward = AgglomerativeClustering(linkage=\"ward\", distance_threshold=0, n_clusters = None)\n",
    "ward.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.18 s, sys: 8.79 ms, total: 3.19 s\n",
      "Wall time: 3.19 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5199758186002048"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "w_ranking = np.array([get_ranking(ward,t) for t in range(n)])\n",
    "\n",
    "w_kt_y = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]\n",
    "np.mean(w_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## UPGMA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23 ms, sys: 2.46 ms, total: 25.5 ms\n",
      "Wall time: 24.2 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "average = AgglomerativeClustering(linkage=\"average\", distance_threshold=0, n_clusters = None)\n",
    "average.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.36 s, sys: 9.95 ms, total: 3.37 s\n",
      "Wall time: 3.39 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5200496705218345"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "a_ranking = np.array([get_ranking(average,t) for t in range(n)])\n",
    "\n",
    "a_kt_z = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]\n",
    "np.mean(a_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 266 ms, sys: 3.51 ms, total: 269 ms\n",
      "Wall time: 271 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "average = AgglomerativeClustering(linkage=\"average\", distance_threshold=0, n_clusters = None)\n",
    "average.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.7 s, sys: 9.72 ms, total: 3.71 s\n",
      "Wall time: 3.71 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5200707013911766"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "a_ranking = np.array([get_ranking(average,t) for t in range(n)])\n",
    "\n",
    "a_kt_y = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]\n",
    "np.mean(a_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cosine "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23.9 ms, sys: 1.83 ms, total: 25.7 ms\n",
      "Wall time: 25.4 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity='cosine', distance_threshold=0,\n",
       "                        linkage='average', n_clusters=None)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)\n",
    "cosine.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.37 s, sys: 8.93 ms, total: 3.38 s\n",
      "Wall time: 3.39 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.806472833689444"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])\n",
    "\n",
    "cs_kt_z = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]\n",
    "np.mean(cs_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 639 ms, sys: 3.41 ms, total: 643 ms\n",
      "Wall time: 643 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity='cosine', distance_threshold=0,\n",
       "                        linkage='average', n_clusters=None)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)\n",
    "cosine.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.74 s, sys: 12.9 ms, total: 3.75 s\n",
      "Wall time: 3.75 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.8065015491100818"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])\n",
    "\n",
    "cs_kt_y = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]\n",
    "np.mean(cs_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "## save these before hdbscan as it might crash \n",
    "method_comparison_df['ip_z'] = [np.mean(ip_kt_z),sem(ip_kt_z)]\n",
    "method_comparison_df['ip_y'] = [np.mean(ip_kt_y),sem(ip_kt_y)]\n",
    "\n",
    "method_comparison_df['w_z'] = [np.mean(w_kt_z),sem(w_kt_z)]\n",
    "method_comparison_df['w_y'] = [np.mean(w_kt_y),sem(w_kt_y)]\n",
    "\n",
    "method_comparison_df['a_z'] = [np.mean(a_kt_z),sem(a_kt_z)]\n",
    "method_comparison_df['a_y'] = [np.mean(a_kt_y),sem(a_kt_y)]\n",
    "\n",
    "method_comparison_df['cs_z'] = [np.mean(cs_kt_z),sem(cs_kt_z)]\n",
    "method_comparison_df['cs_y'] = [np.mean(cs_kt_y),sem(cs_kt_y)]\n",
    "\n",
    "method_comparison_df.to_csv('method_comparison_df.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## hdbscan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# if need to increase recursion limit\n",
    "\n",
    "# import sys\n",
    "# sys.getrecursionlimit()\n",
    "# sys.setrecursionlimit(10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)\n",
    "hdbscan_.fit(zeta)\n",
    "\n",
    "hdbscan_.single_linkage_tree_;\n",
    "hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.54 s, sys: 33.7 ms, total: 4.58 s\n",
      "Wall time: 4.66 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5200753330625603"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) \n",
    "\n",
    "hdbscan_kt_z = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]\n",
    "np.mean(hdbscan_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)\n",
    "hdbscan_.fit(Y)\n",
    "\n",
    "hdbscan_.single_linkage_tree_;\n",
    "hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.98 s, sys: 35.4 ms, total: 5.01 s\n",
      "Wall time: 5.23 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.5200758280624505"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) # range(n)\n",
    "\n",
    "hdbscan_kt_y = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]\n",
    "np.mean(hdbscan_kt_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df['hdbscan_z'] = [np.mean(hdbscan_kt_z),sem(hdbscan_kt_z)]\n",
    "method_comparison_df['hdbscan_y'] = [np.mean(hdbscan_kt_y),sem(hdbscan_kt_y)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df.to_csv('method_comparison_df.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Apendix method comparison "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "linkage = ['complete','single']\n",
    "metric = ['euclidean', 'cosine']\n",
    "combs = list(itertools.product(linkage, metric))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df_v2 = pd.DataFrame(columns = ['linkage','metric',\n",
    "                                                  'zeta_mean','zeta_se','Y_mean','Y_se'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(combs)):\n",
    "    print(combs[i])\n",
    "    \n",
    "    on_zeta = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)\n",
    "    on_zeta.fit(zeta);\n",
    "    ances = {}; desc = {}\n",
    "    on_zeta_ranking = np.array([get_ranking(on_zeta,t) for t in range(n)])\n",
    "    kt_z = [kendalltau(on_zeta_ranking[i], true_ranking[i]).correlation for i in range(on_zeta_ranking.shape[0])]\n",
    "    \n",
    "    on_Y = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)\n",
    "    on_Y.fit(Y);\n",
    "    ances = {}; desc = {}\n",
    "    on_Y_ranking = np.array([get_ranking(on_Y,t) for t in range(n)])\n",
    "    kt_Y = [kendalltau(on_Y_ranking[i], true_ranking[i]).correlation for i in range(on_Y_ranking.shape[0])]\n",
    "    \n",
    "    \n",
    "    new_row = {'linkage': combs[i][0],'metric':combs[i][1],\n",
    "               'zeta_mean': np.mean(kt_z),'zeta_se': sem(kt_z),\n",
    "               'Y_mean': np.mean(kt_Y),'Y_se': sem(kt_Y)}\n",
    "    method_comparison_df_v2 = pd.concat([method_comparison_df_v2,pd.DataFrame(new_row, index=[0])]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df_v2.to_csv('method_comparison_v2.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
