{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ag16115/anaconda3/lib/python3.8/site-packages/scipy/__init__.py:138: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.1)\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion} is required for this version of \"\n",
      "[nltk_data] Downloading package wordnet to /home/ag16115/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from scipy import linalg\n",
    "from scipy import sparse\n",
    "from scipy.sparse.linalg import svds\n",
    "import ot\n",
    "\n",
    "\n",
    "from numpy.linalg import matrix_rank\n",
    "import itertools\n",
    "import copy\n",
    "import time as time\n",
    "from sklearn.cluster import AgglomerativeClustering\n",
    "from matplotlib.pyplot import figure\n",
    "from scipy.cluster.hierarchy import dendrogram, fcluster, cophenet\n",
    "from scipy.spatial.distance import pdist\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import hdbscan\n",
    "from scipy.linalg import orthogonal_procrustes\n",
    "\n",
    "from tqdm import tqdm\n",
    "from sklearn.metrics import pairwise_distances\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from sklearn.metrics.cluster import adjusted_rand_score, fowlkes_mallows_score\n",
    "from scipy.stats import rankdata,kendalltau,sem\n",
    "\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as mtick\n",
    "from sklearn.manifold import TSNE\n",
    "from random import choices\n",
    "\n",
    "from matplotlib.lines import Line2D\n",
    "\n",
    "from textblob import Word\n",
    "from nltk.corpus import stopwords\n",
    "import nltk\n",
    "import re\n",
    "from pprint import pprint\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "nltk.download('wordnet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random \n",
    "random.seed(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Functions "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## General"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def signif(x, p):\n",
    "    x = np.asarray(x)\n",
    "    x_positive = np.where(np.isfinite(x) & (x != 0), np.abs(x), 10**(p-1))\n",
    "    mags = 10 ** (p - 1 - np.floor(np.log10(x_positive)))\n",
    "    return np.round(x * mags) / mags\n",
    "\n",
    "def flatten_list(l):\n",
    "    flat_list = [item for sublist in l for item in sublist]\n",
    "    return flat_list\n",
    "\n",
    "def check_symmetric(a, rtol=1e-05, atol=1e-08):\n",
    "    return np.allclose(a, a.T, rtol=rtol, atol=atol)\n",
    "\n",
    "def is_pos_def(x):\n",
    "    return np.all(np.linalg.eigvals(x) > 0)\n",
    "\n",
    "def reverse_dict(dic):\n",
    "    for r in dic.keys():\n",
    "        if not isinstance(dic[r], list):\n",
    "            dic[r] = [dic[r]]\n",
    "    inverse = { v: k for k, l in dic.items() for v in l }\n",
    "    return inverse\n",
    "\n",
    "def pc_scores(X, r):\n",
    "    U, s, Vh = svds(X,k=r)\n",
    "    idx = s.argsort()[::-1]   \n",
    "    Vh = Vh[idx,:]\n",
    "    Y = X @ Vh.T\n",
    "    return Y\n",
    "\n",
    "def embed_cov(X, r):\n",
    "    if r == X.shape[0]:\n",
    "        U, s, Vh = np.linalg.svd(X, full_matrices=True)\n",
    "    else:\n",
    "        U, s, Vh = svds(X,k=r)\n",
    "    idx = s.argsort()[::-1]   \n",
    "    U = U[:,idx]\n",
    "    s = s[idx]\n",
    "    ## need to take square root as these are eigenvalues\n",
    "    Y = U @ np.diag(np.sqrt(s)) \n",
    "    return Y\n",
    "\n",
    "def find_rows_to_merge(F):\n",
    "    f = copy.deepcopy(F)\n",
    "    np.fill_diagonal(f, -np.inf)\n",
    "    i,j = np.unravel_index(f.argmax(), f.shape)\n",
    "    return [i,j]   \n",
    "\n",
    "def inner_products(X):\n",
    "    return X.dot(X.T)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dimension selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wasserstein_dim_select(Y, split = 0.5, rmin = 1, rmax = 50):\n",
    "    n = Y.shape[0]\n",
    "    train = round(n * split)\n",
    "    rtry = int(np.min((train, rmax)))\n",
    "    if sparse.issparse(Y):\n",
    "        Y = Y.todense()\n",
    "    Ytrain = Y[:train,:]\n",
    "    Ytest = Y[train:n,:]\n",
    "    U, s, Vh = sparse.linalg.svds(Ytrain,k=rtry-1)\n",
    "    idx = s.argsort()[::-1] \n",
    "    s = s[idx]\n",
    "    Vh = Vh[idx,:]\n",
    "    ws = []\n",
    "    for r in tqdm(range(rmin,rtry+1)):\n",
    "        P = Vh.T[:,:r] @ Vh[:r,:]\n",
    "        Yproj = Ytrain @ P.T\n",
    "        n1 = Yproj.shape[0]\n",
    "        n2 = Ytest.shape[0]\n",
    "        M = ot.dist(Yproj,Ytest, metric='euclidean')\n",
    "        W1 = ot.emd2(np.repeat(1/n1,n1),np.repeat(1/n2,n2),M)\n",
    "        ws.append(W1)\n",
    "    return ws"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Document data: text functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def del_email_address(text):\n",
    "    e = '\\S*@\\S*\\s?'\n",
    "    pattern = re.compile(e)\n",
    "    return pattern.sub('', text) \n",
    "\n",
    "def clean_text(text):\n",
    "    return \" \".join([ Word(word).lemmatize() for word in re.sub(\"[^A-Za-z0-9]+\", \" \", text).lower().split() if word not in stopword])  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## IP HC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ip_metric(X,Y):\n",
    "    return  np.sum(X * Y)\n",
    "\n",
    "def ip_affinity(X):\n",
    "    ips = pairwise_distances(X, metric = ip_metric)\n",
    "    return np.max(ips) - ips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clusters_to_labels(clusters):\n",
    "    d = defaultdict(list)\n",
    "    for index, sublist in enumerate(clusters):\n",
    "        for item in sublist:\n",
    "            d[item].append(index)\n",
    "    labels = flatten_list([d[c] for c in range(len(d.keys()))])\n",
    "    return labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ranking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_ancestors(model, target):\n",
    "    n_samples = len(model.labels_)\n",
    "    global ances\n",
    "    for ind, merge in enumerate(model.children_):\n",
    "        if target in merge:\n",
    "            if n_samples+ind in ances:\n",
    "                return [target]+ ances[n_samples+ind]\n",
    "            ances[n_samples+ind] = find_ancestors(model,n_samples+ind)\n",
    "            return [target]+ances[n_samples+ind]\n",
    "    return [ind+n_samples]\n",
    "\n",
    "def find_descendents(model,node):\n",
    "    n_samples = len(model.labels_)\n",
    "    global desc\n",
    "    if node in desc:\n",
    "        return desc[node]\n",
    "    if node < n_samples:\n",
    "        return [node]\n",
    "    pair = model.children_[node-n_samples]\n",
    "    desc[node] = find_descendents(model,pair[0])+find_descendents(model,pair[1])\n",
    "    return desc[node]\n",
    "\n",
    "def get_ranking(model, target):\n",
    "    rank = np.zeros(len(model.labels_))\n",
    "    to_root = [find_descendents(model, cl) for cl in find_ancestors(model, target)]\n",
    "    to_rank = [list(set(to_root[i+1]) - set(to_root[i])) for i in range(len(to_root)-1)]\n",
    "    for i in range(1,len(to_rank)+1):\n",
    "        rank[to_rank[i-1]] = i\n",
    "    return rank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_ancestors_v1(children,n, target):\n",
    "    n_samples = n\n",
    "    global ances\n",
    "    for ind, merge in enumerate(children):\n",
    "        if target in merge:\n",
    "            if n_samples+ind in ances:\n",
    "                return [target]+ ances[n_samples+ind]\n",
    "            ances[n_samples+ind] = find_ancestors_v1(children,n,n_samples+ind)\n",
    "            return [target]+ances[n_samples+ind]\n",
    "    return [ind+n_samples]\n",
    "\n",
    "def find_descendents_v1(children,n ,node):\n",
    "    n_samples = n\n",
    "    global desc\n",
    "    if node in desc:\n",
    "        return desc[node]\n",
    "    if node < n_samples:\n",
    "        return [node]\n",
    "    pair = children[node-n_samples]\n",
    "    desc[node] = find_descendents_v1(children,n,pair[0])+find_ancestors_v1(children,n,pair[1])\n",
    "    return desc[node]\n",
    "\n",
    "def get_ranking_v1(children,n, target):\n",
    "    rank = np.zeros(n)\n",
    "    to_root = [find_descendents_v1(children,n, cl) for cl in find_ancestors_v1(children,n, target)]\n",
    "    to_rank = [list(set(to_root[i+1]) - set(to_root[i])) for i in range(len(to_root)-1)]\n",
    "    for i in range(1,len(to_rank)+1):\n",
    "        rank[to_rank[i-1]] = i\n",
    "    return rank"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plotting dendrogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_dendrogram(model, rescale = False, size = (10,10), **kwargs):\n",
    "    # Create linkage matrix and then plot the dendrogram\n",
    "\n",
    "    # create the counts of samples under each node\n",
    "    counts = np.zeros(model.children_.shape[0])\n",
    "    n_samples = len(model.labels_)\n",
    "    for i, merge in enumerate(model.children_):\n",
    "        current_count = 0\n",
    "        for child_idx in merge:\n",
    "            if child_idx < n_samples:\n",
    "                current_count += 1  # leaf node\n",
    "            else:\n",
    "                current_count += counts[child_idx - n_samples]\n",
    "        counts[i] = current_count\n",
    "    \n",
    "    if rescale == True:\n",
    "        d_max = np.max(model.distances_)\n",
    "        d_min = np.min(model.distances_)\n",
    "        distances = (model.distances_ - d_min) / (d_max - d_min)\n",
    "    else:\n",
    "        distances = model.distances_\n",
    "\n",
    "    linkage_matrix = np.column_stack(\n",
    "        [model.children_, distances, counts]\n",
    "    ).astype(float)\n",
    "\n",
    "    # Plot the corresponding dendrogram\n",
    "    fig = plt.figure(figsize = size)\n",
    "    dendrogram(linkage_matrix, **kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def linkage_matrix(model, rescale = False):\n",
    "    # Create linkage matrix and then plot the dendrogram\n",
    "\n",
    "    # create the counts of samples under each node\n",
    "    counts = np.zeros(model.children_.shape[0])\n",
    "    n_samples = len(model.labels_)\n",
    "    for i, merge in enumerate(model.children_):\n",
    "        current_count = 0\n",
    "        for child_idx in merge:\n",
    "            if child_idx < n_samples:\n",
    "                current_count += 1  # leaf node\n",
    "            else:\n",
    "                current_count += counts[child_idx - n_samples]\n",
    "        counts[i] = current_count\n",
    "    \n",
    "    if rescale == True:\n",
    "        d_max = np.max(model.distances_)\n",
    "        d_min = np.min(model.distances_)\n",
    "        distances = (model.distances_ - d_min) / (d_max - d_min)\n",
    "    else:\n",
    "        distances = model.distances_\n",
    "\n",
    "    linkage_matrix = np.column_stack(\n",
    "        [model.children_, distances, counts]\n",
    "    ).astype(float)\n",
    "\n",
    "    return linkage_matrix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "## load wiki table of industry / subindustry\n",
    "tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')\n",
    "comp_df = tables[0]\n",
    "comp_df[\"Symbol\"] = comp_df[\"Symbol\"].map(lambda x: x.replace(\".\", \"-\"))  # rename symbol to escape symbol error "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "industry_dict = {comp_df['Symbol'].iloc[i]: comp_df['GICS Sector'].iloc[i] for i in range(len(comp_df))}\n",
    "sub_industry_dict = {comp_df['Symbol'].iloc[i]: comp_df['GICS Sub-Industry'].iloc[i] for i in range(len(comp_df))}\n",
    "security_dict = {comp_df['Security'].iloc[i]: comp_df['Symbol'].iloc[i] for i in range(len(comp_df))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load return data\n",
    "df = pd.read_csv('sp500_data.csv')\n",
    "df = df[list(set(df.columns) & set(comp_df['Symbol']))]\n",
    "df = df.dropna(axis = 1, how = 'any')\n",
    "\n",
    "Y = np.array(df).T\n",
    "comps = df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = len(comps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# true ranking of data \n",
    "hier_df = pd.DataFrame()\n",
    "hier_df['comp'] = comps\n",
    "hier_df['ind'] = [industry_dict[i] for i in comps]\n",
    "hier_df['sub_ind'] = [sub_industry_dict[i] for i in comps]\n",
    "\n",
    "id_2cats = {i:list(hier_df[['ind','sub_ind']].iloc[i]) for i in range(n)}\n",
    "pairs = np.array(list(itertools.combinations(list(range(n)), 2)))\n",
    "n_inter =  [len(list(set(id_2cats[pairs[i][0]]) & set(id_2cats[pairs[i][1]]))) for i in range(pairs.shape[0])]\n",
    "\n",
    "upper = np.zeros((n, n))\n",
    "upper[np.triu_indices(n, 1)] = np.max(n_inter) - n_inter\n",
    "true_ranking = upper + upper.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = np.array(df).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "368 1259\n"
     ]
    }
   ],
   "source": [
    "(n,p) = Y.shape\n",
    "print(n,p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "## dimension selection\n",
    "# rmin = 1\n",
    "# rmax = 50\n",
    "# ws = wasserstein_dim_select(Y,rmin = rmin, rmax = rmax)\n",
    "# dim = np.argmin(ws) + rmin\n",
    "# print(f'Dimension selected: {dim}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "dim = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "zeta = p**-.5 * pc_scores(Y, dim)\n",
    "zeta = np.array(zeta)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# method comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## dot product"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 446 ms, sys: 500 ms, total: 946 ms\n",
      "Wall time: 342 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity=<function ip_affinity at 0x7fdab08ed430>,\n",
       "                        distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)\n",
    "ip_clust.fit(zeta);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot_dendrogram(ward,labels = list(industry_list), size = (10,20), orientation='left',truncate_mode=\"level\",color_threshold = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 608 ms, sys: 5.19 ms, total: 613 ms\n",
      "Wall time: 601 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.357447079797884"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])\n",
    "\n",
    "ip_kt_z = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]\n",
    "np.mean(ip_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 377 ms, sys: 0 ns, total: 377 ms\n",
      "Wall time: 376 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity=<function ip_affinity at 0x7fdab08ed430>,\n",
       "                        distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ip_clust = AgglomerativeClustering(affinity = ip_affinity, linkage = 'average',distance_threshold=0, n_clusters=None)\n",
    "ip_clust.fit(Y);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 548 ms, sys: 1.98 ms, total: 550 ms\n",
      "Wall time: 540 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3378807319669298"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "ip_ranking = np.array([get_ranking(ip_clust,t) for t in range(n)])\n",
    "\n",
    "ip_kt_y = [kendalltau(ip_ranking[i], true_ranking[i]).correlation for i in range(ip_ranking.shape[0])]\n",
    "np.mean(ip_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ward"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.22 ms, sys: 0 ns, total: 3.22 ms\n",
      "Wall time: 2.67 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, n_clusters=None)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ward = AgglomerativeClustering(linkage=\"ward\", distance_threshold=0, n_clusters = None)\n",
    "ward.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 469 ms, sys: 1.71 ms, total: 471 ms\n",
      "Wall time: 460 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.391970663145865"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "w_ranking = np.array([get_ranking(ward,t) for t in range(n)])\n",
    "\n",
    "w_kt_z = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]\n",
    "np.mean(w_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 65.9 ms, sys: 750 µs, total: 66.7 ms\n",
      "Wall time: 65.4 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, n_clusters=None)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ward = AgglomerativeClustering(linkage=\"ward\", distance_threshold=0, n_clusters = None)\n",
    "ward.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 481 ms, sys: 0 ns, total: 481 ms\n",
      "Wall time: 477 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.347953606262907"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "w_ranking = np.array([get_ranking(ward,t) for t in range(n)])\n",
    "\n",
    "w_kt_y = [kendalltau(w_ranking[i], true_ranking[i]).correlation for i in range(w_ranking.shape[0])]\n",
    "np.mean(w_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## UPGMA"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.44 ms, sys: 755 µs, total: 3.2 ms\n",
      "Wall time: 2.63 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "average = AgglomerativeClustering(linkage=\"average\", distance_threshold=0, n_clusters = None)\n",
    "average.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 485 ms, sys: 6.4 ms, total: 491 ms\n",
      "Wall time: 480 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.390738922457119"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "a_ranking = np.array([get_ranking(average,t) for t in range(n)])\n",
    "\n",
    "a_kt_z = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]\n",
    "np.mean(a_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 52.7 ms, sys: 159 µs, total: 52.9 ms\n",
      "Wall time: 51.9 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(distance_threshold=0, linkage='average',\n",
       "                        n_clusters=None)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "average = AgglomerativeClustering(linkage=\"average\", distance_threshold=0, n_clusters = None)\n",
    "average.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 561 ms, sys: 6.08 ms, total: 568 ms\n",
      "Wall time: 554 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3444095037157435"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "a_ranking = np.array([get_ranking(average,t) for t in range(n)])\n",
    "\n",
    "a_kt_y = [kendalltau(a_ranking[i], true_ranking[i]).correlation for i in range(a_ranking.shape[0])]\n",
    "np.mean(a_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cosine "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.69 ms, sys: 0 ns, total: 3.69 ms\n",
      "Wall time: 3.02 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity='cosine', distance_threshold=0,\n",
       "                        linkage='average', n_clusters=None)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)\n",
    "cosine.fit(zeta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 481 ms, sys: 788 µs, total: 482 ms\n",
      "Wall time: 472 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.41507580466588084"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])\n",
    "\n",
    "cs_kt_z = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]\n",
    "np.mean(cs_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 61.1 ms, sys: 211 µs, total: 61.3 ms\n",
      "Wall time: 60.3 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AgglomerativeClustering(affinity='cosine', distance_threshold=0,\n",
       "                        linkage='average', n_clusters=None)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "cosine = AgglomerativeClustering(affinity = 'cosine', linkage = 'average', distance_threshold=0, n_clusters = None)\n",
    "cosine.fit(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 548 ms, sys: 5.2 ms, total: 553 ms\n",
      "Wall time: 540 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3378807319669298"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc  = {}\n",
    "cs_ranking = np.array([get_ranking(cosine,t) for t in range(n)])\n",
    "\n",
    "cs_kt_y = [kendalltau(cs_ranking[i], true_ranking[i]).correlation for i in range(cs_ranking.shape[0])]\n",
    "np.mean(cs_kt_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df['ip_z'] = [np.mean(ip_kt_z),sem(ip_kt_z)]\n",
    "method_comparison_df['ip_y'] = [np.mean(ip_kt_y),sem(ip_kt_y)]\n",
    "\n",
    "method_comparison_df['w_z'] = [np.mean(w_kt_z),sem(w_kt_z)]\n",
    "method_comparison_df['w_y'] = [np.mean(w_kt_y),sem(w_kt_y)]\n",
    "\n",
    "method_comparison_df['a_z'] = [np.mean(a_kt_z),sem(a_kt_z)]\n",
    "method_comparison_df['a_y'] = [np.mean(a_kt_y),sem(a_kt_y)]\n",
    "\n",
    "method_comparison_df['cs_z'] = [np.mean(cs_kt_z),sem(cs_kt_z)]\n",
    "method_comparison_df['cs_y'] = [np.mean(cs_kt_y),sem(cs_kt_y)]\n",
    "\n",
    "# method_comparison_df.to_csv('method_comparison_df.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## hdbscan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PCA "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)\n",
    "hdbscan_.fit(zeta)\n",
    "\n",
    "hdbscan_.single_linkage_tree_;\n",
    "hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 734 ms, sys: 0 ns, total: 734 ms\n",
      "Wall time: 731 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.3317329944737204"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) \n",
    "\n",
    "hdbscan_kt_z = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]\n",
    "np.mean(hdbscan_kt_z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=2)\n",
    "hdbscan_.fit(Y)\n",
    "\n",
    "hdbscan_.single_linkage_tree_;\n",
    "hdbscan_.children_ = np.array(hdbscan_.single_linkage_tree_.to_pandas()[['left_child','right_child']], dtype=int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.15 s, sys: 7.66 ms, total: 1.16 s\n",
      "Wall time: 1.16 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.1381112863212358"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ances = {}; desc = {}\n",
    "hdbscan_ranking = np.array([get_ranking(hdbscan_,t) for t in range(n)]) # range(n)\n",
    "\n",
    "hdbscan_kt_y = [kendalltau(hdbscan_ranking[i], true_ranking[i]).correlation for i in range(hdbscan_ranking.shape[0])]\n",
    "np.mean(hdbscan_kt_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df['hdbscan_z'] = [np.mean(hdbscan_kt_z),sem(hdbscan_kt_z)]\n",
    "method_comparison_df['hdbscan_y'] = [np.mean(hdbscan_kt_y),sem(hdbscan_kt_y)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# method_comparison_df.to_csv('method_comparison_df.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Appendix method comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "linkage = ['complete','single']\n",
    "metric = ['euclidean', 'cosine']\n",
    "combs = list(itertools.product(linkage, metric))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df_v2 = pd.DataFrame(columns = ['linkage','metric',\n",
    "                                                  'zeta_mean','zeta_se','Y_mean','Y_se'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(combs)):\n",
    "    print(combs[i])\n",
    "    \n",
    "    on_zeta = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)\n",
    "    on_zeta.fit(zeta);\n",
    "    ances = {}; desc = {}\n",
    "    on_zeta_ranking = np.array([get_ranking(on_zeta,t) for t in range(n)])\n",
    "    kt_z = [kendalltau(on_zeta_ranking[i], true_ranking[i]).correlation for i in range(on_zeta_ranking.shape[0])]\n",
    "    \n",
    "    on_Y = AgglomerativeClustering(affinity = combs[i][1], linkage = combs[i][0],distance_threshold=0, n_clusters=None)\n",
    "    on_Y.fit(Y);\n",
    "    ances = {}; desc = {}\n",
    "    on_Y_ranking = np.array([get_ranking(on_Y,t) for t in range(n)])\n",
    "    kt_Y = [kendalltau(on_Y_ranking[i], true_ranking[i]).correlation for i in range(on_Y_ranking.shape[0])]\n",
    "    \n",
    "    \n",
    "    new_row = {'linkage': combs[i][0],'metric':combs[i][1],\n",
    "               'zeta_mean': np.mean(kt_z),'zeta_se': sem(kt_z),\n",
    "               'Y_mean': np.mean(kt_Y),'Y_se': sem(kt_Y)}\n",
    "    method_comparison_df_v2 = pd.concat([method_comparison_df_v2,pd.DataFrame(new_row, index=[0])]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "method_comparison_df_v2.to_csv('method_comparison_v2.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
