{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%env CUDA_VISIBLE_DEVICES=1\n",
    "import matplotlib.pyplot as plt\n",
    "from utils.utils import get_path\n",
    "from utils.fig_utils import full_dist_to_print, dataset_to_print, dist_to_color, dist_to_print\n",
    "from utils.io_utils import load_multiple_res\n",
    "from utils.pd_utils import compute_outlier_scores, filter_dgms\n",
    "import os\n",
    "import numpy as np"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "fb61a894ae85b5b5"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "style_file = \"utils.style\"\n",
    "plt.style.use(style_file)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "d26c39f8f4755590"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "root_path = get_path(\"data\")\n",
    "fig_path = os.path.join(root_path, \"figures\")"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "fc6dd98647f36153"
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Fig datasets"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "8655d28d0681bbba"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "datasets = [\"toy_circle\", \"inter_circles\", \"eyeglasses\", \"torus\", \"toy_sphere\"]\n",
    "\n",
    "embd_dim = 50\n",
    "sigmas = np.linspace(0.0, 0.35, 29)\n",
    "sigmas = np.array([np.format_float_positional(sigma, precision=4, unique=True, trim='0') for sigma in sigmas]).astype(float)\n",
    "seeds = [0, 1, 2]\n",
    "n = 1000\n",
    "\n",
    "# different datas\n",
    "sigmas_per_dataset = {}\n",
    "for dataset in datasets:\n",
    "    if dataset == \"eyeglasses\":\n",
    "        sigmas_per_dataset[dataset] = sigmas[:13]\n",
    "    else:\n",
    "        sigmas_per_dataset[dataset] = sigmas"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "fde7e290255cc723"
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Find best hyperparameters"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "92c614a3b245e0fd"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# The best hyperparameters are hard-coded below and do not have to be reselected, unless the set of methods and possible hyperparameters is changed."
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "1156e7817d6a36fd"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "distances = {\n",
    "    \"euclidean\": [{}],\n",
    "    \"fermat\": [\n",
    "               {\"p\": 2},\n",
    "               {\"p\": 3},\n",
    "               {\"p\": 5},\n",
    "               {\"p\": 7}\n",
    "               ],\n",
    "    \"dtm\": [\n",
    "            {\"k\": 4, \"p_dtm\": 2, \"p_radius\": 1},\n",
    "            {\"k\": 4, \"p_dtm\": np.inf, \"p_radius\": 1},\n",
    "            {\"k\": 15, \"p_dtm\": 2, \"p_radius\": 1},\n",
    "            {\"k\": 15, \"p_dtm\": np.inf, \"p_radius\": 1},\n",
    "            {\"k\": 100, \"p_dtm\": 2, \"p_radius\": 1},\n",
    "            {\"k\": 100, \"p_dtm\": np.inf, \"p_radius\": 1},\n",
    "\n",
    "            {\"k\": 4, \"p_dtm\": 2, \"p_radius\": 2},\n",
    "            {\"k\": 4, \"p_dtm\": np.inf, \"p_radius\": 2},\n",
    "            {\"k\": 15, \"p_dtm\": 2, \"p_radius\": 2},\n",
    "            {\"k\": 15, \"p_dtm\": np.inf, \"p_radius\": 2},\n",
    "            {\"k\": 100, \"p_dtm\": 2, \"p_radius\": 2},\n",
    "            {\"k\": 100, \"p_dtm\": np.inf, \"p_radius\": 2},\n",
    "\n",
    "            {\"k\": 4, \"p_dtm\": 2, \"p_radius\": np.inf},\n",
    "            {\"k\": 4, \"p_dtm\": np.inf, \"p_radius\": np.inf},\n",
    "            {\"k\": 15, \"p_dtm\": 2, \"p_radius\": np.inf},\n",
    "            {\"k\": 15, \"p_dtm\": np.inf, \"p_radius\": np.inf},\n",
    "            {\"k\": 100, \"p_dtm\": 2, \"p_radius\": np.inf},\n",
    "            {\"k\": 100, \"p_dtm\": np.inf, \"p_radius\": np.inf},\n",
    "    ],\n",
    "    \"eff_res\": [\n",
    "        {\"corrected\": True, \"weighted\": False, \"k\": 15, \"disconnect\": True},\n",
    "        {\"corrected\": True, \"weighted\": False, \"k\": 100, \"disconnect\": True,}\n",
    "    ],\n",
    "    \"diffusion\": [\n",
    "        {\"k\": 15, \"t\": 8, \"kernel\": \"sknn\", \"include_self\": False},\n",
    "        {\"k\": 100, \"t\": 8, \"kernel\": \"sknn\", \"include_self\": False},\n",
    "        {\"k\": 15, \"t\": 64, \"kernel\": \"sknn\", \"include_self\": False},\n",
    "        {\"k\": 100, \"t\": 64, \"kernel\": \"sknn\", \"include_self\": False},\n",
    "    ],\n",
    "}\n"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "8cb0e1d72df73df"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "#load data\n",
    "all_res = {}\n",
    "for dataset in datasets:\n",
    "    all_res[dataset] = load_multiple_res(datasets=dataset,\n",
    "                                         distances=distances,\n",
    "                                         root_path=get_path(\"data\"),\n",
    "                                         n=n,\n",
    "                                         seeds=seeds,\n",
    "                                         sigmas=sigmas_per_dataset[dataset],\n",
    "                                         embd_dims=embd_dim,\n",
    "                                         n_threads=10)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "d6eb5cec3f152778"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# filtering\n",
    "dob = 1.25\n",
    "all_res = filter_dgms(dgms=all_res, dob=dob, dim=1, binary=True)\n",
    "all_res = filter_dgms(dgms=all_res, dob=dob, dim=2, binary=True)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "d17dbb97933e5ea3"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# compute the detection score\n",
    "cycles_per_dataset = {\n",
    "    1: {\n",
    "        \"toy_circle\": 1,\n",
    "        \"inter_circles\": 2,\n",
    "        \"eyeglasses\": 1,\n",
    "        \"torus\": 2,\n",
    "        \"toy_sphere\": 0,  # so that the curves are high for AUC\n",
    "    },\n",
    "    2: {\n",
    "        \"toy_sphere\": 1,\n",
    "        \"torus\": 1 \n",
    "    }\n",
    "}\n",
    "\n",
    "outlier_scores_1d = {dataset: compute_outlier_scores(all_res[dataset], \n",
    "                                                     n_features=cycles_per_dataset[1][dataset],\n",
    "                                                     dim=1)\n",
    "                     for dataset in cycles_per_dataset[1]}\n",
    "outlier_scores_2d = {dataset: compute_outlier_scores(all_res[dataset],\n",
    "                                                     n_features=cycles_per_dataset[2][dataset],\n",
    "                                                     dim=2)\n",
    "                     for dataset in cycles_per_dataset[2]}\n",
    "\n",
    "outlier_scores = {\n",
    "    1: outlier_scores_1d,\n",
    "    2: outlier_scores_2d\n",
    "}"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "842c8b02572dcea3"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# compute area under the curves\n",
    "aucs = {}\n",
    "\n",
    "for feature_dim in outlier_scores:\n",
    "    auc_by_feature_dim = {}\n",
    "    for dataset in outlier_scores[feature_dim]:\n",
    "        auc_by_dataset = {}\n",
    "        for dist in outlier_scores[feature_dim][dataset]:\n",
    "            auc_per_method = {}\n",
    "            for full_dist in outlier_scores[feature_dim][dataset][dist]:\n",
    "                auc_per_method[full_dist] = outlier_scores[feature_dim][dataset][dist][full_dist].mean(1).sum() / len(sigmas_per_dataset[dataset])\n",
    "            auc_by_dataset[dist] = auc_per_method\n",
    "        auc_by_feature_dim[dataset] = auc_by_dataset\n",
    "    aucs[feature_dim] = auc_by_feature_dim"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e6ce9884e9e919c8"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Each panel selects the best method individually. This means different feature dimensions for torus and sphere can select different hyperparameters.\n",
    "\n",
    "best_method_per_dataset = {}\n",
    "for feature_dim in aucs:\n",
    "    best_method_per_feature_dim = {}\n",
    "\n",
    "    for dataset in aucs[feature_dim]:\n",
    "        best_method_this_dataset = {}\n",
    "        for dist in aucs[feature_dim][dataset]:\n",
    "            best_full_dist = None\n",
    "            best_auc = 0\n",
    "            for full_dist in aucs[feature_dim][dataset][dist]:\n",
    "                auc = aucs[feature_dim][dataset][dist][full_dist]\n",
    "                if auc > best_auc:\n",
    "                    best_auc = auc\n",
    "                    best_full_dist = full_dist\n",
    "            best_method_this_dataset[dist] = best_full_dist\n",
    "        best_method_per_feature_dim[dataset] = best_method_this_dataset\n",
    "    best_method_per_dataset[feature_dim] = best_method_per_feature_dim"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "f9c69e38e69e2db9"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# print best hyperparameter settings \n",
    "for feature_dim in best_method_per_dataset:\n",
    "    print(\"~\"*40)\n",
    "    print(f\"Topological dimension: {feature_dim}\")\n",
    "    for dataset in best_method_per_dataset[feature_dim]:\n",
    "        for dist in best_method_per_dataset[feature_dim][dataset]:\n",
    "            print(dataset_to_print[dataset], full_dist_to_print[best_method_per_dataset[feature_dim][dataset][dist]].replace(\"\\n\", \" \"))\n",
    "        print(\"\\n\")\n"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "52835b205bc53f99"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "### preselected best hyperparameters"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "f6347a21922bd31e"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# This can be run without loading all hyperparameter settings from above. The following dictionary contains the best hyperparameters for each dataset and distance as kwargs instead of as string, so that we can use the normal method for loading results.\n",
    "best_method_per_dataset_dict = {\n",
    "    1: {'toy_circle': \n",
    "            {'euclidean': [{}], \n",
    "             'fermat': [{\"p\": 3}], \n",
    "             'dtm': [{\"k\": 4, \"p_dtm\": 2, \"p_radius\": 1},], \n",
    "             'eff_res': [{\"corrected\": True, \"weighted\": False, \"k\": 100, \"disconnect\": True,}], \n",
    "             'diffusion': [{\"k\": 100, \"t\": 8, \"kernel\": \"sknn\", \"include_self\": False}], \n",
    "             },\n",
    "        'inter_circles': \n",
    "            {'euclidean': [{}], \n",
    "             'fermat': [{\"p\": 7}], \n",
    "             'dtm': [{\"k\": 15, \"p_dtm\": np.inf, \"p_radius\": 1},],\n",
    "             'eff_res': [{\"corrected\": True, \"weighted\": False, \"k\": 15, \"disconnect\": True,}], \n",
    "             'diffusion': [{\"k\": 15, \"t\": 8, \"kernel\": \"sknn\", \"include_self\": False}],\n",
    "             },\n",
    "        'eyeglasses': \n",
    "            {'euclidean': [{}],\n",
    "             'fermat': [{\"p\": 7}],\n",
    "             'dtm': [{\"k\": 100, \"p_dtm\": 2, \"p_radius\": 1},], \n",
    "             'eff_res': [{\"corrected\": True, \"weighted\": False, \"k\": 15, \"disconnect\": True,}],\n",
    "             'diffusion':  [{\"k\": 15, \"t\": 64, \"kernel\": \"sknn\", \"include_self\": False}], \n",
    "             },\n",
    "        'torus': \n",
    "            {'euclidean': [{}], \n",
    "             'fermat': [{\"p\": 2}], \n",
    "             'dtm': [{\"k\": 4, \"p_dtm\": 2, \"p_radius\": np.inf},], \n",
    "             'eff_res': [{\"corrected\": True, \"weighted\": False, \"k\": 100, \"disconnect\": True,}],\n",
    "             'diffusion': [{\"k\": 15, \"t\": 8, \"kernel\": \"sknn\", \"include_self\": False}],\n",
    "             },\n",
    "        'toy_sphere':\n",
    "            {\n",
    "                \"euclidean\" : [{}],\n",
    "                \"fermat\" : [{\"p\": 2}],\n",
    "                \"dtm\" : [{\"k\": 100, \"p_dtm\": 2, \"p_radius\": 1}],\n",
    "                \"eff_res\" : [{\"corrected\": True, \"weighted\": False, \"k\": 15, \"disconnect\": True}],\n",
    "                \"diffusion\" : [{\"k\": 100, \"t\": 64, \"kernel\": \"sknn\", \"include_self\": False}],\n",
    "            }\n",
    "        },\n",
    "    2: {'torus': \n",
    "            {'euclidean': [{}],\n",
    "             'fermat': [{\"p\": 2}], \n",
    "             'dtm': [{\"k\": 4, \"p_dtm\": 2, \"p_radius\": np.inf},], \n",
    "             'eff_res': [{\"corrected\": True, \"weighted\": False, \"k\": 100, \"disconnect\": True,}],\n",
    "             'diffusion': [{\"k\": 15, \"t\": 8, \"kernel\": \"sknn\", \"include_self\": False}]},\n",
    "        'toy_sphere':\n",
    "            {\n",
    "                \"euclidean\" : [{}],\n",
    "                \"fermat\" : [{\"p\": 2}],\n",
    "                \"dtm\" : [{\"k\": 4, \"p_dtm\": 2, \"p_radius\": 1}],\n",
    "                \"eff_res\" : [{\"corrected\": True, \"weighted\": False, \"k\": 100, \"disconnect\": True}],\n",
    "                \"diffusion\" : [{\"k\": 100, \"t\": 8, \"kernel\": \"sknn\", \"include_self\": False}],\n",
    "            }\n",
    "        }\n",
    "}"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "11f9cdab8f720554"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "selected_res = {}\n",
    "for feature_dim in best_method_per_dataset_dict:\n",
    "    sel_res_dim = {}\n",
    "    for dataset in best_method_per_dataset_dict[feature_dim]:\n",
    "        sel_res_dim[dataset] = load_multiple_res(datasets=dataset, \n",
    "                                                  distances=best_method_per_dataset_dict[feature_dim][dataset],\n",
    "                                                  root_path=root_path,\n",
    "                                                  n=n,\n",
    "                                                  seeds=seeds,\n",
    "                                                  sigmas=sigmas_per_dataset[dataset],\n",
    "                                                  embd_dims=embd_dim, \n",
    "                                                  n_threads=10)\n",
    "    selected_res[feature_dim] = sel_res_dim"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "699be6d224556595"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "dob = 1.25\n",
    "for feature_dim in selected_res:\n",
    "    selected_res[feature_dim] = filter_dgms(dgms=selected_res[feature_dim], dob=dob, binary=True, dim=feature_dim)"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "36bf759056a519ee"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# compute the detection scores for each method\n",
    "cycles_per_dataset = {\n",
    "    1: {\n",
    "        \"toy_circle\": 1,\n",
    "        \"eyeglasses\": 1,\n",
    "        \"toy_sphere\": 1,  # so that the curves are low\n",
    "        \"torus\": 2,\n",
    "        \"inter_circles\": 2,\n",
    "        #\"toy_blob\": 1, # so that the curves are low\n",
    "    },\n",
    "    2: {\n",
    "        \"toy_sphere\": 1,\n",
    "        \"torus\": 1 \n",
    "    }\n",
    "}\n",
    "selected_outlier_scores_1d = {dataset: compute_outlier_scores(selected_res[1][dataset],\n",
    "                                                              n_features=cycles_per_dataset[1][dataset],\n",
    "                                                              dim=1)\n",
    "                     for dataset in cycles_per_dataset[1]}\n",
    "selected_outlier_scores_2d = {dataset: compute_outlier_scores(selected_res[2][dataset],\n",
    "                                                              n_features=cycles_per_dataset[2][dataset],\n",
    "                                                              dim=2)\n",
    "                     for dataset in cycles_per_dataset[2]}\n",
    "\n",
    "selected_outlier_scores = {\n",
    "    1: selected_outlier_scores_1d,\n",
    "    2: selected_outlier_scores_2d,\n",
    "}"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ea38fe41b48714d3"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# plot figure\n",
    "npanels = len(cycles_per_dataset[1]) + len(cycles_per_dataset[2]) +1\n",
    "nrows = 2\n",
    "ncols =  int(np.ceil(npanels /2))\n",
    "\n",
    "\n",
    "fig, ax = plt.subplots(ncols=ncols, nrows=nrows, figsize=(5.5, 2.))\n",
    "\n",
    "y_label=\"Detection score\"\n",
    "\n",
    "letters = \"abcdefgh\"\n",
    "\n",
    "shift = 0\n",
    "\n",
    "for i in range(npanels):\n",
    "    feature_dim = 1 if i < len(cycles_per_dataset[1]) else 2\n",
    "    if i == 5: \n",
    "        shift = 1\n",
    "        continue\n",
    "    j = (i-shift) % len(cycles_per_dataset[1])\n",
    "    dataset = list(cycles_per_dataset[feature_dim].keys())[j]\n",
    "    \n",
    "    r, c  = int(i / ncols), i % ncols\n",
    "    \n",
    "    for dist in selected_res[feature_dim][dataset]:\n",
    "        full_dist = list(selected_res[feature_dim][dataset][dist].keys())[0]\n",
    "        mean = selected_outlier_scores[feature_dim][dataset][dist][full_dist].mean(1)\n",
    "        std = selected_outlier_scores[feature_dim][dataset][dist][full_dist].std(1)\n",
    "        \n",
    "        if dist != \"euclidean\":\n",
    "            ax[r, c].plot(sigmas_per_dataset[dataset], \n",
    "                       mean,\n",
    "                       label=dist_to_print[dist],\n",
    "                       color=dist_to_color[dist],\n",
    "                       clip_on=False)\n",
    "            ax[r, c].fill_between(\n",
    "                sigmas_per_dataset[dataset],\n",
    "                mean - std,\n",
    "                mean + std,\n",
    "                alpha=0.2,\n",
    "                color=dist_to_color[dist],\n",
    "                edgecolor=None,\n",
    "            )\n",
    "            \n",
    "        else:\n",
    "            ax[r, c].plot(sigmas_per_dataset[dataset], \n",
    "                          mean,\n",
    "                          label=dist_to_print[dist],\n",
    "                          color=dist_to_color[dist],\n",
    "                          clip_on=False,\n",
    "                          linestyle=\"dashed\")\n",
    "            ax[r, c].fill_between(\n",
    "                sigmas_per_dataset[dataset],\n",
    "                mean - std,\n",
    "                mean + std,\n",
    "                alpha=0.2,\n",
    "                color=dist_to_color[dist],\n",
    "                edgecolor=None,\n",
    "            )       \n",
    "        \n",
    "    ax[r, c].set_ylim(0, 1)\n",
    "    ax[r, c].set_xlim(0, sigmas_per_dataset[dataset].max())\n",
    "    ax[r, c].set_xlabel(\"Noise std $\\sigma$\")\n",
    "    if c==0:\n",
    "        ax[r, c].set_ylabel(y_label)\n",
    "        \n",
    "    if c > 0:\n",
    "        ax[r, c].set_yticklabels([])\n",
    "    if r == 0:\n",
    "        ax[r, c].set_xlabel(\"\")\n",
    "        \n",
    "        \n",
    "    if i == 0 or i==1:\n",
    "        ax[r, c].set_title(dataset_to_print[dataset] + f\" (1 loop)\")\n",
    "    elif i == 2:\n",
    "        ax[r, c].set_title(dataset_to_print[dataset] + f\" (neg. control)\")\n",
    "    elif i==3 or i==4:\n",
    "        ax[r, c].set_title(dataset_to_print[dataset] + f\" (2 loops)\")\n",
    "    elif i==6 or i==7:\n",
    "        ax[r, c].set_title(dataset_to_print[dataset] + f\" (1 void)\")  \n",
    "    \n",
    "    \n",
    "    ax[r, c].set_title(\n",
    "        letters[i],\n",
    "        loc=\"left\",\n",
    "        ha=\"right\",\n",
    "        fontweight=\"bold\",\n",
    ")\n",
    "\n",
    "\n",
    "handles, labels = ax[1, 2].get_legend_handles_labels()\n",
    "ax[1, 1].legend(loc=(0.2, 0.0),\n",
    "                handles=handles,\n",
    "             frameon=False\n",
    "      )\n",
    "ax[1, 1].axis(\"off\")\n",
    "fig.savefig(os.path.join(fig_path, \"fig_datasets_dob.pdf\"))"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "ac561e3e970fc43"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "818f3efb90d05731"
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "conda-env-ph-py",
   "language": "python",
   "display_name": "Python [conda env:ph]"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
