{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "various analysis\n",
    "\"\"\"\n",
    "# pylint: disable=anomalous-backslash-in-string\n",
    "# pylint: disable=invalid-name\n",
    "# pylint: disable=import-error\n",
    "# pylint: disable=missing-function-docstring\n",
    "import os\n",
    "import sys\n",
    "sys.path.extend([\"../\"]) # pylint: disable=wrong-import-position\n",
    "import random\n",
    "from time import time\n",
    "import warnings\n",
    "import pickle\n",
    "import datetime\n",
    "import socket\n",
    "import glob\n",
    "import random\n",
    "\n",
    "import shutil\n",
    "import yaml\n",
    "import numpy as np\n",
    "import torch\n",
    "from copy import deepcopy\n",
    "\n",
    "# import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import linear_model\n",
    "import scipy.sparse as sp\n",
    "from sklearn.preprocessing import normalize\n",
    "from collections import defaultdict\n",
    "from dgl.dataloading import MultiLayerFullNeighborSampler\n",
    "\n",
    "import ppr\n",
    "from data_utils import *\n",
    "from graph_dict import *\n",
    "from utils import *\n",
    "from plotlib import *\n",
    "from script_utils import load_SDMP, confHolder\n",
    "from path_dict import path_dict, name_model_pool\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "os.environ[\"CURL_CA_BUNDLE\"] = \"\"\n",
    "DEVICE = 'cuda:1'\n",
    "\n",
    "%load_ext line_profiler\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "# %lprun -f\n",
    "# Parameter settings and data loading\n",
    "DATA_ROOT_FOLDER = \"../dataset\"\n",
    "CONF_ROOT_FOLDER = \"../config\"\n",
    "RES_ROOT_FOLDER = \"../result\"\n",
    "TARGET_ROOT_FOLDER = \"../result/baselines\"\n",
    "\n",
    "conf_dict = defaultdict(dict)\n",
    "\n",
    "conf_dict[\"cora\"] = {\n",
    "    \"dense_k\":3,\n",
    "    \"theta_normed_values_xlim\": [0, 0.15]\n",
    "    }\n",
    "conf_dict[\"pubmed\"] = {\n",
    "    \"dense_k\":3,\n",
    "    \"theta_normed_values_xlim\": [0, 0.20]\n",
    "    }\n",
    "conf_dict[\"citeseer\"] = {\n",
    "    \"dense_k\":3,\n",
    "    \"theta_normed_values_xlim\": [0, 0.15]\n",
    "    }\n",
    "conf_dict[\"a-computer\"] = {\n",
    "    \"dense_k\":3,\n",
    "    \"theta_normed_values_xlim\": [0, 0.15]\n",
    "    }\n",
    "conf_dict[\"a-photo\"] = {\n",
    "    \"dense_k\":3,\n",
    "    \"theta_normed_values_xlim\": [0, 0.15]\n",
    "    }\n",
    "conf_dict[\"ogbn-arxiv\"] = {\n",
    "    \"dense_k\":3,\n",
    "    \"theta_normed_values_xlim\": [0, 0.15]\n",
    "    }\n",
    "conf_dict[\"ogbn-products\"] = {\n",
    "    \"dense_k\":2,\n",
    "    \"theta_normed_values_xlim\": [0, 0.15]\n",
    "    }\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_paths(name, model):\n",
    "    model_paths = path_dict[name, model][\"result_root_parent_folder\"]\n",
    "    return next(glob.iglob(f\"{model_paths}/*\"))\n",
    "\n",
    "# name, model = \"cora\", \"SAGE\"\n",
    "# name, model = \"cora\", \"geomGCN\"\n",
    "# name, model = \"citeseer\", \"SAGE\"\n",
    "# name, model = \"citeseer\", \"geomGCN\"\n",
    "# name, model = \"pubmed\", \"SAGE\"\n",
    "# name, model = \"pubmed\", \"geomGCN\"\n",
    "# name, model = \"a-computer\", \"SAGE\"\n",
    "# name, model = \"a-computer\", \"exphormer\"\n",
    "# name, model = \"a-photo\", \"SAGE\"\n",
    "# name, model = \"a-photo\", \"exphormer\"\n",
    "# name, model = \"ogbn-arxiv\", \"SAGE\"\n",
    "# name, model = \"ogbn-arxiv\", \"DRGAT\"\n",
    "# name, model = \"ogbn-products\", \"SAGE\"\n",
    "name, model = \"ogbn-products\", \"RevGNN-112\"\n",
    "\n",
    "# get cofigs and load data\n",
    "seed = 0\n",
    "cur_conf = conf_dict[name]\n",
    "\n",
    "TARGET_GNN_FOLDER = os.path.join(TARGET_ROOT_FOLDER, name, model, f\"seed_{seed}\")\n",
    "result_root_path = get_paths(name, model)\n",
    "preprocesser, sdmp, g, sdmp_conf, train_conf = load_SDMP(result_root_path,\n",
    "                                                         TARGET_GNN_FOLDER,\n",
    "                                                         DATA_ROOT_FOLDER,\n",
    "                                                         device=DEVICE)\n",
    "Theta = sdmp.ThetaT.tocsr()\n",
    "print(len(Theta.data))\n",
    "Theta.data[Theta.data<0] = 0\n",
    "Theta.eliminate_zeros()\n",
    "print(len(Theta.data), Theta.nnz, np.sum(Theta.data<0), np.sum(Theta.data>0), np.sum(Theta.data==0))\n",
    "\n",
    "# if size is too large, get sampling here\n",
    "num_nodes = Theta.shape[0]\n",
    "cap_size = 10000\n",
    "\n",
    "if num_nodes > cap_size:\n",
    "    print(f\"Graph too large, sampled {cap_size} nodes for estimation!!!\")\n",
    "    sampled_nodes_idx = random.sample(list(range(num_nodes)), cap_size)\n",
    "    Theta = Theta[sampled_nodes_idx, :]\n",
    "# Get A power\n",
    "dense_neigh = preprocesser.get_A_pow_local_indices(cur_conf[\"dense_k\"], sampled_nodes_idx)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Get PPRGo \n",
    "A = torch_sparse_to_scipy_coo(g.adj()).tocsr()\n",
    "if num_nodes > cap_size:\n",
    "    ppr_matrix = ppr.topk_ppr_matrix(adj_matrix=A,\n",
    "                                     alpha=0.5,\n",
    "                                     eps=1e-4,\n",
    "                                     idx=sampled_nodes_idx,\n",
    "                                     topk=[10000] * cap_size)\n",
    "\n",
    "else:\n",
    "    ppr_matrix = ppr.topk_ppr_matrix(adj_matrix=A,\n",
    "                                     alpha=0.5,\n",
    "                                     eps=1e-4,\n",
    "                                     idx=list(range(A.shape[0])),\n",
    "                                     topk=[A.shape[0]] * A.shape[0])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "############\n",
    "# Figure plotting\n",
    "\n",
    "\n",
    "fig_path = \"../figures/theta_analysis\"\n",
    "\n",
    "if not os.path.exists(fig_path):\n",
    "    os.makedirs(fig_path)\n",
    "# row statisitcs\n",
    "# prepare data\n",
    "\n",
    "\n",
    "########################\n",
    "theta_cnt_style = defaultdict(dict)\n",
    "theta_cnt_style[(\"ogbn-products\", \"RevGNN-112\")][\"show_legend\"] = False\n",
    "\n",
    "style = theta_cnt_style[(name, model)]\n",
    "\n",
    "cnt_nonzeros = Theta.indptr[1:] - Theta.indptr[0:-1]\n",
    "cnt_nonzeros_raw = dense_neigh.indptr[1:] - dense_neigh.indptr[0:-1]\n",
    "cnt_nonzeros_ppr = ppr_matrix.indptr[1:] - ppr_matrix.indptr[0:-1]\n",
    "\n",
    "x_sdmp, y_sdmp = prepare_ECDF(cnt_nonzeros, num=50)\n",
    "x_ori, y_ori = prepare_ECDF(cnt_nonzeros_raw, num=5000)\n",
    "x_ppr, y_ppr = prepare_ECDF(cnt_nonzeros_ppr, num=5000)\n",
    "\n",
    "plot_curves([x_sdmp, x_ori, x_ppr], \n",
    "            [y_sdmp, y_ori, y_ppr],\n",
    "            [\"SDGNN\", \"SGC\", \"PPRGo\"],\n",
    "            figure_path=os.path.join(fig_path, f\"{name}_{model}_Theta_row_nonzero_cnt_cdf.pdf\"),\n",
    "            styles=[\"solid\", \"dashed\", \"dashdot\"],\n",
    "            # x_label='Number of Receptive Nodes'+fig_conf_dict_receptive[(name, model)][\"x_label_extra\"],\n",
    "            x_label='# Receptive Nodes',\n",
    "            y_label='CDF',\n",
    "            figsize=[6, 6],\n",
    "            x_log=True,\n",
    "            # xlim=[-1000, 1200],\n",
    "            # y_log=True,\n",
    "            widths=[4,4,4],\n",
    "            fontsize=32,\n",
    "            show_legend=style.get(\"show_legend\", False)\n",
    "            )\n",
    "\n",
    "\n",
    "normed_Theta = normalize(Theta, norm='l1', axis=1)\n",
    "normed_dense_neigh = normalize(dense_neigh, norm='l1', axis=1)\n",
    "normed_ppr_matrix = normalize(ppr_matrix, norm='l1', axis=1)\n",
    "\n",
    "\n",
    "x_sdmp, y_sdmp = prepare_ECDF(random.sample(normed_Theta.data.tolist(), min(100000, len(normed_Theta.data))))\n",
    "\n",
    "\n",
    "# all_cnt = len(normed_dense_neigh.data)\n",
    "# idx = np.arange(all_cnt)\n",
    "# np.random.shuffle(idx)\n",
    "# reduced_idx = idx[:100000]\n",
    "# x_ori, y_ori = prepare_ECDF(normed_dense_neigh.data[reduced_idx])\n",
    "x_ori, y_ori = prepare_ECDF(random.sample(normed_dense_neigh.data.tolist(), min(100000, len(normed_dense_neigh.data))))\n",
    "\n",
    "# all_cnt = len(normed_ppr_matrix.data)\n",
    "# idx = np.arange(all_cnt)\n",
    "# np.random.shuffle(idx)\n",
    "# reduced_idx = idx[:100000]\n",
    "# x_ppr, y_ppr = prepare_ECDF(normed_ppr_matrix.data[reduced_idx])\n",
    "x_ppr, y_ppr = prepare_ECDF(random.sample(normed_ppr_matrix.data.tolist(), min(100000, len(normed_ppr_matrix.data))))\n",
    "\n",
    "plot_curves([x_sdmp, x_ori, x_ppr], \n",
    "            [y_sdmp, y_ori, y_ppr],\n",
    "            [\"SDGNN\", \"SGC\", \"PPRGo\"],\n",
    "            figure_path=os.path.join(fig_path, f\"{name}_{model}_Theta_direct_cdf.pdf\"),\n",
    "            styles=[\"solid\", \"dashed\", \"dashdot\"],\n",
    "            x_label='Theta Values',\n",
    "            y_label='CDF',\n",
    "            figsize=[6, 6],\n",
    "            xlim=cur_conf[\"theta_normed_values_xlim\"],\n",
    "            widths=[4,4,4],\n",
    "            fontsize=32)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gnn",
   "language": "python",
   "name": "gnn"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  },
  "toc-autonumbering": false,
  "toc-showmarkdowntxt": true
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
