{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explainable Conditional LSTM random walk graph GAN training\n",
    "EUcore-top\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data chunk size:  25571\n"
     ]
    }
   ],
   "source": [
    "from eggen import utils\n",
    "from eggen.eggen_shadow import *\n",
    "\n",
    "import tensorflow as tf\n",
    "import scipy.sparse as sp\n",
    "import numpy as np\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.metrics import roc_auc_score, average_precision_score\n",
    "import time\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import re\n",
    "import pickle\n",
    "\n",
    "## Load data\n",
    "# \"\"\" Entire set of email data \"\"\"\n",
    "data = pd.read_csv('./data/EU-core-join.csv')\n",
    "print(\"Data chunk size: \" ,len(data))\n",
    "\n",
    "#### Dept data (conditions)\n",
    "cond_list = pd.read_csv('./data/raw/email-Eu-core-department-labels.txt', sep=\" \", header=None)\n",
    "cond_list.columns = ['ID', 'DEPT']\n",
    "cond_list = cond_list.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 4, 7, 14, 21]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "############################ Top N departments ############################\n",
    "# The sorted list of biggest groups by count of send only\n",
    "count_series = data.groupby(['RECEIVER DEPT', 'SENDER DEPT']).size()\n",
    "df_count = count_series.to_frame(name = 'size').reset_index()\n",
    "df_count.sort_values(by=['size'], inplace=True, ascending=False)\n",
    "\n",
    "df_countshow = df_count\n",
    "df_countshow.columns = ['Receiver dept', 'Sender dept', 'Email count']\n",
    "df_countshow = df_countshow.reset_index(drop=True)\n",
    "df_countshow.iloc[:10]\n",
    "\n",
    "############################ Top N departments ############################\n",
    "\"\"\" Select top N departments \"\"\"\n",
    "topN_grp = 5\n",
    "dept_list = df_count.iloc[:topN_grp]['Receiver dept'].tolist() #RECEIVER DEPT'].tolist()\n",
    "dept_list.sort()\n",
    "dept_list\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "642\n",
      "Got rid of 642 useless emails! That's 2.51% of the total number of messages in this dataset.\n"
     ]
    }
   ],
   "source": [
    "def no_selfloop(df, x):\n",
    "    \"\"\"Drops rows containing messages without some specified value in the expected locations. \n",
    "    Returns original dataframe without these values. Don't forget to reindex after doing this!!!\"\"\"\n",
    "    rows = []\n",
    "    for ind in range(x):\n",
    "        if (df.iloc[ind]['SENDER'] == df.iloc[ind]['RECEIVER']):\n",
    "            rows.append(ind)\n",
    "    \n",
    "    print(len(rows))\n",
    "    df = df.drop(df.index[rows])\n",
    "    return df\n",
    "\n",
    "#### Clean data\n",
    "x = len(data.index)\n",
    "data = no_selfloop(data, x)\n",
    "data = data.reset_index()\n",
    "print(\"Got rid of {} useless emails! That's {}% of the total number of messages in this dataset.\".format(x - len(data.index), np.round(((x - len(data.index)) / x) * 100, decimals=2)))\n",
    "\n",
    "x = len(data.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5206"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "############################ Top N departments ############################\n",
    "def subset_dept(df, x, dept_list):\n",
    "    \"\"\"Drops rows containing messages without some specified value in the expected locations. \n",
    "    Returns original dataframe without these values. Don't forget to reindex after doing this!!!\"\"\"\n",
    "    rows = []\n",
    "    for ind in range(x):\n",
    "#         # all depts associated with dept list\n",
    "#         if not (df.iloc[ind]['SENDER DEPT'] or df.iloc[ind]['RECEIVER DEPT']) in dept_list:\n",
    "        # only depts in the dept list\n",
    "        if ((data.iloc[ind]['SENDER DEPT'] in dept_list) and (data.iloc[ind]['RECEIVER DEPT'] in dept_list)):\n",
    "            rows.append(ind)\n",
    "\n",
    "#     df = df.drop(df.index[rows])\n",
    "    df = df.iloc[rows]\n",
    "    return df\n",
    "\n",
    "############################ Top N departments ############################\n",
    "# # Selecting some groups. For example, groups 3 and 28\n",
    "# grpone = dept_list[0]\n",
    "# grptwo = dept_list[1]\n",
    "# grpthree = dept_list[2]\n",
    "# grpfour = dept_list[3]\n",
    "# grpfive = dept_list[4]\n",
    "# dept_list = [grpone, grptwo, grpthree]#, grpfour, grpfive] \n",
    "dept_list.sort()\n",
    "grpothers = dept_list[-1]+1\n",
    "\n",
    "data_small = subset_dept(data, x, dept_list)\n",
    "# data = data.reset_index()\n",
    "data_small = data_small.reset_index()\n",
    "len(data_small)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "subset_condlist:  (array([0, 1, 2, 3, 4]), array([ 60, 103,  48,  90,  47]))\n",
      "Number of nodes in G:  348\n",
      "Number of edges in G:  3342\n",
      "Number of selfloops in G:  0\n",
      "Selecting 1 largest connected components\n",
      "n_conds:  5\n"
     ]
    }
   ],
   "source": [
    "import networkx as nx\n",
    "import nxviz as nv\n",
    "\n",
    "G = nx.from_pandas_edgelist(data_small, 'SENDER', 'RECEIVER') #, edge_attr=['SENDER DEPT']) # , 'RECEIVER DEPT'\n",
    "\n",
    "\"\"\" Get the subset of cond_list before reindexing from 0 \"\"\"\n",
    "all_nodes = np.arange(1005)\n",
    "train_nodes = np.array(G.nodes) #(348,)\n",
    "nontrain_nodes = np.setdiff1d(all_nodes, train_nodes) #(657,)\n",
    "nontrain = nontrain_nodes.tolist()\n",
    "\n",
    "subset_condlist = np.delete(cond_list, nontrain, 0)\n",
    "\n",
    "# reindexing the condlist subset\n",
    "subset_condlist[:,0] = np.arange(subset_condlist.shape[0])\n",
    "for i in np.arange(len(dept_list)):\n",
    "    subset_condlist[:,1][subset_condlist[:,1]==dept_list[i]] = i\n",
    "\n",
    "print(\"subset_condlist: \", np.unique(subset_condlist[:,1], return_counts=True))\n",
    "\n",
    "# Relabel nodes indices in G to match the generated indicies\n",
    "G = nx.convert_node_labels_to_integers(G)\n",
    "\n",
    "print(\"Number of nodes in G: \" ,G.number_of_nodes())\n",
    "print(\"Number of edges in G: \" ,G.number_of_edges())\n",
    "print(\"Number of selfloops in G: \" ,G.number_of_selfloops())\n",
    "\n",
    "## Preparing data\n",
    "Adjtraining = nx.adjacency_matrix(G)\n",
    "Adjtraining = sp.csr_matrix(Adjtraining, dtype='float64')\n",
    "_A_obs = Adjtraining\n",
    "_A_obs = _A_obs + _A_obs.T # (597, 597)\n",
    "_A_obs[_A_obs > 1] = 1 # Max value of 1 (597, 597)\n",
    "\n",
    "\"\"\" Reduce input graph to a subgraph where only the nodes in largest n_components are kept. \"\"\" \n",
    "lcc = utils.largest_connected_components(_A_obs) # len(lcc) = 584\n",
    "_A_obs = _A_obs[lcc,:][:,lcc] # (584, 584)\n",
    "_N = _A_obs.shape[0] # 584\n",
    "\n",
    "#### Separate the edges into train, test, validation\n",
    "val_share = 0.1\n",
    "test_share = 0.05\n",
    "seed = 2020 #481516234  \n",
    "\"\"\"\n",
    "Split the edges of the adjacency matrix into train, validation and test edges and randomly samples equal amount of validation and test non-edges. \n",
    "\"\"\"\n",
    "train_ones, val_ones, val_zeros, test_ones, test_zeros = utils.train_val_test_split_adjacency(_A_obs, val_share, test_share, seed, undirected=True, connected=True, asserts=False) \n",
    "\n",
    "## EGGen\n",
    "train_graph = sp.coo_matrix((np.ones(len(train_ones)),(train_ones[:,0], train_ones[:,1]))).tocsr()\n",
    "assert (train_graph.toarray() == train_graph.toarray().T).all()\n",
    "\n",
    "#### Parameters\n",
    "\"\"\" Adjustable parameters for training. \"\"\" \n",
    "# setting GPU id \n",
    "gpu_id = 0\n",
    "# setting the number of nodes\n",
    "_N = _A_obs.shape[0]\n",
    "# setting the length of random walks\n",
    "rw_len = 16 #8 #32 #\n",
    "# setting the training data batch size\n",
    "batch_size = 128 #512 #\n",
    "# getting the number of departments\n",
    "# n_conds=np.unique(data[['SENDER DEPT', 'RECEIVER DEPT']].values).shape[0] #42\n",
    "n_conds=np.unique(data_small[['SENDER DEPT', 'RECEIVER DEPT']].values).shape[0] #5\n",
    "# n_conds=len(dept_list)\n",
    "print(\"n_conds: \", n_conds)\n",
    "sample_batch = 1000 #128 #2048 #256 #512 #1024 #\n",
    "# log_num = 13 #99 #\n",
    "\n",
    "walker = utils.RandomWalker(train_graph, subset_condlist, rw_len, p=1, q=1, batch_size=batch_size, sample_batch=sample_batch)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create our generative model "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "intermediate:  Tensor(\"Generator/Generator.int_1/Tanh:0\", shape=(128, 50), dtype=float32)\n",
      "h:  Tensor(\"Generator/Generator.h_1/Tanh:0\", shape=(128, 50), dtype=float32)\n",
      "c:  Tensor(\"Generator/Generator.c_1/Tanh:0\", shape=(128, 50), dtype=float32)\n",
      "Generator initial_states:  1\n",
      "Initial cond:  Tensor(\"Generator/unstack:0\", shape=(128, 5), dtype=float32)\n"
     ]
    }
   ],
   "source": [
    "l2_gen=1e-7; l2_disc=5e-5 #1e-4 \n",
    "gencond_lay=[10]; gen_lay=[50]; disc_lay=[40] \n",
    "lr_gencond=0.01; lr_gen=0.0003; lr_disc=0.0003 #0.0002\n",
    "gencond_iters=1; gen_iters=1; disc_iters=3\n",
    "discWdown_size=128; genWdown_size=128 \n",
    "\n",
    "eggen = EGGen(_N,\n",
    "rw_len,\n",
    "walk_generator=walker,\n",
    "n_conds=n_conds,\n",
    "condgenerator_layers=gencond_lay,\n",
    "generator_layers=gen_lay,\n",
    "discriminator_layers=disc_lay,\n",
    "W_down_discriminator_size=discWdown_size,\n",
    "W_down_generator_size=genWdown_size,\n",
    "batch_size=batch_size,\n",
    "sample_batch=sample_batch,\n",
    "condition_dim=n_conds,\n",
    "gencond_iters=gencond_iters,\n",
    "gen_iters=gen_iters,\n",
    "disc_iters=disc_iters,\n",
    "wasserstein_penalty=10, #20, #\n",
    "l2_penalty_generator=l2_gen,\n",
    "l2_penalty_discriminator=l2_disc,\n",
    "lr_gencond=lr_gencond,\n",
    "lr_gen=lr_gen,\n",
    "lr_disc=lr_disc,\n",
    "noise_dim=16, #32, #\n",
    "noise_type=\"Uniform\", #\"Gaussian\", #\n",
    "temp_start=10.0, #5.0, #30.0, #\n",
    "min_temperature=0.5,\n",
    "temperature_decay=1-5e-5,\n",
    "seed=15, #seed, #\n",
    "use_gumbel=True,\n",
    "legacy_generator=False,\n",
    "gpu_id=gpu_id,\n",
    "plot_show=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model:  model_best_33\n",
      "INFO:tensorflow:Restoring parameters from snapshots_shadow/model_best_33.ckpt\n"
     ]
    }
   ],
   "source": [
    "\"\"\" ===================================================================================== \"\"\"\n",
    "\n",
    "model_name = \"model_best_33\" #19, \"model_best_20\" #\n",
    "print(\"Model: \", model_name)\n",
    "saver = tf.train.Saver()\n",
    "saver.restore(eggen.session, \"snapshots_shadow/\" + model_name + \".ckpt\") # "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate graphs to evaluate performance "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "intermediate:  Tensor(\"Generator_1/Generator.int_1/Tanh:0\", shape=(1000, 50), dtype=float32)\n",
      "h:  Tensor(\"Generator_1/Generator.h_1/Tanh:0\", shape=(1000, 50), dtype=float32)\n",
      "c:  Tensor(\"Generator_1/Generator.c_1/Tanh:0\", shape=(1000, 50), dtype=float32)\n",
      "Generator initial_states:  1\n",
      "Initial cond:  Tensor(\"Generator_1/unstack:0\", shape=(1000, 5), dtype=float32)\n"
     ]
    }
   ],
   "source": [
    "sample_many, explain_conds = eggen.generate_discrete(1000, conds=True, rw_len=rw_len, reuse=True) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "33\n",
      "66\n",
      "99\n",
      "total time:  37.351624727249146\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "t0 = time.time()\n",
    "\n",
    "num_paths = 100 #120\n",
    "samples = []\n",
    "for _ in range(num_paths): \n",
    "    if (_+1) % round(num_paths/3) == 0:\n",
    "        print(_+1)\n",
    "    samples.append(sample_many.eval({eggen.tau: 0.5}))\n",
    "    \n",
    "t1 = time.time()\n",
    "print(\"total time: \", t1-t0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Assemble score matrix from the random walks\n",
    "rws = np.array(samples).reshape([-1, rw_len])\n",
    "scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()\n",
    "\n",
    "A_select = sp.csr_matrix((np.ones(len(train_ones)), (train_ones[:,0], train_ones[:,1])))\n",
    "A_select = train_graph"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "scrolled": false
   },
   "source": [
    "### Original graph from scores ###\n",
    "\n",
    "sampled_graph = utils.graph_from_scores(scores_matrix, A_select.sum())\n",
    "# EO = utils.edge_overlap(A_select.toarray(), sampled_graph)/A_select.sum()\n",
    "# print(\"EO: \", EO)\n",
    "\n",
    "stats = utils.compute_graph_statistics(sampled_graph) #, encoded)\n",
    "print(stats['assortativity'])\n",
    "print(stats['clustering_coefficient'])\n",
    "print(stats['cpl'])\n",
    "print(stats['gini'])\n",
    "print(stats['d_max'])\n",
    "print(stats['power_law_exp'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Better graph from scores ###\n",
    "\n",
    "def graph_from_scores(scores, n_edges):\n",
    "#     scores=scores_matrix; n_edges=A_select.sum()\n",
    "\n",
    "    target_g = np.zeros(scores.shape) # initialize target graph\n",
    "    scores_int = scores.toarray().copy() # internal copy of the scores matrix\n",
    "    scores_int[np.diag_indices_from(scores_int)] = 0  # set diagonal to zero\n",
    "    N = scores.shape[0]\n",
    "#     print(\"N: \", N)\n",
    "\n",
    "    for n in np.random.choice(N, replace=False, size=N): # Iterate the nodes in random order\n",
    "        row = scores_int[n,:].copy()\n",
    "        if row.sum() == 0:\n",
    "            target = np.random.choice(N)\n",
    "    #         continue\n",
    "        else:\n",
    "            probs = row / row.sum()\n",
    "            target = np.random.choice(N, p=probs)\n",
    "    #         target = np.argmax(probs) # argmax probs\n",
    "        target_g[n, target] = 1\n",
    "        target_g[target, n] = 1\n",
    "\n",
    "    # print(target_g.sum()/2)\n",
    "    diff = (n_edges - target_g.sum())/2\n",
    "#     print(\"diff: \", diff)\n",
    "#     print(\"n_edges - N: \", n_edges/2 - N)\n",
    "    if diff > 0:   \n",
    "        triu = np.triu(scores_int) # upper triangle\n",
    "        triu[target_g > 0] = 0 # set previously assigned edge to zero\n",
    "        # print(\"triu nonzeros: \",np.count_nonzero(triu))\n",
    "\n",
    "        num_elements = np.count_nonzero(triu) #len(triu[triu>0]) \n",
    "        avg_threshold = triu.sum()/num_elements\n",
    "        tau = 1.4 #1.2\n",
    "        avg_threshold = avg_threshold*tau #1.485# tune\n",
    "#         print(\"avg_threshold: \", avg_threshold)\n",
    "        triu[triu < avg_threshold] = 0 # \n",
    "#         print(\"triu nonzeros: \",np.count_nonzero(triu))\n",
    "\n",
    "        triu = triu / triu.sum() # every count divided by total sum\n",
    "        triu_ixs = np.triu_indices_from(triu) # indices\n",
    "        extra_edges = np.random.choice(triu_ixs[0].shape[0], replace=False, p=triu[triu_ixs], size=int(diff))\n",
    "    #     extra_edges = triu[triu_ixs].argsort()[-int(diff):][::-1] # choose top k based on prob\n",
    "\n",
    "        target_g[(triu_ixs[0][extra_edges], triu_ixs[1][extra_edges])] = 1\n",
    "        target_g[(triu_ixs[1][extra_edges], triu_ixs[0][extra_edges])] = 1\n",
    "\n",
    "    target_g = utils.symmetric(target_g)\n",
    "    return target_g"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-0.04503670968431043\n",
      "0.02885649001635811\n",
      "2.8580443207790918\n",
      "0.4301643192488265\n",
      "61.0\n",
      "1.4315679125933238\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wesleyjtann/miniconda3/envs/eggen/lib/python3.6/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide\n",
      "  (Theoretical_CDF * (1 - Theoretical_CDF))\n"
     ]
    }
   ],
   "source": [
    "### New graph statistics ###\n",
    "\n",
    "target_g = graph_from_scores(scores_matrix, A_select.sum())\n",
    "newstats = utils.compute_graph_statistics(target_g)\n",
    "\n",
    "print(newstats['assortativity'])\n",
    "print(newstats['clustering_coefficient'])\n",
    "print(newstats['cpl'])\n",
    "print(newstats['gini'])\n",
    "print(newstats['d_max'])\n",
    "print(newstats['power_law_exp'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Graph stats over 5 runs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trial:  0\n",
      "33\n",
      "66\n",
      "99\n",
      "rws:  (100000, 16)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wesleyjtann/miniconda3/envs/eggen/lib/python3.6/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide\n",
      "  (Theoretical_CDF * (1 - Theoretical_CDF))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trial 0: --- 36.0580313205719 seconds ---\n",
      "stats:  {'d_max': 63.0, 'd_min': 1.0, 'd': 16.32183908045977, 'LCC': 348, 'wedge_count': 74171.0, 'claw_count': 794866.0, 'triangle_count': 7782, 'square_count': 10384, 'power_law_exp': 1.4365840441146673, 'gini': 0.4435081754897201, 'rel_edge_distr_entropy': 0.9419612128241251, 'assortativity': -0.010465408544316832, 'clustering_coefficient': 0.029370988317527734, 'n_components': 1, 'cpl': 2.85794494683494}\n",
      "trial:  1\n",
      "33\n",
      "66\n",
      "99\n",
      "rws:  (100000, 16)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wesleyjtann/miniconda3/envs/eggen/lib/python3.6/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide\n",
      "  (Theoretical_CDF * (1 - Theoretical_CDF))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trial 1: --- 34.73612332344055 seconds ---\n",
      "stats:  {'d_max': 65.0, 'd_min': 1.0, 'd': 16.32183908045977, 'LCC': 348, 'wedge_count': 72233.0, 'claw_count': 759870.0, 'triangle_count': 7339, 'square_count': 8250, 'power_law_exp': 1.43177218464978, 'gini': 0.4274415169175976, 'rel_edge_distr_entropy': 0.9454236485937706, 'assortativity': -0.05947509150843339, 'clustering_coefficient': 0.02897469303959888, 'n_components': 1, 'cpl': 2.855079664778562}\n",
      "trial:  2\n",
      "33\n",
      "66\n",
      "99\n",
      "rws:  (100000, 16)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wesleyjtann/miniconda3/envs/eggen/lib/python3.6/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide\n",
      "  (Theoretical_CDF * (1 - Theoretical_CDF))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trial 2: --- 34.96060752868652 seconds ---\n",
      "stats:  {'d_max': 68.0, 'd_min': 1.0, 'd': 16.32183908045977, 'LCC': 348, 'wedge_count': 73397.0, 'claw_count': 792434.0, 'triangle_count': 7322, 'square_count': 8300, 'power_law_exp': 1.434384091554319, 'gini': 0.4342308159300632, 'rel_edge_distr_entropy': 0.943591537281438, 'assortativity': -0.005384547328036809, 'clustering_coefficient': 0.027719658671889393, 'n_components': 1, 'cpl': 2.8345258206631554}\n",
      "trial:  3\n",
      "33\n",
      "66\n",
      "99\n",
      "rws:  (100000, 16)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wesleyjtann/miniconda3/envs/eggen/lib/python3.6/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide\n",
      "  (Theoretical_CDF * (1 - Theoretical_CDF))\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trial 3: --- 35.52893781661987 seconds ---\n",
      "stats:  {'d_max': 66.0, 'd_min': 1.0, 'd': 16.32183908045977, 'LCC': 348, 'wedge_count': 73638.0, 'claw_count': 799570.0, 'triangle_count': 7372, 'square_count': 8615, 'power_law_exp': 1.431806685472663, 'gini': 0.4361684474664078, 'rel_edge_distr_entropy': 0.9439424589897542, 'assortativity': -0.051420052127954204, 'clustering_coefficient': 0.0276598671786085, 'n_components': 1, 'cpl': 2.8073636092616514}\n",
      "trial:  4\n",
      "33\n",
      "66\n",
      "99\n",
      "rws:  (100000, 16)\n",
      "Trial 4: --- 34.57370042800903 seconds ---\n",
      "stats:  {'d_max': 69.0, 'd_min': 1.0, 'd': 16.32183908045977, 'LCC': 348, 'wedge_count': 72645.0, 'claw_count': 761195.0, 'triangle_count': 7257, 'square_count': 7816, 'power_law_exp': 1.4332761475749853, 'gini': 0.4329812206572772, 'rel_edge_distr_entropy': 0.9443914078425304, 'assortativity': -0.04319207226923226, 'clustering_coefficient': 0.028601081194700437, 'n_components': 1, 'cpl': 2.860213985226407}\n",
      "avg_stats:  [-0.0339874343555947, 0.028465257680464984, 2.8430256053529432, 0.4348660352922131, 66.2, 1.4335646306732828, 0.4245774647887323, 1.0, 16.32183908045977, 348.0, 73216.8, 781587.0, 7414.4, 8673.0, 0.9438620531063237, 1.0]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/wesleyjtann/miniconda3/envs/eggen/lib/python3.6/site-packages/powerlaw.py:700: RuntimeWarning: invalid value encountered in true_divide\n",
      "  (Theoretical_CDF * (1 - Theoretical_CDF))\n"
     ]
    }
   ],
   "source": [
    "dmax,dmin,deg,lcc,wc,cc,tc,sc,law,gini,rel,assrt,coe,ncomp,intra,inter,cpl,eo = ([] for i in range(18))\n",
    "num_trials = 5 #1 #\n",
    "num_paths = 100 #\n",
    "\n",
    "def compute_stats(samples):\n",
    "    rws = np.array(samples).reshape([-1, rw_len])\n",
    "    print(\"rws: \", rws.shape)\n",
    "    scores_matrix = utils.score_matrix_from_random_walks(rws, _N).tocsr()\n",
    "    \n",
    "    A_select = sp.csr_matrix((np.ones(len(train_ones)), (train_ones[:,0], train_ones[:,1])))\n",
    "    A_select = train_graph\n",
    "#     sampled_graph = utils.graph_from_scores(scores_matrix, A_select.sum())\n",
    "    sampled_graph = graph_from_scores(scores_matrix, A_select.sum())\n",
    "    EO = utils.edge_overlap(A_select.toarray(), sampled_graph)/A_select.sum()\n",
    "    \n",
    "    stats = utils.compute_graph_statistics(sampled_graph) #, encoded)\n",
    "    return stats, EO\n",
    "\n",
    "for trials in range(num_trials):\n",
    "    print(\"trial: \", trials)\n",
    "    start_time = time.time()\n",
    "  \n",
    "    samples = []\n",
    "    for _ in range(num_paths): \n",
    "        if (_+1) % round(num_paths/3) == 0:\n",
    "            print(_+1)\n",
    "        samples.append(sample_many.eval({eggen.tau: 0.5}))\n",
    "\n",
    "    stats, EO = compute_stats(samples)\n",
    "    print(\"Trial %i: --- %s seconds ---\" % (trials, time.time() - start_time))\n",
    "\n",
    "    dmax.append(stats['d_max'])\n",
    "    dmin.append(stats['d_min'])\n",
    "    deg.append(stats['d'])\n",
    "    lcc.append(stats['LCC'])\n",
    "    wc.append(stats['wedge_count'])\n",
    "    cc.append(stats['claw_count'])\n",
    "    tc.append(stats['triangle_count'])\n",
    "    sc.append(stats['square_count'])\n",
    "    law.append(stats['power_law_exp'])\n",
    "    gini.append(stats['gini'])\n",
    "    rel.append(stats['rel_edge_distr_entropy'])\n",
    "    assrt.append(stats['assortativity'])\n",
    "    coe.append(stats['clustering_coefficient'])\n",
    "    ncomp.append(stats['n_components'])\n",
    "    cpl.append(stats['cpl'])\n",
    "    eo.append(EO)\n",
    "\n",
    "    print(\"stats: \", stats)\n",
    "\n",
    "\n",
    "# all_stats = [dmax, dmin, deg, lcc, wc, cc, tc, sc, law, gini, rel, assrt, coe, ncomp, cpl, eo]\n",
    "# ====== ASST CLUST CPL GINI MD PLE EO ====== \n",
    "all_stats = [assrt, coe, cpl, gini, dmax, law, eo, dmin, deg, lcc, wc, cc, tc, sc, rel, ncomp]\n",
    "\n",
    "# avg_stats = [np.mean(dmax), np.mean(dmin), np.mean(deg), np.mean(lcc), np.mean(wc), np.mean(cc), np.mean(tc), np.mean(sc), np.mean(law), np.mean(gini), np.mean(rel), np.mean(assrt), np.mean(coe), np.mean(ncomp), np.mean(cpl), np.mean(eo)]\n",
    "avg_stats = [np.mean(assrt), np.mean(coe), np.mean(cpl), np.mean(gini), np.mean(dmax), np.mean(law), np.mean(eo), np.mean(dmin), np.mean(deg), np.mean(lcc), np.mean(wc), np.mean(cc), np.mean(tc), np.mean(sc), np.mean(rel), np.mean(ncomp)]\n",
    "print(\"avg_stats: \", avg_stats)\n",
    "\n",
    "\n",
    "from scipy import stats\n",
    "# stderror_stats = [stats.sem(dmax), stats.sem(dmin), stats.sem(deg), stats.sem(lcc), stats.sem(wc), stats.sem(cc), stats.sem(tc), stats.sem(sc), stats.sem(law), stats.sem(gini), stats.sem(rel), stats.sem(assrt), stats.sem(coe), stats.sem(ncomp), stats.sem(cpl), stats.sem(eo)]\n",
    "stderror_stats = [stats.sem(assrt), stats.sem(coe), stats.sem(cpl), stats.sem(gini), stats.sem(dmax), stats.sem(law), stats.sem(eo), stats.sem(dmin), stats.sem(deg), stats.sem(lcc), stats.sem(wc), stats.sem(cc), stats.sem(tc), stats.sem(sc), stats.sem(rel), stats.sem(ncomp)]\n",
    "    \n",
    "save_directory = \"./generate_stats\" #\"./snapshots_gencond\"  #\"./snapshots_gencond2\" \n",
    "# log_num = 8\n",
    "data_name  = \"EUcore-top\" #\"EUcore\" #\n",
    "\n",
    "# save_stats = \"{}/{}_stats{}.txt\".format(save_directory, data_name, log_num)\n",
    "save_stats = \"{}/{}_{}_numpaths{}.txt\".format(save_directory, data_name, model_name, num_paths)\n",
    "\n",
    "np.savetxt(save_stats, np.c_[avg_stats ,stderror_stats , all_stats])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "eggen",
   "language": "python",
   "name": "eggen"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
