{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy as sp\n",
    "import scipy.sparse.linalg, scipy.optimize\n",
    "import scipy.stats\n",
    "from scipy.special import expit, logit, softmax\n",
    "import networkx as nx\n",
    "\n",
    "from math import comb\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "eps = 1e-8 # small real number\n",
    "\n",
    "import itertools\n",
    "import time\n",
    "import subprocess\n",
    "import os\n",
    "from scipy.sparse.csgraph import connected_components"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Utility functions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def two_comm_sbm_explicit(n, exp_deg, alpha):\n",
    "    \"\"\"\n",
    "    Sample a two-community SBM by drawing explicit endpoint pairs.\n",
    "\n",
    "    n is the number of nodes, exp_deg the expected degree and alpha the homophily ratio.\n",
    "    Nodes [0, n//2) form the first community, nodes [n//2, n) the second. Endpoints are\n",
    "    drawn with replacement, so repeated pairs and self-pairs can occur.\n",
    "\n",
    "    Returns (pairs, adj, edge_labels): the stacked endpoint pairs, the symmetrised dense\n",
    "    0/1 adjacency matrix, and one integer label per pair (0: inside the first community,\n",
    "    1: across communities, 2: inside the second community).\n",
    "    \"\"\"\n",
    "    half = n//2\n",
    "    first_comm = np.arange(0, half)\n",
    "    second_comm = np.arange(half, n)\n",
    "    within_count = int((alpha*exp_deg*n)//4)\n",
    "    across_count = int(((1-alpha)*exp_deg*n)//2)\n",
    "\n",
    "    def draw_pairs(left_pool, right_pool, count):\n",
    "        # Left endpoints are drawn before right endpoints.\n",
    "        left = np.random.choice(left_pool, size=count)\n",
    "        right = np.random.choice(right_pool, size=count)\n",
    "        return np.column_stack((left, right))\n",
    "\n",
    "    # Draw order: first community, across communities, second community.\n",
    "    pairs_by_block = [\n",
    "        draw_pairs(first_comm, first_comm, within_count),\n",
    "        draw_pairs(first_comm, second_comm, across_count),\n",
    "        draw_pairs(second_comm, second_comm, within_count),\n",
    "    ]\n",
    "\n",
    "    adj = np.zeros((n,n))\n",
    "    for block_pairs in pairs_by_block:\n",
    "        adj[block_pairs[:,0], block_pairs[:,1]] = 1\n",
    "    adj = np.maximum(adj, adj.T)\n",
    "\n",
    "    edge_labels = np.repeat(np.arange(3), [within_count, across_count, within_count])\n",
    "    return np.vstack(pairs_by_block), adj, edge_labels\n",
    "\n",
    "def power_law_graph(n, plot=False):\n",
    "    \"\"\"\n",
    "    Sample an undirected graph in which nodes i and j are linked with probability\n",
    "    1 / sqrt((1+i) * (1+j)), so low-index nodes get the highest expected degree.\n",
    "\n",
    "    With plot=True the probability matrix, the sampled adjacency and the degree\n",
    "    sequence are drawn. Returns the boolean adjacency (no self-loops) as a csr_array.\n",
    "    \"\"\"\n",
    "    def show_matrix(matrix):\n",
    "        plt.matshow(matrix, vmin=0, vmax=1, cmap='Blues')\n",
    "        plt.colorbar()\n",
    "        plt.show()\n",
    "\n",
    "    node_weight = np.sqrt(1/(1+np.arange(n)))\n",
    "    edge_prob = np.outer(node_weight, node_weight)\n",
    "    if plot:\n",
    "        show_matrix(edge_prob)\n",
    "    # One uniform draw per ordered pair; only the strictly lower triangle is kept\n",
    "    # and then mirrored, which makes the graph symmetric and loop-free.\n",
    "    lower = np.tril(edge_prob > np.random.random((n,n)), k=-1)\n",
    "    adj = lower | lower.T\n",
    "    if plot:\n",
    "        show_matrix(adj)\n",
    "        plt.plot(adj.sum(axis=0))\n",
    "    return sp.sparse.csr_array(adj)\n",
    "\n",
    "def readMTX( network_filename ):\n",
    "    \"\"\"\n",
    "    Read 'datasets/<network_filename>.mtx' (coordinate format) into a sparse CSC array.\n",
    "\n",
    "    Lines starting with '%' are comments; blank lines are skipped. The first remaining\n",
    "    line is the size header, whose first two fields give the shape. Every later line is\n",
    "    'row col' (stored as 1.0) or 'row col weight'. Row/column numbers are 1-based in the\n",
    "    file and shifted to 0-based indices. Raises ValueError if no size header is found.\n",
    "    \"\"\"\n",
    "    r_index = []\n",
    "    c_index = []\n",
    "    data = []\n",
    "    shape = None\n",
    "    with open('datasets/' + network_filename + '.mtx','r') as f:\n",
    "        for line in f:\n",
    "            if line.startswith('%'):\n",
    "                continue\n",
    "            fields = line.split()\n",
    "            if not fields:\n",
    "                continue\n",
    "            if shape is None:\n",
    "                shape = (int(fields[0]), int(fields[1]))\n",
    "                continue\n",
    "            if len(fields) == 2:\n",
    "                weight = 1.\n",
    "            else:\n",
    "                # Integer weights stay integers; real-valued weights such as '0.5'\n",
    "                # used to crash in int() and are now parsed as floats.\n",
    "                try:\n",
    "                    weight = int(fields[2])\n",
    "                except ValueError:\n",
    "                    weight = float(fields[2])\n",
    "            r_index.append( int(fields[0]) - 1 )\n",
    "            c_index.append( int(fields[1]) - 1 )\n",
    "            data.append( weight )\n",
    "    if shape is None:\n",
    "        raise ValueError('No size header found in datasets/' + network_filename + '.mtx')\n",
    "    return scipy.sparse.csc_array((data, (r_index, c_index)), shape=shape)\n",
    "\n",
    "def preparePivoter( G ):\n",
    "    \"\"\"\n",
    "    Write G to './Pivoter/graphs/pivoter_temp.edges' as a plain edge list.\n",
    "\n",
    "    The first line is '<number of nodes> <number of edges>'; each following line is\n",
    "    one edge 'u v' of G.\n",
    "    \"\"\"\n",
    "    n = G.number_of_nodes()\n",
    "    m = G.number_of_edges()\n",
    "    # The context manager closes the file even when writing an edge fails.\n",
    "    with open(\"./Pivoter/graphs/pivoter_temp.edges\", \"w\") as outF:\n",
    "        # Write number of nodes and edges\n",
    "        outF.write(str(n)+' '+ str(m)+'\\n')\n",
    "        # Write edges\n",
    "        for u,v in G.edges():\n",
    "            outF.write(str(u)+' '+str(v)+'\\n')\n",
    "    return\n",
    "\n",
    "def pivoterCounts(k):\n",
    "    \"\"\"\n",
    "    Run the Pivoter binary on 'graphs/pivoter_temp.edges' and parse its clique counts.\n",
    "\n",
    "    k is upper bound for maximal cliques (passed to the binary as '-k').\n",
    "    Returns a dict {clique size: count} read from\n",
    "    'Pivoter/results/pivoter_temp_<k>_A.txt'.\n",
    "    \"\"\"\n",
    "    cmd = \"./bin/degeneracy_cliques -i graphs/pivoter_temp.edges -t A -k {} -d 1\".format(str(k))\n",
    "    # Run inside 'Pivoter' through cwd= rather than os.chdir(): the notebook's own working\n",
    "    # directory is then never left pointing at 'Pivoter' when the call is interrupted.\n",
    "    p = subprocess.Popen(cmd, shell=True, cwd='Pivoter')\n",
    "    p.wait()\n",
    "    clique_counts = dict()\n",
    "    with open('Pivoter/results/pivoter_temp_{}_A.txt'.format(str(k))) as f:\n",
    "        # skip first 4 lines\n",
    "        for _ in range(4):\n",
    "            f.readline()\n",
    "        for line in f:\n",
    "            # a blank line ends the table of counts\n",
    "            if line in ['\\n', '\\r\\n']:\n",
    "                break\n",
    "            cq_size, cnt = line.rstrip('\\n').split(',')\n",
    "            # int(float(...)) also accepts counts written in float notation\n",
    "            clique_counts[int(cq_size)] = int(float(cnt))\n",
    "\n",
    "    return clique_counts\n",
    "\n",
    "def returnKs(G,k):\n",
    "    \"\"\"\n",
    "    Return the number of k-cliques that Pivoter reports for G (0 when size k is absent).\n",
    "    \"\"\"\n",
    "    preparePivoter(G)\n",
    "    counts_by_size = pivoterCounts(k)\n",
    "    # A size missing from the result means no clique of that size was found.\n",
    "    return counts_by_size.get(k, 0)\n",
    "    \n",
    "def _writeMace( G ):\n",
    "    \"\"\"\n",
    "    Write G to 'mace_format4.txt' as one adjacency line per node.\n",
    "\n",
    "    Line i lists, space-separated, the neighbours v of node i with v > i, so each edge\n",
    "    appears exactly once. Nodes are addressed as 0..n-1.\n",
    "    \"\"\"\n",
    "    n = G.number_of_nodes()\n",
    "    # The context manager closes the file even when a neighbour lookup fails.\n",
    "    with open( 'mace_format4.txt', 'w' ) as f2:\n",
    "        for i in range(n):\n",
    "            i_neighbors = ' '.join(str(v) for v in G.neighbors(i) if v > i)\n",
    "            f2.write(i_neighbors+'\\n')\n",
    "    return\n",
    "\n",
    "def _executeMace():\n",
    "    \"\"\"\n",
    "    Run the MACE binary through the shell with 'mace_format4.txt' and 'mace4.cliques'\n",
    "    as its file arguments, and block until it exits.\n",
    "    \"\"\"\n",
    "    mace_process = subprocess.Popen(\"mace22/mace M -l 2 mace_format4.txt mace4.cliques\", shell=True)\n",
    "    mace_process.wait()\n",
    "    return None\n",
    "    \n",
    "def _readCliques():\n",
    "    \"\"\"\n",
    "    Parse 'mace4.cliques' into a list of cliques, each a list of integer node ids\n",
    "    (one clique per non-blank line).\n",
    "    \"\"\"\n",
    "    cliques = []\n",
    "    with open('mace4.cliques','r') as f:\n",
    "        for line in f:\n",
    "            # split() without an argument tolerates repeated or trailing whitespace;\n",
    "            # blank lines are skipped instead of crashing in int('').\n",
    "            members = line.split()\n",
    "            if members:\n",
    "                cliques.append([int(member) for member in members])\n",
    "    return cliques\n",
    "    \n",
    "def listCliques( G ):\n",
    "    \"\"\"\n",
    "    List the cliques of G by round-tripping through the external MACE tool.\n",
    "    \"\"\"\n",
    "    # Order matters: write the input file, run MACE, then parse its output file.\n",
    "    _writeMace(G)\n",
    "    _executeMace()\n",
    "    return _readCliques()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sparse_row_prod(coo, complement=False, revert=True):\n",
    "    \"\"\"\n",
    "    Given a sparse COO array, return the row products over non-zero (stored) entries\n",
    "    If complement, then return the product of 1 - each entry.\n",
    "\n",
    "    The product is evaluated as exp(sum(log(.))), which temporarily replaces coo.data by\n",
    "    the logarithms. If revert, coo.data is set back to the exact original values\n",
    "    afterwards; otherwise coo.data is left holding the log-transformed values.\n",
    "    \"\"\"\n",
    "    original_data = coo.data\n",
    "    if complement:\n",
    "        # log(0) = -inf for entries equal to 1 is intended: the row product is then 0.\n",
    "        with np.errstate(divide='ignore'):\n",
    "            coo.data = np.log(1. - original_data)\n",
    "    else:\n",
    "        coo.data = np.log(original_data)\n",
    "    prod = np.exp(coo.sum(axis=1))\n",
    "    if revert:\n",
    "        # Restore the saved array instead of inverting the log: the inverse is only\n",
    "        # approximate and, for the complement, turns tiny entries into exactly 0.\n",
    "        coo.data = original_data\n",
    "    return prod\n",
    "\n",
    "# Helper Functions\n",
    "def getStatistics( C, G ):\n",
    "    \"\"\"\n",
    "    Import C's output into a fresh Graph and return that graph's statistics, evaluated\n",
    "    with the values returned by G.getAdjTriangles().\n",
    "\n",
    "    NOTE(review): Graph and network_name are looked up in the global scope; neither is\n",
    "    defined in the visible part of this notebook - confirm they exist before calling.\n",
    "    \"\"\"\n",
    "    generated = Graph()\n",
    "    generated.importNetwork( *C.getOutput(), network_name )\n",
    "    return generated.getStatistics( *G.getAdjTriangles() )\n",
    "\n",
    "def to_title(network_name):\n",
    "    \"\"\"\n",
    "    Return the display title of a dataset key, or None for keys without a title.\n",
    "    \"\"\"\n",
    "    known_titles = ((\"cora\", \"Cora\"), ('polblogs', \"PolBlogs\"))\n",
    "    for key, title in known_titles:\n",
    "        if network_name == key:\n",
    "            return title\n",
    "    return None\n",
    "    \n",
    "def degree_hist(dataset,it,method,g_sample,G):\n",
    "    \"\"\"\n",
    "    Plot the degree histograms of the generated graph g_sample and of the original\n",
    "    graph G on shared log-log axes, save the figure and show it.\n",
    "\n",
    "    Degree 0 is left out of both curves. The figure is written to\n",
    "    'baseline_pickles_cc/degree_plots/<dataset>_it_<it>_<method>.png'.\n",
    "    \"\"\"\n",
    "    # The generated graph is drawn first, then the original one.\n",
    "    series = ((g_sample, 'go-', 'Generated Graph'), (G, 'ro-', 'Original Graph'))\n",
    "    for graph, line_style, legend_label in series:\n",
    "        degree_freq = nx.degree_histogram(graph)\n",
    "        degrees = range(len(degree_freq))\n",
    "        plt.loglog(degrees[1:], degree_freq[1:], line_style, label=legend_label)\n",
    "    plt.xlabel('Degree')\n",
    "    plt.ylabel('Frequency')\n",
    "    plt.legend()\n",
    "    out_path = 'baseline_pickles_cc/degree_plots/'+dataset+'_it_'+str(it)+'_'+method+'.png'\n",
    "    plt.savefig(out_path,format='png')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Average Path Length functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def characteristicPathLength( G, pairs_count=1000 ):\n",
    "    \"\"\"\n",
    "    Estimate the average shortest-path length of G from randomly sampled node pairs.\n",
    "\n",
    "    pairs_count is the number of node pairs to sample (default 1000, the value that\n",
    "    used to be hard-coded). It is capped at the number of ordered pairs of distinct\n",
    "    nodes, since no more distinct pairs exist. Nodes are addressed as\n",
    "    0..number_of_nodes-1.\n",
    "    \"\"\"\n",
    "    number_of_nodes = G.number_of_nodes()\n",
    "    pairs_count = min(pairs_count, number_of_nodes * (number_of_nodes - 1))\n",
    "    pairs = _gencoordinates(0, number_of_nodes-1, pairs_count )\n",
    "    return _average_paths(G, pairs, None )\n",
    "\n",
    "def _gencoordinates(m, n, total_pairs):\n",
    "    \"\"\"\n",
    "    Draw total_pairs distinct ordered pairs (x, y) with x != y, both uniform on the\n",
    "    integers m..n (inclusive), and return them as a list of tuples.\n",
    "\n",
    "    Raises ValueError when total_pairs exceeds the number of such pairs,\n",
    "    (n-m+1)*(n-m), instead of looping forever.\n",
    "    \"\"\"\n",
    "    from random import randint\n",
    "    span = n - m + 1\n",
    "    max_pairs = span * (span - 1) if span > 1 else 0\n",
    "    if total_pairs > max_pairs:\n",
    "        raise ValueError('Requested %d distinct pairs but only %d exist' % (total_pairs, max_pairs))\n",
    "    seen = set()\n",
    "    pairs = []\n",
    "    while len(pairs) < total_pairs:\n",
    "        candidate = (randint(m, n), randint(m, n))\n",
    "        # Every candidate, including the very first, must be a new non-self pair.\n",
    "        if candidate[0] == candidate[1] or candidate in seen:\n",
    "            continue\n",
    "        seen.add(candidate)\n",
    "        pairs.append(candidate)\n",
    "    return pairs\n",
    "\n",
    "def _average_paths( G, pairs, weight=None ):\n",
    "    \"\"\"\n",
    "    Return the mean shortest-path length over the node pairs that are connected in G.\n",
    "\n",
    "    Pairs without a path, or with a node that is not in G, are left out of both the sum\n",
    "    and the count. Returns nan when no pair is connected (including empty pairs).\n",
    "    \"\"\"\n",
    "    total_length = 0\n",
    "    found = 0\n",
    "    for u, v in pairs:\n",
    "        try:\n",
    "            total_length += nx.shortest_path_length(G, source=u, target=v, weight=weight)\n",
    "        except (nx.NetworkXNoPath, nx.NodeNotFound):\n",
    "            # Only the expected failures are skipped; anything else (including a\n",
    "            # keyboard interrupt) now propagates instead of being swallowed.\n",
    "            continue\n",
    "        found += 1\n",
    "    if found == 0:\n",
    "        return float('nan')\n",
    "    return total_length / found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def rel_frob_error(observed, actual):\n",
    "    \"\"\"\n",
    "    Norm of (observed - actual) relative to the norm of actual.\n",
    "    \"\"\"\n",
    "    return np.linalg.norm(observed - actual) / np.linalg.norm(actual)\n",
    "\n",
    "def _calc_adj_match_deg(deg_targets, adj_init=None, thresh=1e-12, allow_loops=False, return_vec=False, logit_init=None, print_iters=False, max_iter=None):\n",
    "    \"\"\"\n",
    "    Return the expected adjacency matrix with the odds product model\n",
    "\n",
    "    Pair (i, j) gets probability expit(x_i + x_j). The node logits x are found with\n",
    "    Newton's method such that the row sums of 1 - (1 - P) * (1 - adj_init) match\n",
    "    deg_targets.\n",
    "\n",
    "    deg_targets : array of n target degrees\n",
    "    adj_init    : optional (n, n) matrix combined with the model as above (default zeros)\n",
    "    thresh      : iterate while the relative degree error exceeds this value\n",
    "    allow_loops : if False, the diagonal of the model matrix is set to 0\n",
    "    return_vec  : if True, return (adj_deg, logit_vec) instead of adj_deg alone\n",
    "    logit_init  : optional starting logits (default zeros)\n",
    "    print_iters : print the error after every iteration\n",
    "    max_iter    : optional cap on the number of Newton steps; None (the default)\n",
    "                  keeps iterating until thresh is reached\n",
    "\n",
    "    The returned adj_deg is the model matrix alone, without adj_init combined in.\n",
    "    \"\"\"\n",
    "    n = deg_targets.size\n",
    "    logit_vec = np.zeros(n) if logit_init is None else logit_init\n",
    "    if adj_init is None:\n",
    "        adj_init = np.zeros((n,n))\n",
    "\n",
    "    def forward(logits):\n",
    "        # Model matrix, its combination with adj_init, the resulting degrees and their error.\n",
    "        model = expit(logits[:,None]+logits[None,:])\n",
    "        if not allow_loops:\n",
    "            np.fill_diagonal(model, 0.)\n",
    "        combined = 1. - (1.-model) * (1.-adj_init)\n",
    "        degrees = combined.sum(axis=1)\n",
    "        return model, combined, degrees, rel_frob_error(degrees, deg_targets)\n",
    "\n",
    "    adj_deg, adj_deg_with_init, deg_recon, error = forward(logit_vec)\n",
    "    if print_iters:\n",
    "        print(\"Iter 0, Error %s\" % np.format_float_scientific(error, precision=3))\n",
    "    i = 0\n",
    "    while error > thresh and (max_iter is None or i < max_iter):\n",
    "        # Jacobian of the degrees w.r.t. the logits: off-diagonal entries are\n",
    "        # P * (1 - combined); the diagonal additionally receives the row sums.\n",
    "        adj_deg_jacob = adj_deg * (1.-adj_deg_with_init)\n",
    "        row_sums = adj_deg_jacob.sum(axis=1)\n",
    "        adj_deg_jacob[np.diag_indices_from(adj_deg_jacob)] += row_sums\n",
    "        logit_vec_increment = np.linalg.solve(adj_deg_jacob, deg_recon - deg_targets)\n",
    "        logit_vec = logit_vec - logit_vec_increment\n",
    "        adj_deg, adj_deg_with_init, deg_recon, error = forward(logit_vec)\n",
    "        i += 1\n",
    "        if print_iters:\n",
    "            print(\"Iter %i, Error %s\" % (i, np.format_float_scientific(error, precision=3)))\n",
    "    if return_vec:\n",
    "        return adj_deg, logit_vec\n",
    "    else:\n",
    "        return adj_deg\n",
    "\n",
    "# def _get_max_clique_to_nodes(adj):\n",
    "#     n = adj.shape[0]\n",
    "#     cliques = list(nx.find_cliques(nx.from_scipy_sparse_array(adj)))\n",
    "#     cliques = [clique for clique in cliques if len(clique)>1] # remove singletons\n",
    "#     num_cliques = len(cliques)\n",
    "#     clique_sizes = np.array([len(clique) for clique in cliques])\n",
    "#     max_clique_to_node = sp.sparse.coo_array( (np.ones(clique_sizes.sum()), \n",
    "#                                             (np.repeat(np.arange(num_cliques), clique_sizes),\n",
    "#                                              np.array(list(itertools.chain(*cliques))))),\n",
    "#                                              shape = (num_cliques, n))\n",
    "#     return max_clique_to_node\n",
    "\n",
    "def get_max_clique_to_nodes(adj):\n",
    "    \"\"\"\n",
    "    Build the sparse (maximal clique x node) membership matrix of the graph with\n",
    "    adjacency adj.\n",
    "\n",
    "    Maximal cliques are listed with networkx and cliques of a single node are dropped.\n",
    "    Returns a COO array of shape (num_cliques, n) holding 1.0 at (c, v) when node v\n",
    "    belongs to clique c.\n",
    "    \"\"\"\n",
    "    n = adj.shape[0]\n",
    "    graph = nx.from_scipy_sparse_array(adj)\n",
    "    # Alternative clique lister: cliques = listCliques( graph ) # use MACE\n",
    "    cliques = [clique for clique in nx.find_cliques(graph) if len(clique)>1] # remove singletons\n",
    "    num_cliques = len(cliques)\n",
    "    # Explicit integer dtypes: an empty list would otherwise become a float array,\n",
    "    # which is not usable as a size or as an index.\n",
    "    clique_sizes = np.array([len(clique) for clique in cliques], dtype=int)\n",
    "    clique_ids = np.repeat(np.arange(num_cliques), clique_sizes)\n",
    "    node_ids = np.array(list(itertools.chain.from_iterable(cliques)), dtype=int)\n",
    "    max_clique_to_node = sp.sparse.coo_array( (np.ones(node_ids.size), (clique_ids, node_ids)),\n",
    "                                             shape = (num_cliques, n))\n",
    "    return max_clique_to_node"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample_ccop(adj, combo_coeff, return_sparse=False):\n",
    "    \"\"\"\n",
    "    Endless generator of graphs sampled from a convex combination of adj and its\n",
    "    degree-matching odds product model.\n",
    "\n",
    "    The expected adjacency is combo_coeff * adj + (1 - combo_coeff) * adj_deg. Every\n",
    "    sample draws one Bernoulli variable per node pair i < j and mirrors the result.\n",
    "    Yields dense (n, n) arrays, or csr_array objects when return_sparse is set.\n",
    "    \"\"\"\n",
    "    n = adj.shape[0]\n",
    "    adj_deg = _calc_adj_match_deg(adj.sum(axis=1), adj_init=None, print_iters=False)\n",
    "    expected_adj = combo_coeff * adj.todense() + (1.-combo_coeff) * adj_deg\n",
    "    upper = np.triu_indices(n,1)\n",
    "\n",
    "    while True:\n",
    "        upper_draws = np.random.binomial(1, expected_adj[upper])\n",
    "        adj_sample = np.zeros((n,n))\n",
    "        adj_sample[upper] = upper_draws\n",
    "        adj_sample = adj_sample + adj_sample.T\n",
    "        yield sp.sparse.csr_array(adj_sample) if return_sparse else adj_sample\n",
    "\n",
    "def sample_epop(adj, edge_plant_prob, return_sparse=False):\n",
    "    \"\"\"\n",
    "    Endless generator of graphs in which every edge of adj is planted with probability\n",
    "    edge_plant_prob and a degree-matching odds product model fills in the rest.\n",
    "\n",
    "    The expected adjacency is 1 - (1 - adj_deg) * (1 - edge_plant_prob * adj). Every\n",
    "    sample draws one Bernoulli variable per node pair i < j and mirrors the result.\n",
    "    Yields dense (n, n) arrays, or csr_array objects when return_sparse is set.\n",
    "    \"\"\"\n",
    "    n = adj.shape[0]\n",
    "    planted_prob = (edge_plant_prob*adj).todense()\n",
    "    adj_deg = _calc_adj_match_deg(adj.sum(axis=1), adj_init=planted_prob, print_iters=False)\n",
    "    expected_adj = 1. - (1.-adj_deg)*(1.-planted_prob)\n",
    "    upper = np.triu_indices(n,1)\n",
    "\n",
    "    while True:\n",
    "        upper_draws = np.random.binomial(1, expected_adj[upper])\n",
    "        adj_sample = np.zeros((n,n))\n",
    "        adj_sample[upper] = upper_draws\n",
    "        adj_sample = adj_sample + adj_sample.T\n",
    "        yield sp.sparse.csr_array(adj_sample) if return_sparse else adj_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def _calc_adj_edgeset_plant(n, edgeset_plant_prob, edge_to_edgeset, edge_idcs, sample=False):\n",
    "    \"\"\"\n",
    "    Return the expected adjacency matrix when planting a set of edgesets independently with some fixed probability\n",
    "\n",
    "    An edge is present when at least one edgeset containing it is planted, i.e. with\n",
    "    probability 1 - prod(1 - p) over its edgesets. With sample=True every edgeset is\n",
    "    first drawn as a 0/1 variable, so the result is a sampled 0/1 adjacency instead.\n",
    "\n",
    "    edge_to_edgeset is a sparse COO (edge x edgeset) membership array whose .data is\n",
    "    overwritten here; edge_idcs holds the two node indices of every edge row.\n",
    "    \"\"\"\n",
    "    edgeset_count = edge_to_edgeset.shape[1]\n",
    "    per_edgeset = edgeset_plant_prob * np.ones(edgeset_count)\n",
    "    if sample:\n",
    "        per_edgeset = np.random.binomial(1, per_edgeset)\n",
    "\n",
    "    # Give every stored (edge, edgeset) entry the value of its edgeset.\n",
    "    edge_to_edgeset.data = per_edgeset[edge_to_edgeset.col]\n",
    "    absent_prob = sparse_row_prod(edge_to_edgeset, complement=True, revert=False)\n",
    "\n",
    "    one_sided = np.zeros((n,n))\n",
    "    one_sided[edge_idcs[:,0],edge_idcs[:,1]] = 1. - absent_prob\n",
    "    return one_sided + one_sided.T\n",
    "\n",
    "def sample_max_clique_plant_match_deg(adj, max_clique_plant_prob, max_clique_to_node, return_components=False, return_sparse=False):\n",
    "    \"\"\"\n",
    "    Endless generator of graphs that plant whole maximal cliques and add degree-matching edges.\n",
    "\n",
    "    Every clique (a row of max_clique_to_node, shape num_cliques x n) is planted\n",
    "    independently with probability max_clique_plant_prob. An odds product model is fitted\n",
    "    once so that, combined with the expected planted edges, the degrees of adj are matched.\n",
    "    Each sample is the element-wise maximum of one model draw and one planted draw.\n",
    "\n",
    "    Yields adj_sample, or (adj_sample, adj_sample_deg, adj_sample_plant) when\n",
    "    return_components is set; adj_sample is a csr_array when return_sparse is set.\n",
    "    \"\"\"\n",
    "    n = adj.shape[0]\n",
    "    # All node pairs i < j, in the same row-major order as itertools.combinations.\n",
    "    upper = np.triu_indices(n,1)\n",
    "    edge_idcs = np.column_stack(upper)\n",
    "    num_possible_edge = edge_idcs.shape[0]\n",
    "    # Build the (pair x node) incidence in one shot; assigning element-wise into an\n",
    "    # empty CSR array is much slower.\n",
    "    edge_to_node = sp.sparse.csr_array((np.ones(2*num_possible_edge, dtype=int),\n",
    "                                        (np.repeat(np.arange(num_possible_edge), 2), edge_idcs.ravel())),\n",
    "                                       shape=(num_possible_edge,n))\n",
    "\n",
    "    # A pair lies inside a clique exactly when both of its endpoints do.\n",
    "    edge_to_max_clique = sp.sparse.coo_array((edge_to_node@max_clique_to_node.T) == 2)\n",
    "\n",
    "    adj_plant = _calc_adj_edgeset_plant(n, max_clique_plant_prob, edge_to_max_clique, edge_idcs)\n",
    "    deg_targets = adj.sum(axis=1)\n",
    "    adj_deg = _calc_adj_match_deg(deg_targets, adj_init=adj_plant, print_iters=False)\n",
    "    deg_probs = adj_deg[upper]\n",
    "\n",
    "    while True:\n",
    "        adj_sample_deg = np.zeros((n,n))\n",
    "        adj_sample_deg[upper] = np.random.binomial(1, deg_probs)\n",
    "        adj_sample_deg = adj_sample_deg + adj_sample_deg.T\n",
    "\n",
    "        adj_sample_plant = _calc_adj_edgeset_plant(n, max_clique_plant_prob, edge_to_max_clique, edge_idcs, sample=True)\n",
    "\n",
    "        adj_sample = np.maximum(adj_sample_deg, adj_sample_plant)\n",
    "        if return_sparse:\n",
    "            adj_sample = sp.sparse.csr_array(adj_sample)\n",
    "        if return_components:\n",
    "            yield adj_sample, adj_sample_deg, adj_sample_plant\n",
    "        else:\n",
    "            yield adj_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _calc_adj_clique_partic(clique_partic_prob,\n",
    "                            clique_to_node, edge_to_clique, edge_idcs,\n",
    "                            sample=False,\n",
    "                            adjust_prob_by_size='root'):\n",
    "    \"\"\"\n",
    "    Return the expected adjacency matrix when having nodes participate in cliques with some probabilities\n",
    "\n",
    "    adjust_prob_by_size selects the per-node participation probability:\n",
    "      'cliquesize' : clique_partic_prob ** (1 / clique size)\n",
    "      'squareroot' : sqrt(clique_partic_prob)\n",
    "      None         : clique_partic_prob itself\n",
    "    Any other value raises ValueError.\n",
    "    NOTE(review): the default 'root' is not one of the accepted values, so the argument\n",
    "    has to be passed explicitly - confirm which mode the default was meant to select.\n",
    "\n",
    "    With sample=True the participations are drawn as 0/1 variables first. An edge is\n",
    "    present when both endpoints participate in at least one common clique.\n",
    "    edge_to_clique is a sparse COO (edge x clique) array whose .data is overwritten here.\n",
    "    \"\"\"\n",
    "    # Explicit check instead of `assert`, which is removed under `python -O` and would\n",
    "    # then silently treat an unknown mode like None.\n",
    "    if adjust_prob_by_size not in ('cliquesize', 'squareroot', None):\n",
    "        raise ValueError(\"adjust_prob_by_size must be 'cliquesize', 'squareroot' or None, got %r\" % (adjust_prob_by_size,))\n",
    "\n",
    "    num_cliques, n = clique_to_node.shape\n",
    "    clique_sizes = clique_to_node.sum(axis=1)\n",
    "    \n",
    "    clique_to_node = sp.sparse.coo_array(clique_to_node)\n",
    "    if adjust_prob_by_size == 'cliquesize':\n",
    "        clique_to_node.data = clique_partic_prob ** (1./clique_sizes[clique_to_node.row])\n",
    "    elif adjust_prob_by_size == 'squareroot':\n",
    "        clique_to_node.data = np.sqrt(clique_partic_prob) * np.ones(len(clique_to_node.data))\n",
    "    else:\n",
    "        clique_to_node.data = clique_partic_prob * np.ones(len(clique_to_node.data))\n",
    "    \n",
    "    if sample:\n",
    "        clique_to_node.data = np.random.binomial(1, clique_to_node.data)\n",
    "    \n",
    "    clique_to_node = sp.sparse.csr_array(clique_to_node)\n",
    "    # Value of an (edge, clique) entry: product of the two endpoint participations.\n",
    "    edge_to_clique.data = clique_to_node[edge_to_clique.col,edge_idcs[edge_to_clique.row,0]] * clique_to_node[edge_to_clique.col,edge_idcs[edge_to_clique.row,1]]\n",
    "    edge_probs = 1. - sparse_row_prod(edge_to_clique, complement=True, revert=False)\n",
    "\n",
    "    adj_partic = np.zeros((n,n))\n",
    "    adj_partic[edge_idcs[:,0],edge_idcs[:,1]] = edge_probs\n",
    "    adj_partic += adj_partic.T\n",
    "    return adj_partic\n",
    "\n",
    "def sample_max_clique_partic_match_deg(adj, clique_partic_prob, max_clique_to_node, adjust_prob_by_size='root', return_components=False, return_sparse=False):\n",
    "    \"\"\"\n",
    "    Endless generator of graphs in which nodes participate in their maximal cliques and\n",
    "    degree-matching edges are added.\n",
    "\n",
    "    clique_partic_prob and adjust_prob_by_size are passed on to _calc_adj_clique_partic.\n",
    "    An odds product model is fitted once so that, combined with the expected participation\n",
    "    edges, the degrees of adj are matched. Each sample is the element-wise maximum of one\n",
    "    model draw and one participation draw.\n",
    "\n",
    "    Yields adj_sample, or (adj_sample, adj_sample_deg, adj_sample_partic) when\n",
    "    return_components is set; adj_sample is a csr_array when return_sparse is set.\n",
    "    \"\"\"\n",
    "    n = adj.shape[0]\n",
    "    # All node pairs i < j, in the same row-major order as itertools.combinations.\n",
    "    upper = np.triu_indices(n,1)\n",
    "    edge_idcs = np.column_stack(upper)\n",
    "    num_possible_edge = edge_idcs.shape[0]\n",
    "    # Build the (pair x node) incidence in one shot; assigning element-wise into an\n",
    "    # empty CSR array is much slower.\n",
    "    edge_to_node = sp.sparse.csr_array((np.ones(2*num_possible_edge, dtype=int),\n",
    "                                        (np.repeat(np.arange(num_possible_edge), 2), edge_idcs.ravel())),\n",
    "                                       shape=(num_possible_edge,n))\n",
    "\n",
    "    # A pair lies inside a clique exactly when both of its endpoints do.\n",
    "    edge_to_max_clique = sp.sparse.coo_array((edge_to_node@max_clique_to_node.T) == 2)\n",
    "\n",
    "    adj_partic = _calc_adj_clique_partic(clique_partic_prob, max_clique_to_node, edge_to_max_clique, edge_idcs,\n",
    "                                        sample=False, adjust_prob_by_size=adjust_prob_by_size)\n",
    "    deg_targets = adj.sum(axis=1)\n",
    "    adj_deg = _calc_adj_match_deg(deg_targets, adj_init=adj_partic, print_iters=False)\n",
    "    deg_probs = adj_deg[upper]\n",
    "\n",
    "    while True:\n",
    "        adj_sample_deg = np.zeros((n,n))\n",
    "        adj_sample_deg[upper] = np.random.binomial(1, deg_probs)\n",
    "        adj_sample_deg = adj_sample_deg + adj_sample_deg.T\n",
    "\n",
    "        adj_sample_partic = _calc_adj_clique_partic(clique_partic_prob, max_clique_to_node, edge_to_max_clique, edge_idcs,\n",
    "                                        sample=True, adjust_prob_by_size=adjust_prob_by_size)\n",
    "\n",
    "        adj_sample = np.maximum(adj_sample_deg, adj_sample_partic)\n",
    "        if return_sparse:\n",
    "            adj_sample = sp.sparse.csr_array(adj_sample)\n",
    "        if return_components:\n",
    "            yield adj_sample, adj_sample_deg, adj_sample_partic\n",
    "        else:\n",
    "            yield adj_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sample_max_clique_indep_match_deg(adj, max_clique_plant_prob, max_clique_to_node, return_components=False, return_sparse=False):\n",
    "    \"\"\"\n",
    "    Endless generator of graphs that use the clique-planting edge probabilities but draw\n",
    "    every edge independently, plus degree-matching edges.\n",
    "\n",
    "    The expected planted adjacency is computed as for clique planting with probability\n",
    "    max_clique_plant_prob, but each pair is then sampled on its own from that matrix.\n",
    "    An odds product model is fitted once so that, combined with the planted\n",
    "    probabilities, the degrees of adj are matched. Each sample is the element-wise\n",
    "    maximum of one model draw and one independent planted draw.\n",
    "\n",
    "    Yields adj_sample, or (adj_sample, adj_sample_deg, adj_sample_maxclique_indep) when\n",
    "    return_components is set; adj_sample is a csr_array when return_sparse is set.\n",
    "    \"\"\"\n",
    "    n = adj.shape[0]\n",
    "    # All node pairs i < j, in the same row-major order as itertools.combinations.\n",
    "    upper = np.triu_indices(n,1)\n",
    "    edge_idcs = np.column_stack(upper)\n",
    "    num_possible_edge = edge_idcs.shape[0]\n",
    "    # Build the (pair x node) incidence in one shot; assigning element-wise into an\n",
    "    # empty CSR array is much slower.\n",
    "    edge_to_node = sp.sparse.csr_array((np.ones(2*num_possible_edge, dtype=int),\n",
    "                                        (np.repeat(np.arange(num_possible_edge), 2), edge_idcs.ravel())),\n",
    "                                       shape=(num_possible_edge,n))\n",
    "\n",
    "    # A pair lies inside a clique exactly when both of its endpoints do.\n",
    "    edge_to_max_clique = sp.sparse.coo_array((edge_to_node@max_clique_to_node.T) == 2)\n",
    "\n",
    "    adj_plant = _calc_adj_edgeset_plant(n, max_clique_plant_prob, edge_to_max_clique, edge_idcs)\n",
    "    deg_targets = adj.sum(axis=1)\n",
    "    adj_deg = _calc_adj_match_deg(deg_targets, adj_init=adj_plant, print_iters=False)\n",
    "    deg_probs = adj_deg[upper]\n",
    "    plant_probs = adj_plant[upper]\n",
    "\n",
    "    while True:\n",
    "        adj_sample_deg = np.zeros((n,n))\n",
    "        adj_sample_deg[upper] = np.random.binomial(1, deg_probs)\n",
    "        adj_sample_deg = adj_sample_deg + adj_sample_deg.T\n",
    "\n",
    "        adj_sample_maxclique_indep = np.zeros((n,n))\n",
    "        adj_sample_maxclique_indep[upper] = np.random.binomial(1, plant_probs)\n",
    "        adj_sample_maxclique_indep = adj_sample_maxclique_indep + adj_sample_maxclique_indep.T\n",
    "\n",
    "        adj_sample = np.maximum(adj_sample_deg, adj_sample_maxclique_indep)\n",
    "        if return_sparse:\n",
    "            adj_sample = sp.sparse.csr_array(adj_sample)\n",
    "        if return_components:\n",
    "            yield adj_sample, adj_sample_deg, adj_sample_maxclique_indep\n",
    "        else:\n",
    "            yield adj_sample"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Checking Triangles vs Overlap for the three models on Cora:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# When True, the analysis keeps only the largest connected component of the dataset graph.\n",
    "largest_cc = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/scratch/1361251.1.tsourakakisgroup/ipykernel_2551174/1155078140.py:42: DeprecationWarning: \n",
      "\n",
      "The scipy.sparse array containers will be used instead of matrices\n",
      "in Networkx 3.0. Use `from_scipy_sparse_array` instead.\n",
      "  g_true = nx.from_scipy_sparse_matrix(adj)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Started MCPLOP.\n",
      "Finished param val 0 of 10.\n"
     ]
    }
   ],
   "source": [
    "dataset = 'cora' # Should be one of [cora,citeseer,web-edu,polblogs,facebook,ppi]\n",
    "\n",
    "# Read original dataset\n",
    "if dataset == \"citeseer\":\n",
    "    adj = sp.sparse.csr_array(sp.io.loadmat('datasets/citeseer.mat')['network'])\n",
    "elif dataset == \"facebook\":\n",
    "    adj = sp.sparse.csr_array(sp.io.loadmat('datasets/facebook.mat')['A'])\n",
    "elif dataset == \"cora\":\n",
    "    adj = sp.sparse.csr_array(sp.io.loadmat('datasets/cora.mat')['network'])\n",
    "elif dataset == \"polblogs\":\n",
    "    adj = sp.sparse.csr_array(sp.io.loadmat('datasets/polblogs.mat')['Problem']['A'].item())\n",
    "elif dataset == \"web-edu\":\n",
    "    adj = readMTX('web-edu')\n",
    "elif dataset == \"ppi\":\n",
    "    adj = sp.sparse.csr_array(sp.io.loadmat('datasets/Homo_sapiens.mat')['network'])\n",
    "else:\n",
    "    # Fail here with a clear message instead of a NameError on `adj` below.\n",
    "    raise ValueError(f\"Unknown dataset {dataset!r}; expected one of cora, citeseer, web-edu, polblogs, facebook, ppi\")\n",
    "\n",
    "# Clean-up: binarise the weights, drop self-loops, symmetrise, and store as integers.\n",
    "adj[adj > 0] = 1\n",
    "adj.setdiag(0)\n",
    "adj = adj.maximum(adj.T)\n",
    "adj.eliminate_zeros()\n",
    "adj = adj.astype(int)\n",
    "\n",
    "# The flag is set again inside this cell, so the restriction below is always applied here.\n",
    "largest_cc = True\n",
    "if largest_cc:\n",
    "    # select the largest connected component\n",
    "    # `components` holds one component label per node\n",
    "    _, components = connected_components(adj)\n",
    "    c_ids, c_counts = np.unique(components, return_counts=True)\n",
    "    # label of the component with the most nodes (the first one on ties)\n",
    "    id_max_component = c_ids[c_counts.argmax()]\n",
    "    select = components == id_max_component\n",
    "    # keep only the rows and columns of the selected nodes\n",
    "    adj = adj[select][:, select]\n",
    "\n",
    "    \n",
    "def adj_to_tri(adj):\n",
    "    \"\"\"trace(A^3) / 6: the number of triangles of a symmetric 0/1 adjacency without loops.\"\"\"\n",
    "    return (adj@adj@adj).trace() / 6\n",
    "\n",
    "def adj_to_p4(adj):\n",
    "    \"\"\"trace(A^4): the number of closed walks of length 4.\"\"\"\n",
    "    return (adj@adj@adj@adj).trace()\n",
    "\n",
    "def adj_to_wedge(adj):\n",
    "    \"\"\"Sum over nodes of C(degree, 2), with degrees taken as column sums.\"\"\"\n",
    "    column_degrees = adj.sum(axis=0)\n",
    "    return sum(comb(int(degree), 2) for degree in column_degrees)\n",
    "\n",
    "def adj_to_triVector(adj):\n",
    "    \"\"\"diag(A^3) / 2: the number of triangles through every node.\"\"\"\n",
    "    return (adj@adj@adj).diagonal()/2.0\n",
    "\n",
    "def self_outer(F):\n",
    "    \"\"\"F @ F.T, i.e. all inner products between the rows of F.\"\"\"\n",
    "    return F @ F.T\n",
    "\n",
    "def avg_in_triu(M):\n",
    "    \"\"\"Mean of the entries strictly above the diagonal of M.\"\"\"\n",
    "    return M[np.triu_indices_from(M, k=1)].mean()\n",
    "\n",
    "def adjs_to_overlap(As):\n",
    "    \"\"\"Mean inner product between different rows of As, divided by the mean row sum.\"\"\"\n",
    "    return avg_in_triu(self_outer(As)) / As.sum(axis=1).mean()\n",
    "\n",
    "def find_overlap(obs, act):\n",
    "    \"\"\"sum(obs*act) / sum(obs + act - obs*act): intersection over union for 0/1 inputs.\"\"\"\n",
    "    return (obs*act).sum() / (obs+act-obs*act).sum()\n",
    "datasett = 'cora'\n",
    "# from_scipy_sparse_array replaces the deprecated from_scipy_sparse_matrix and matches\n",
    "# the calls used elsewhere in this notebook.\n",
    "g_true = nx.from_scipy_sparse_array(adj)\n",
    "num_samples_per = 10\n",
    "tris, overlaps = {}, {}\n",
    "sampled_graphs = {}\n",
    "# Additional stats\n",
    "deg_pearson = {}\n",
    "ccs, assorts, avgpls, maxds, c4s, k4s, tr_pearson, lcc, sq_comps, input_overlaps = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}\n",
    "max_clique_to_node = get_max_clique_to_nodes( adj )\n",
    "# Each entry: (name, factory turning a parameter value into a sample generator, parameter values)\n",
    "methods = [('MCPLOP', lambda param_val : sample_max_clique_plant_match_deg(adj, param_val, max_clique_to_node, return_sparse=True), \n",
    "            np.linspace(0,1,10) ** 1.),\n",
    "           ('MCPROP', lambda param_val : sample_max_clique_partic_match_deg(adj, param_val, max_clique_to_node, adjust_prob_by_size='squareroot',return_sparse=True), \n",
    "            np.linspace(0,1,10) ** 1.),\n",
    "#            ('EPOP', lambda param_val : sample_epop(adj, param_val, return_sparse=True), \n",
    "#             np.linspace(0,1,10)[1:-1] ** 1.),\n",
    "           ('CCOP', lambda param_val : sample_max_clique_indep_match_deg(adj, param_val, max_clique_to_node, return_sparse=True), \n",
    "            np.linspace(0,1,10) ** 1.),\n",
    "          ]\n",
    "\n",
    "origTriVector = adj_to_triVector(adj)\n",
    "\n",
    "# For every method and parameter value, draw num_samples_per graphs and record statistics.\n",
    "for (methodname, sampler, param_vals) in methods:\n",
    "    print(f\"Started {methodname}.\")\n",
    "    # np.empty leaves these arrays uninitialised until a statistic is written into them.\n",
    "    tris[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    overlaps[methodname] = np.empty((len(param_vals)))\n",
    "    # additional stats\n",
    "    ccs[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    assorts[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    avgpls[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    maxds[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    c4s[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    k4s[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    tr_pearson[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    lcc[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    sq_comps[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    deg_pearson[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    #n_components[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    input_overlaps[methodname] = np.empty((len(param_vals),num_samples_per))\n",
    "    sampled_graphs[methodname] = {}\n",
    "    for param_val_idx, param_val in enumerate(param_vals):\n",
    "        # One generator per parameter value; each next() call yields a fresh sample.\n",
    "        adj_sampler = sampler(param_val)\n",
    "        sampled_adjs = []\n",
    "        for sample_num in range(num_samples_per):\n",
    "            sampled_adjs.append(next(adj_sampler))\n",
    "            s_adj = sampled_adjs[-1]\n",
    "            #deg_pearson[methodname][param_val_idx,sample_num] = sp.stats.pearsonr(sp.sparse.csr_array(adj).sum(axis=0),s_adj.sum(axis=0))[0]\n",
    "            # NOTE(review): this unconditional `continue` skips every statistic below, so the\n",
    "            # per-sample arrays allocated above keep their uninitialised np.empty contents;\n",
    "            # only sampled_graphs and overlaps are filled in.\n",
    "            continue\n",
    "            tris[methodname][param_val_idx,sample_num] = adj_to_tri(sampled_adjs[-1])\n",
    "            # additional stats\n",
    "            g_sample = nx.from_scipy_sparse_array(sampled_adjs[-1])\n",
    "            size_comp = [len(c) for c in nx.connected_components(g_sample)]\n",
    "            lcc[methodname][param_val_idx,sample_num] = max(size_comp)\n",
    "            sq_comps[methodname][param_val_idx,sample_num] = np.sqrt(sum([c**2 for c in size_comp]))\n",
    "            # Extra stats\n",
    "            \n",
    "            input_overlaps[methodname][param_val_idx,sample_num] = find_overlap(s_adj,adj)\n",
    "            #if sample_num == 0:\n",
    "            #    degree_hist(datasett,param_val_idx,methodname,g_sample,g_true)\n",
    "            #n_components[methodname][param_val_idx,sample_num] = nx.number_connected_components(g_sample)\n",
    "            #continue\n",
    "            ccs[methodname][param_val_idx,sample_num] = nx.transitivity(g_sample)\n",
    "            assorts[methodname][param_val_idx,sample_num] = nx.degree_assortativity_coefficient(g_sample)\n",
    "            avgpls[methodname][param_val_idx,sample_num] = characteristicPathLength( g_sample )\n",
    "            maxds[methodname][param_val_idx,sample_num] = max(sampled_adjs[-1].sum(axis=1)).item()\n",
    "            # 4-cycle count from closed 4-walks, wedges and the adjacency sum\n",
    "            c4s[methodname][param_val_idx,sample_num] = (1.0/8.0)*(adj_to_p4(s_adj)-4*adj_to_wedge(s_adj) -s_adj.sum())\n",
    "            k4s[methodname][param_val_idx,sample_num] = returnKs(g_sample,4)\n",
    "            tr_pearson[methodname][param_val_idx,sample_num] = sp.stats.pearsonr(origTriVector,adj_to_triVector(s_adj))[0]\n",
    "            \n",
    "        sampled_graphs[methodname][param_val_idx] = sampled_adjs\n",
    "        # Flatten every sample to one row so that the overlap between samples can be computed.\n",
    "        sampled_adjacencies = sp.sparse.vstack([sampled_adj.reshape((1,-1)) for sampled_adj in sampled_adjs])\n",
    "        overlaps[methodname][param_val_idx] = adjs_to_overlap(sampled_adjacencies)\n",
    "        print(f'Finished param val {param_val_idx} of {len(param_vals)}.')\n",
    "    print(f\"Finished {methodname}.\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import pickle as pk\n",
    "#with open('baseline_graphs/'+dataset+'.pk', 'wb') as handle:\n",
    "#    pk.dump(sampled_graphs, handle, protocol=pk.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle as pk\n",
    "# Save to pickle file\n",
    "# overlaps\n",
    "# tris\n",
    "# ccs\n",
    "# assorts\n",
    "# avgpls\n",
    "# maxds\n",
    "# c4s\n",
    "# k4s\n",
    "# tr_pearson\n",
    "datasett = 'ppi'\n",
    "\n",
    "with open('baseline_pickles_cc3/'+datasett+'_deg_pearson.pk', 'wb') as handle:\n",
    "    pk.dump(deg_pearson, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "\n",
    "# with open('baseline_pickles_cc3/'+datasett+'_overlaps.pk', 'wb') as handle:\n",
    "#     pk.dump(overlaps, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_tris.pk', 'wb') as handle:\n",
    "#     pk.dump(tris, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_ccs.pk', 'wb') as handle:\n",
    "#     pk.dump(ccs, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_assorts.pk', 'wb') as handle:\n",
    "#     pk.dump(assorts, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_avgpls.pk', 'wb') as handle:\n",
    "#     pk.dump(avgpls, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_maxds.pk', 'wb') as handle:\n",
    "#     pk.dump(maxds, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_c4s.pk', 'wb') as handle:\n",
    "#     pk.dump(c4s, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_k4s.pk', 'wb') as handle:\n",
    "#     pk.dump(k4s, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_trPearson.pk', 'wb') as handle:\n",
    "#     pk.dump(tr_pearson, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_lcc.pk', 'wb') as handle:\n",
    "#     pk.dump(lcc, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_comps.pk', 'wb') as handle:\n",
    "#     pk.dump(sq_comps, handle, protocol=pk.HIGHEST_PROTOCOL)\n",
    "    \n",
    "# with open('baseline_pickles_cc3/'+datasett+'_input_overlaps.pk', 'wb') as handle:\n",
    "#     pk.dump(input_overlaps, handle, protocol=pk.HIGHEST_PROTOCOL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for (methodname, sampler, param_vals) in methods:\n",
    "    plt.plot(param_vals, overlaps[methodname], label=methodname, marker='.')\n",
    "plt.xlim(0,1)\n",
    "plt.ylim(0,1)\n",
    "plt.xlabel('Parameter')\n",
    "plt.ylabel('Overlap Estimate')\n",
    "plt.title('Cora')\n",
    "plt.legend()\n",
    "plt.grid()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for (methodname, sampler, param_vals) in methods:\n",
    "    plt.plot(overlaps[methodname], tris[methodname].mean(axis=1), label=methodname, marker='.')\n",
    "plt.plot([0,1], np.ones(2)*adj_to_tri(adj), label='True', linestyle='--')\n",
    "plt.xlim(0,1)\n",
    "plt.ylim(0,)\n",
    "plt.xlabel('Overlap')\n",
    "plt.ylabel('Triangles')\n",
    "plt.title('Cora')\n",
    "plt.legend()\n",
    "plt.grid()\n",
    "#plt.savefig('cora_' + 'allmSR_triangles.eps', format='eps')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
