{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import networkx as nx\n",
    "import copy as cp\n",
    "import scipy.linalg as slin\n",
    "import pickle as pkl\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Definition of useful procedures and constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 42\n",
    "\n",
    "# generates a distribution of graphs where the two clusters are fixed and random graphs with edges sampled w.p. p on which we additionally plant additional structures, and the crossing edges are sampled w.p. q\n",
    "# n is the number of vertices in the generated graph, each cluster contains n/2 vertices\n",
    "# p is the probability of drawing an edge inside a cluster\n",
    "# q is the probability of drawing an edge across clusters\n",
    "# adversary_type, params determine the structure to plant in the clusters\n",
    "# prg is the PRG to be used\n",
    "# returns a graph consisting of the deterministic clusters only, together with the planted bisection, the minimum in-cluster degree, and the 2nd and 3rd e-vals of the expected graph\n",
    "def dcm_exp(n, p, q, adversary_type, params, prg):\n",
    "    assert n % 2 == 0 # number of vertices should be even, to split evenly in two clusters\n",
    "\n",
    "    G_exp = nx.Graph() # create an empty graph to store expectation of this distribution\n",
    "    G_det = nx.Graph() # create an empty graph to store the deterministic clusters\n",
    "\n",
    "    # add n vertices to both graphs\n",
    "    G_exp.add_nodes_from(range(n))\n",
    "    G_det.add_nodes_from(range(n))\n",
    "\n",
    "    # add edges\n",
    "    for u in range(n):\n",
    "        for v in range(n):\n",
    "            if ((u < n//2 and v < n//2) or (u >= n//2 and v >= n//2)) and (u < v): # intra-cluster edges\n",
    "                connect = prg.binomial(1, p) # sampled w.p. p and add to both the expected graph and the graph of clusters\n",
    "                if connect > 0:\n",
    "                    G_exp.add_edge(u, v)\n",
    "                    G_det.add_edge(u, v)\n",
    "            elif u < v: # inter-cluster edges, add edges weighted with q to the expected graph\n",
    "                G_exp.add_edge(u, v, weight = q)\n",
    "\n",
    "    # plant an additional structure in the clusters\n",
    "    G_exp, _ = perturb(G = G_exp, G_exp = None, adversary_type = adversary_type, params = params)\n",
    "    G_det, _ = perturb(G = G_det, G_exp = None, adversary_type = adversary_type, params = params)\n",
    "\n",
    "    # form the {-1,+1} vector representing the planted bisection\n",
    "    z = np.zeros(n)\n",
    "    for u in range(n):\n",
    "        z[u] = -1 if u < n//2 else 1\n",
    "\n",
    "    # compute the minimum in-cluster degree\n",
    "    d_in = min_in_cluster_degree(G_det, {u for u in range(n) if z[u] < 0}, {u for u in range(n) if z[u] >= 0}, weighted = False)\n",
    "\n",
    "    # get the spectrum of the expected graph\n",
    "    classifiers = get_classifiers(G_exp, {matrix: True if matrix == 'UL' else False for matrix in MATRICES})\n",
    "\n",
    "    return G_det, z, d_in, classifiers['UL'][1][1], classifiers['UL'][1][2]\n",
    "\n",
    "# given a DCM distribution, determined by the cluster edges and the probability of sampling an inter-cluster edge, draw a sample\n",
    "# G_det is the graph of cluster edges, where we assume to have two clusters of size n/2\n",
    "# q is the probability of drawing an edge across clusters\n",
    "# prg is the PRG to be used\n",
    "def dcm(G_det, q, prg):\n",
    "    n = G_det.number_of_nodes()\n",
    "    G = cp.deepcopy(G_det)\n",
    "\n",
    "    assert n% 2 == 0 # number of vertices should be even, to split evenly in two clusters\n",
    "\n",
    "    for u in range(n//2):\n",
    "        for v in range(n//2, n):\n",
    "            if prg.binomial(1,q): # for inter-cluster edge, add it w.p, q\n",
    "                G.add_edge(u,v)\n",
    "\n",
    "    return G\n",
    "\n",
    "# generate a graph from the stochastic block model distribution SBM(n,p,q), with two equally sized clusters\n",
    "# n is the number of vertices in the generated graph, each cluster contains n/2 vertices\n",
    "# p is the probability of drawing an edge inside a cluster\n",
    "# q is the probability of drawing an edge across clusters\n",
    "# prg is the PRG to be used\n",
    "def sbm(n, p, q, prg):\n",
    "    assert n % 2 == 0 # number of vertices should be even, to split evenly in two clusters\n",
    "\n",
    "    G = nx.Graph() # create an empty graph\n",
    "\n",
    "    G.add_nodes_from(range(n)) # add vertices to the graph\n",
    "\n",
    "    # draw edges at random\n",
    "    for u in range(n):\n",
    "        for v in range(n):\n",
    "            if ((u < n//2 and v < n//2) or (u >= n//2 and v >= n//2)) and (u < v): # intra-cluster edges\n",
    "                connect = prg.binomial(1, p)\n",
    "                if connect > 0:\n",
    "                    G.add_edge(u, v)\n",
    "            elif u < v: # inter-cluster edges\n",
    "                connect = prg.binomial(1, q)\n",
    "                if connect > 0:\n",
    "                    G.add_edge(u, v)\n",
    "\n",
    "    return G\n",
    "\n",
    "# generate the expected graph sampled from the stochastic block model distribution SBM(n,p,q), with two equally sized clusters\n",
    "# also returns numpy array assigning labels in {-1,+1} to vertices, representing the two clusters\n",
    "# n is the number of vertices in the generated graph, each cluster contains n/2 vertices\n",
    "# p is the probability of drawing an edge inside a cluster\n",
    "# q is the probability of drawing an edge across clusters\n",
    "def sbm_exp(n, p, q):\n",
    "    assert n % 2 == 0 # number of vertices should be even, to split evenly in two clusters\n",
    "\n",
    "    G = nx.Graph() # create an empty graph\n",
    "\n",
    "    G.add_nodes_from(range(n)) # add vertices to the graph\n",
    "\n",
    "    # add expectation of each edge\n",
    "    for u in range(n):\n",
    "        for v in range(n):\n",
    "            if ((u < n//2 and v < n//2) or (u >= n//2 and v >= n//2)) and (u < v): # intra-cluster edges\n",
    "                G.add_edge(u, v, weight = p)\n",
    "            elif u < v: # inter-cluster edges\n",
    "                G.add_edge(u, v, weight = q)\n",
    "\n",
    "    # assign cluster labels\n",
    "    z = np.zeros(n)\n",
    "    for u in range(n):\n",
    "        z[u] = -1 if u < n//2 else 1\n",
    "\n",
    "    return G, z\n",
    "\n",
    "\n",
    "# retuns a new graph obtained by (monotonically) adversarially perturbing the input, and the corresponding expectation\n",
    "# G is the graph to perturb\n",
    "# G_exp is the starting expetation, before perturbing\n",
    "# adversary_type = 0, 1, 2, ... is the type of perturbation to apply\n",
    "# params can contain parameters for a specific adversary type\n",
    "def perturb(G, G_exp, adversary_type, params):\n",
    "    if G != None:\n",
    "        G = cp.deepcopy(G)\n",
    "        n = G.number_of_nodes()\n",
    "\n",
    "    if G_exp != None:\n",
    "        G_exp = cp.deepcopy(G_exp)\n",
    "        n = G_exp.number_of_nodes()\n",
    "\n",
    "    if adversary_type == 0: # plant two disjoint cliques on the first n/2 vertices of the graph\n",
    "\n",
    "        assert n % 4 == 0 # the graph size should be divisible by 4 to have two equally sized cliques\n",
    "        size = n//4 # the size of the cliques to be planted\n",
    "\n",
    "        perturbation = [] # contains the edges to be added\n",
    "\n",
    "        # repeat twice, each iteration generates a clique\n",
    "        for i in range(2):\n",
    "            lb = i * size # vertex where the cliques starts\n",
    "            ub = (i+1) * size # vertex where the cliques ends\n",
    "            perturbation += [(u,v) for u in range(lb, ub) for v in range(lb, ub) if u < v] # add all pairs\n",
    "\n",
    "        if G != None:\n",
    "            G.add_edges_from(perturbation)\n",
    "        if G_exp != None:\n",
    "            for u,v in perturbation:\n",
    "                G_exp[u][v]['weight'] = min(1,G_exp[u][v]['weight']+1)\n",
    "\n",
    "    elif adversary_type == 1: # plant two disjoint random graphs on the first n/2 vertices of the graph, the edge sampling rate is contained in param\n",
    "\n",
    "        assert n % 4 == 0 # the graph size should be divisible by 4 to have two equally sized cliques\n",
    "        size = n//4 # the size of the cliques to be planted\n",
    "\n",
    "        p, prg = params # sampling probability and PRG to use\n",
    "        assert p >= 0 and p <= 1\n",
    "\n",
    "        perturbation = [] # contains the edges to be added\n",
    "        perturbation_support = [] # contains the edges that could be added\n",
    "\n",
    "        # repeat twice, each iteration generates a random graph\n",
    "        for i in range(2):\n",
    "            lb = i * size # vertex where the cliques starts\n",
    "            ub = (i+1) * size # vertex where the cliques ends\n",
    "            perturbation += [(u,v) for u in range(lb, ub) for v in range(lb, ub) if u < v and prg.binomial(1, p) > 0] # add the pair with probability p\n",
    "            perturbation_support += [(u,v) for u in range(lb, ub) for v in range(lb, ub) if u < v] # add all the pairs to the support\n",
    "\n",
    "        if G != None:\n",
    "            G.add_edges_from(perturbation)\n",
    "        if G_exp != None:\n",
    "            for u,v in perturbation_support:\n",
    "                G_exp[u][v]['weight'] = min(1,p+G_exp[u][v]['weight']-p*G_exp[u][v]['weight'])\n",
    "\n",
    "    elif adversary_type == 2: # plant a clique on the first n/2 vertices of the graph, the size of the clique is contained in param\n",
    "\n",
    "        assert n % 2 == 0 # the graph size should be divisible by 2 to have two equally sized clusters\n",
    "\n",
    "        size = params # clique size\n",
    "        assert size >= 2 and size <= n//2\n",
    "\n",
    "        perturbation = [(u,v) for u in range(0, size) for v in range(0, size) if u < v] # contains the edges to be added\n",
    "\n",
    "        if G != None:\n",
    "            G.add_edges_from(perturbation)\n",
    "        if G_exp != None:\n",
    "            for u,v in perturbation:\n",
    "                G_exp[u][v]['weight'] = min(1,G_exp[u][v]['weight']+1)\n",
    "\n",
    "    elif adversary_type == 3: # plant a clique on the first n/2 vertices of the graph and one on the second n/2 vertices, the sizes of the cliques are contained in param\n",
    "\n",
    "        assert n % 2 == 0 # the graph size should be divisible by 2 to have two equally sized clusters\n",
    "\n",
    "        size = params # clique sizes\n",
    "        assert size[0] >= 2 and size[0] <= n//2 and size[1] >= 2 and size[1] <= n//2\n",
    "\n",
    "        perturbation = [] # contains the edges to be added\n",
    "\n",
    "        # repeat twice, each iteration generates a clique\n",
    "        for i in range(2):\n",
    "            lb = i * (n//2) # vertex where the cliques starts\n",
    "            ub = lb + size[i] # vertex where the cliques ends\n",
    "            perturbation += [(u,v) for u in range(lb, ub) for v in range(lb, ub) if u < v] # add all pairs\n",
    "\n",
    "        if G != None:\n",
    "            G.add_edges_from(perturbation)\n",
    "        if G_exp != None:\n",
    "            for u,v in perturbation:\n",
    "                G_exp[u][v]['weight'] = min(1,G_exp[u][v]['weight']+1)\n",
    "\n",
    "    elif adversary_type == 4: # plant two disjoint cliques on the first n/2 vertices of the graph, and two disjoint cliques on the second n/2 vertices of the graph\n",
    "\n",
    "        assert n % 4 == 0 # the graph size should be divisible by 4 to have two equally sized cliques per cluster\n",
    "        size = n//4 # the size of the cliques to be planted\n",
    "\n",
    "        perturbation = [] # contains the edges to be added\n",
    "\n",
    "        # repeat for each group of n/2 vertices\n",
    "        for j in range(2):\n",
    "            # repeat twice, each iteration generates a clique\n",
    "            for i in range(2):\n",
    "                lb = j*(n//2) + i * size # vertex where the cliques starts\n",
    "                ub = j*(n//2) + (i+1) * size # vertex where the cliques ends\n",
    "                perturbation += [(u,v) for u in range(lb, ub) for v in range(lb, ub) if u < v] # add all pairs\n",
    "\n",
    "        if G != None:\n",
    "            G.add_edges_from(perturbation)\n",
    "        if G_exp != None:\n",
    "            for u,v in perturbation:\n",
    "                G_exp[u][v]['weight'] = min(1,G_exp[u][v]['weight']+1)\n",
    "\n",
    "    elif adversary_type == 5: # plant many stars on the first n/2 vertices of the graph, and many stars on the second n/2 vertices of the graph, the number of stars per side is contained in params\n",
    "\n",
    "        assert n % 2 == 0 # the graph size should be divisible by 2 to have two equally sized clusters\n",
    "\n",
    "        num_stars = params # number of stars\n",
    "\n",
    "        # check that the number of stars fits into the clusters\n",
    "        assert (n//2-num_stars[0])//num_stars[0] >= 1\n",
    "        assert (n//2-num_stars[1])//num_stars[1] >= 1\n",
    "\n",
    "        perturbation = [] # contains the edges to be added\n",
    "\n",
    "        # repeat for each group of n/2 vertices\n",
    "        for j in range(2):\n",
    "            degree = (n//2-num_stars[j])//num_stars[j] # compute the degree of stars in this cluster\n",
    "            for u in range(j*n//2, j*n//2 + num_stars[j]): # for each star center\n",
    "                for v in range(j*n//2 + num_stars[j] + (u-j*n//2)*degree, j*n//2 + num_stars[j] + (u+1-j*n//2)*degree): # for each star ray\n",
    "                    if u < v:\n",
    "                        perturbation.append((u,v)) # add the edge\n",
    "\n",
    "        if G != None:\n",
    "            G.add_edges_from(perturbation)\n",
    "        if G_exp != None:\n",
    "            for u,v in perturbation:\n",
    "                G_exp[u][v]['weight'] = min(1,G_exp[u][v]['weight']+1)\n",
    "\n",
    "    else:\n",
    "        assert 0 == 1\n",
    "\n",
    "    return G, G_exp\n",
    "\n",
    "# matrices used to compute classifiersr\n",
    "MATRICES = [\n",
    "    'UL',\n",
    "    'NL',\n",
    "    'RL',\n",
    "    'A'\n",
    "]\n",
    "\n",
    "# colors used to plot each matrix\n",
    "MATRICES_COLORS = {\n",
    "    'UL': 'blue',\n",
    "    'NL': 'green',\n",
    "    'RL': 'purple',\n",
    "    'A': 'orange'\n",
    "}\n",
    "\n",
    "# labels used to plot each matrix\n",
    "MATRICES_LABELS = {\n",
    "    'UL': '$\\mathbf{L}$',\n",
    "    'NL': '$\\mathcal{L}_{\\mathrm{sym}}$',\n",
    "    'RL': '$\\mathcal{L}_{\\mathrm{rw}}$',\n",
    "    'A': '$\\mathbf{A}$'\n",
    "}\n",
    "\n",
    "# longer labels used to plot each matrix\n",
    "MATRICES_LABELS_LONG = {\n",
    "    'UL': 'Unnormalized Laplacian $\\mathbf{L}$',\n",
    "    'NL': 'Sym. normalized Laplacian $\\mathcal{L}_{\\mathrm{sym}}$',\n",
    "    'RL': 'R.W. normalized Laplacian $\\mathcal{L}_{\\mathrm{rw}}$',\n",
    "    'A': 'Adjacency $\\mathbf{A}$'\n",
    "}\n",
    "\n",
    "# returns a dictionary containing vectors over vertices of the input graph\n",
    "# G is the graph for which to compute the classifiers\n",
    "# matrices_to_use is used if one only wants to compute the spectrum of some of these four matrices\n",
    "# the output dictionary is of the form\n",
    "# 'UL' -> (u_1, u_2, u_3, evals)\n",
    "# 'NL' -> (u_1, u_2, u_3, evals)\n",
    "# 'RL' -> (u_1, u_2, u_3, evals)\n",
    "# 'A' -> (u_1, u_2, u_3, evals)\n",
    "# where evals contains sorted eigenvalues, and u_1, u_2, u_3 are the bottom 3 eigenvectors (in decreasing order for the adjacency)\n",
    "def get_classifiers(G, matrices_to_use):\n",
    "    vectors = {}\n",
    "\n",
    "    if matrices_to_use['UL']:\n",
    "        # classifier from unnormalized Laplacian\n",
    "\n",
    "        L = nx.laplacian_matrix(G).toarray() # get the Laplacian matrix\n",
    "        eigenvalues, eigenvectors = slin.eigh(L) # compute the spectrum\n",
    "        sorted_indices = eigenvalues.argsort() # get the ascending order of eigenvalues\n",
    "        eigenvalues = eigenvalues[sorted_indices] # sort the eigenvalues accordingly\n",
    "        eigenvectors = eigenvectors[:, sorted_indices] # sort the eigenvectors accordingly\n",
    "\n",
    "        vectors['UL'] = ([eigenvectors[:, 0], eigenvectors[:, 1], eigenvectors[:, 2]], eigenvalues)\n",
    "\n",
    "    if matrices_to_use['NL']:\n",
    "        # classifier from normalized Laplacian\n",
    "\n",
    "        NL = nx.normalized_laplacian_matrix(G).toarray() # get the normalized Laplacian matrix\n",
    "        eigenvalues, eigenvectors = slin.eigh(NL) # compute the spectrum\n",
    "        sorted_indices = eigenvalues.argsort() # get the ascending order of eigenvalues\n",
    "        eigenvalues = eigenvalues[sorted_indices] # sort the eigenvalues accordingly\n",
    "        eigenvectors = eigenvectors[:, sorted_indices] # sort the eigenvectors accordingly\n",
    "\n",
    "        vectors['NL'] = ([eigenvectors[:, 0], eigenvectors[:, 1], eigenvectors[:, 2]], eigenvalues)\n",
    "\n",
    "    if matrices_to_use['RL']:\n",
    "        # classifier from random walk Laplacian\n",
    "\n",
    "        A = nx.adjacency_matrix(G).toarray() # get the adjacency matrix\n",
    "        L = nx.laplacian_matrix(G).toarray() # get the Laplacian matrix\n",
    "        D = L + A # get the degree matrix\n",
    "        RL = np.identity(D.shape[0])-np.linalg.inv(D).dot(A) # get the random walk Laplacian matrix\n",
    "        eigenvalues, eigenvectors = slin.eigh(RL) # compute the spectrum\n",
    "        sorted_indices = eigenvalues.argsort() # get the ascending order of eigenvalues\n",
    "        eigenvalues = eigenvalues[sorted_indices] # sort the eigenvalues accordingly\n",
    "        eigenvectors = eigenvectors[:, sorted_indices] # sort the eigenvectors accordingly\n",
    "\n",
    "        vectors['RL'] = ([eigenvectors[:, 0], eigenvectors[:, 1], eigenvectors[:, 2]], eigenvalues)\n",
    "\n",
    "    if matrices_to_use['A']:\n",
    "        # classifier from adjacency\n",
    "\n",
    "        A = nx.adjacency_matrix(G).toarray() # get the adjacency matrix\n",
    "        eigenvalues, eigenvectors = slin.eigh(A) # compute the spectrum\n",
    "        sorted_indices = (-eigenvalues).argsort() # get the descending order of eigenvalues\n",
    "        eigenvalues = eigenvalues[sorted_indices] # sort the eigenvalues accordingly\n",
    "        eigenvectors = eigenvectors[:, sorted_indices] # sort the eigenvectors accordingly\n",
    "\n",
    "        vectors['A'] = ([eigenvectors[:, 0], eigenvectors[:, 1], eigenvectors[:, 2]], eigenvalues)\n",
    "\n",
    "    return vectors\n",
    "\n",
    "# returns the minimum degree of a vertex inside its own cluster\n",
    "# G is the input graph\n",
    "# P_1, P_2 are sets that partition the vertex set\n",
    "# weighted is a boolean that tells if degrees should be taken as weighted or not\n",
    "def min_in_cluster_degree(G, P_1, P_2, weighted):\n",
    "    min_in_degree = G.number_of_nodes()\n",
    "\n",
    "    for u in G.nodes():\n",
    "        min_in_degree = min(nx.cut_size(G, {u}, P_1 if u in P_1 else P_2, weight = 'weight' if weighted else None), min_in_degree)\n",
    "\n",
    "    return min_in_degree\n",
    "\n",
    "# returns the agreement of the bisection given by the signs of the input embedding compared to the gound truth labels\n",
    "# embedding is a numpy array\n",
    "# labels is a numpy array with entries in {-1.+1}\n",
    "def agreement(embedding, labels):\n",
    "    n = embedding.shape[0]\n",
    "\n",
    "    x = (embedding < 0)*2-1 # compute the {-1,+1} sign vector\n",
    "    y = labels\n",
    "\n",
    "    return max((x == y).sum()/n, ((-x) == y).sum()/n) # compute the agreement of x and -x\n",
    "\n",
    "# returns the agreement of the bisection given by the best sweep cut of the input embedding compared to the gound truth labels\n",
    "# embedding is a numpy array\n",
    "# labels is a numpy array with entries in {-1.+1}\n",
    "def best_sweep_cut(embedding, labels):\n",
    "    n = embedding.shape[0]\n",
    "\n",
    "    y = labels\n",
    "\n",
    "    n_1 = (y == -1).sum() # get the number of coordinates on one side\n",
    "\n",
    "    sorted_coordinates_ascending = embedding.argsort() # sort coordinates in increasing order of embedding\n",
    "    sorted_coordinates_descending = (-embedding).argsort() # sort coordinates in decreasing order of embedding\n",
    "\n",
    "    x_asc = np.zeros(n) # assigns -1 to the first n_1 coordinates in the embedding\n",
    "    x_desc = np.zeros(n) # assigns -1 to the last n_1 coordinates in the embedding\n",
    "    for u in range(n):\n",
    "        x_asc[sorted_coordinates_ascending[u]] = -1 if u < n_1 else 1\n",
    "        x_desc[sorted_coordinates_descending[u]] = -1 if u < n_1 else 1\n",
    "\n",
    "    return max(max((x_asc == y).sum()/n, (x_desc == y).sum()/n), max((-x_asc == y).sum()/n, (-x_desc == y).sum()/n)) # compute the agreement of x_asc, x_desc, -x_asc, -x_desc\n",
    "\n",
    "# returns the variance of the embedding w.r.t. the labels\n",
    "# embedding is a numpy array\n",
    "# labels is a numpy array with entries in {-1,+1}\n",
    "def variance(embedding, labels):\n",
    "    n = embedding.shape[0]\n",
    "    y = labels/np.linalg.norm(labels) # normalize the label vector\n",
    "    return min(np.linalg.norm(embedding-y)**2/n, np.linalg.norm(embedding+y)**2/n) # return the minimum of the two possible signing of the labels\n",
    "\n",
    "# plots the embedding given by the input vector\n",
    "# v is a numpy array, assumed to be unit norm\n",
    "# ground_truth is a numpy array that maps each coordinate of v to {-1,+1}\n",
    "# margin indicates how wide the embedding should be\n",
    "# legend_position tells where to position the legend in the plot\n",
    "# heading is the title of the plot, optional\n",
    "def plot_embedding(v, ground_truth, margin, legend_position, heading = None):\n",
    "    n = v.shape[0] # get the number of entries\n",
    "    assert n % 2 == 0 # the number of entries should be even, to split them evenly\n",
    "\n",
    "    fig, ax = plt.subplots(figsize = (4,3))\n",
    "\n",
    "    custom_ylim = (-margin / np.sqrt(n), margin / np.sqrt(n)) # plot the embedding in the range given by margin, normalized according to the number of entries\n",
    "\n",
    "    if heading != None:\n",
    "        ax.set_title(heading) # add title\n",
    "\n",
    "    plt.setp(ax, ylim=custom_ylim) # set y-axis margin\n",
    "\n",
    "    # label axes\n",
    "    plt.ylabel('$\\mathbf{u}_2$')\n",
    "    plt.xlabel('Vertices')\n",
    "\n",
    "    x1 = [u for u in range(n) if ground_truth[u] == -1] # coordinates in the first community (-1)\n",
    "    x2 = [u for u in range(n) if ground_truth[u] == 1] # coordinates in the second community (+1)\n",
    "    ax.scatter(x1, v[x1], marker = 'o', color = 'green', label = r'$P_1$') # color the first community (-1)\n",
    "    ax.scatter(x2, v[x2], marker = 'o', color = 'orange', label = r'$P_2$') # color the second community (+1)\n",
    "\n",
    "    # ideal embedding\n",
    "    ax.plot(list(range(n)), [1/np.sqrt(n) for _ in range(n)], color = 'black', linestyle = 'dashed')\n",
    "    ax.plot(list(range(n)), [-1/np.sqrt(n) for _ in range(n)], color = 'black', linestyle = 'dashed')\n",
    "    ax.plot(list(range(n)), [0 for _ in range(n)], color = 'black', linestyle = 'dashed')\n",
    "\n",
    "    plt.legend(loc=legend_position)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Nonhomogeneous SBM -- varying p and q, pbar=1/2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prg = np.random.default_rng(SEED) # initialize the PRG\n",
    "\n",
    "# number of vertices in the graphs to generate\n",
    "n = 2000 #@param\n",
    "\n",
    "# set the p_bar parameter\n",
    "p_bar = 1/2 #@param\n",
    "\n",
    "# number of different values for p\n",
    "num_p_vals = 15 #@param\n",
    "# number of different values for q\n",
    "num_q_vals = 15 #@param\n",
    "\n",
    "# range of values for p\n",
    "min_p, max_p = 1/n, 0.45 #@param\n",
    "# range of values for q\n",
    "min_q, max_q = 1/n, 0.45 #@param\n",
    "\n",
    "width_p = (max_p - min_p)/num_p_vals # compute the spacing for values of p\n",
    "width_q = (max_q - min_q)/num_q_vals # compute the spacing for values of q\n",
    "\n",
    "p_vals = [min_p + i*width_p for i in range(num_p_vals)] + [max_p] # compute the list of values for p\n",
    "q_vals = [min_q + i*width_q for i in range(num_q_vals)] + [max_q] # compute the list of values for q\n",
    "\n",
    "pq_pairs = [(j, k, p, q) for j, p in enumerate(p_vals) for k, q in enumerate(q_vals) if p > q] # make a list of all relevant (p,q) pairs\n",
    "\n",
    "agreement_vals = [[0 for q in q_vals] for p in p_vals] # initilize a matrix of agreement values for each (p,q) pairs\n",
    "\n",
    "_, clusters_planted = sbm_exp(n, 0, 0) # get the planted bisection\n",
    "\n",
    "# number of independent repetitions for each p,q pair\n",
    "NUM_TRIALS = 3 #@param\n",
    "for i, (p_index, q_index, p, q) in enumerate(pq_pairs):\n",
    "    print(f'Pair number {i+1}/{len(pq_pairs)}')\n",
    "\n",
    "    p_prime = (p_bar-p)/(1-p) # reduce it to take into account for the first round of sampling in the base SBM graph\n",
    "\n",
    "    sum_agreement = 0 # accumulate agreement across trials\n",
    "    for _ in range(NUM_TRIALS):\n",
    "        G_adv, _ = perturb(G = sbm(n, p, q, prg), G_exp = None, adversary_type = 1, params = (p_prime, prg)) # oversample edges according to p_bar\n",
    "        classifiers_adv = get_classifiers(G_adv, {matrix: True if matrix == 'UL' else False for matrix in MATRICES}) # compute the classifier from the unnormalized Laplacian\n",
    "        u_2 = classifiers_adv['UL'][0][1] # get the second eigenvector\n",
    "        sum_agreement += agreement(u_2, clusters_planted) # compute agreement with the planted bisection\n",
    "\n",
    "    agreement_vals[p_index][q_index] = sum_agreement/NUM_TRIALS # store the average agreement in the corresponding entry\n",
    "\n",
    "# store experiment setting and results to file\n",
    "experiment = {\n",
    "    'n': n,\n",
    "    'p_bar': p_bar,\n",
    "    'p_vals': p_vals,\n",
    "    'q_vals': q_vals,\n",
    "    'pq_pairs': pq_pairs,\n",
    "    'NUM_TRIALS': NUM_TRIALS,\n",
    "    'agreement_vals': agreement_vals\n",
    "}\n",
    "\n",
    "\n",
    "with open(f'experiment_NSSBM_pq_pairs_pbar_n_{n}_{str(p_bar).replace(\".\",\"\")}.res', 'wb') as output_file:\n",
    "    pkl.dump(experiment, output_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load experiment file and plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_file_name = f'experiment_NSSBM_pq_pairs_pbar_n_{n}_{str(p_bar).replace(\".\",\"\")}.res'\n",
    "with open(experiment_file_name, 'rb') as input_file:\n",
    "    loaded_experiment = pkl.load(input_file)\n",
    "\n",
    "    n_load = loaded_experiment['n']\n",
    "    min_p_load = np.min(np.array(loaded_experiment['p_vals']))\n",
    "    max_p_load = np.max(np.array(loaded_experiment['p_vals']))\n",
    "    min_q_load = np.min(np.array(loaded_experiment['q_vals']))\n",
    "    max_q_load = np.max(np.array(loaded_experiment['q_vals']))\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(4,3))\n",
    "\n",
    "    ax.set_title(r'$\\bar{p}=' + str(loaded_experiment['p_bar']) + '$')\n",
    "\n",
    "    plt.setp(ax, ylim=(min_p_load,max_p_load))\n",
    "    plt.ylabel(r'Intra-cluster edge probability $p$')\n",
    "    plt.xlabel(r'Inter-cluster edge probability $q$')\n",
    "\n",
    "    c = ax.pcolormesh(loaded_experiment['q_vals'], loaded_experiment['p_vals'], loaded_experiment['agreement_vals'], cmap='Blues', shading='nearest')\n",
    "\n",
    "    fig.colorbar(c, ax=ax)\n",
    "\n",
    "    ax.plot(np.linspace(min_q, max_q, 100), [(np.sqrt(n_load*np.log(n_load)*p_bar)+np.log(n_load))/n_load + q for q in np.linspace(min_q_load, max_q_load, 100)], color = 'red') # guaranteed threshold for NSBM\n",
    "    ax.plot(np.linspace(min_q, max_q, 100), [(np.sqrt(2*np.log(n_load)/n_load)+np.sqrt(q))**2 for q in np.linspace(min_q_load, max_q_load, 100)], color = 'red', linestyle='dashed') # optimal information theoretic threshold for vanilla SBM\n",
    "\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Nonhomogeneous SBM -- varying p and q, pbar=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prg = np.random.default_rng(SEED) # initialize the PRG\n",
    "\n",
    "# number of vertices in the graphs to generate\n",
    "n = 2000 #@param\n",
    "\n",
    "# set the p_bar parameter\n",
    "p_bar = 1 #@param\n",
    "\n",
    "# number of different values for p\n",
    "num_p_vals = 15 #@param\n",
    "# number of different values for q\n",
    "num_q_vals = 15 #@param\n",
    "\n",
    "# range of values for p\n",
    "min_p, max_p = 1/n, 0.45 #@param\n",
    "# range of values for q\n",
    "min_q, max_q = 1/n, 0.45 #@param\n",
    "\n",
    "width_p = (max_p - min_p)/num_p_vals # compute the spacing for values of p\n",
    "width_q = (max_q - min_q)/num_q_vals # compute the spacing for values of q\n",
    "\n",
    "p_vals = [min_p + i*width_p for i in range(num_p_vals)] + [max_p] # compute the list of values for p\n",
    "q_vals = [min_q + i*width_q for i in range(num_q_vals)] + [max_q] # compute the list of values for q\n",
    "\n",
    "pq_pairs = [(j, k, p, q) for j, p in enumerate(p_vals) for k, q in enumerate(q_vals) if p > q] # make a list of all relevant (p,q) pairs\n",
    "\n",
    "agreement_vals = [[0 for q in q_vals] for p in p_vals] # initilize a matrix of agreement values for each (p,q) pairs\n",
    "\n",
    "_, clusters_planted = sbm_exp(n, 0, 0) # get the planted bisection\n",
    "\n",
    "# number of independent repetitions for each p,q pair\n",
    "NUM_TRIALS = 3 #@param\n",
    "for i, (p_index, q_index, p, q) in enumerate(pq_pairs):\n",
    "    print(f'Pair number {i+1}/{len(pq_pairs)}')\n",
    "\n",
    "    p_prime = (p_bar-p)/(1-p) # reduce it to take into account for the first round of sampling in the base SBM graph\n",
    "\n",
    "    sum_agreement = 0 # accumulate agreement across trials\n",
    "    for _ in range(NUM_TRIALS):\n",
    "        G_adv, _ = perturb(G = sbm(n, p, q, prg), G_exp = None, adversary_type = 1, params = (p_prime, prg)) # oversample edges according to p_bar\n",
    "        classifiers_adv = get_classifiers(G_adv, {matrix: True if matrix == 'UL' else False for matrix in MATRICES}) # compute the classifier from the unnormalized Laplacian\n",
    "        u_2 = classifiers_adv['UL'][0][1] # get the second eigenvector\n",
    "        sum_agreement += agreement(u_2, clusters_planted) # compute agreement with the plated bisection\n",
    "\n",
    "    agreement_vals[p_index][q_index] = sum_agreement/NUM_TRIALS # store the average agreement in the corresponding entry\n",
    "\n",
    "# store experiment setting and results to file\n",
    "experiment = {\n",
    "    'n': n,\n",
    "    'p_bar': p_bar,\n",
    "    'p_vals': p_vals,\n",
    "    'q_vals': q_vals,\n",
    "    'pq_pairs': pq_pairs,\n",
    "    'NUM_TRIALS': NUM_TRIALS,\n",
    "    'agreement_vals': agreement_vals\n",
    "}\n",
    "\n",
    "\n",
    "with open(f'experiment_NSSBM_pq_pairs_pbar_n_{n}_{str(p_bar).replace(\".\",\"\")}.res', 'wb') as output_file:\n",
    "    pkl.dump(experiment, output_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load experiment file and plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# results file written by the experiment cell above; its name is derived from the notebook values of n and p_bar\n",
    "experiment_file_name = f'experiment_NSSBM_pq_pairs_pbar_n_{n}_{str(p_bar).replace(\".\",\"\")}.res'\n",
    "with open(experiment_file_name, 'rb') as input_file:\n",
    "    loaded_experiment = pkl.load(input_file)\n",
    "\n",
    "# from here on use only the loaded experiment (not the notebook globals), so that the threshold curves always match the stored heat map\n",
    "n_load = loaded_experiment['n']\n",
    "p_bar_load = loaded_experiment['p_bar']\n",
    "min_p_load = np.min(np.array(loaded_experiment['p_vals']))\n",
    "max_p_load = np.max(np.array(loaded_experiment['p_vals']))\n",
    "min_q_load = np.min(np.array(loaded_experiment['q_vals']))\n",
    "max_q_load = np.max(np.array(loaded_experiment['q_vals']))\n",
    "\n",
    "q_grid = np.linspace(min_q_load, max_q_load, 100) # x-coordinates shared by the two threshold curves\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(4,3))\n",
    "\n",
    "ax.set_title(r'$\\bar{p}=' + str(p_bar_load) + '$')\n",
    "\n",
    "plt.setp(ax, ylim=(min_p_load,max_p_load))\n",
    "plt.ylabel(r'Intra-cluster edge probability $p$')\n",
    "plt.xlabel(r'Inter-cluster edge probability $q$')\n",
    "\n",
    "# heat map of the average agreement: one row per value of p, one column per value of q\n",
    "c = ax.pcolormesh(loaded_experiment['q_vals'], loaded_experiment['p_vals'], loaded_experiment['agreement_vals'], cmap='Blues', shading='nearest')\n",
    "\n",
    "fig.colorbar(c, ax=ax)\n",
    "\n",
    "ax.plot(q_grid, (np.sqrt(n_load*np.log(n_load)*p_bar_load)+np.log(n_load))/n_load + q_grid, color = 'red') # guaranteed threshold for NSBM\n",
    "ax.plot(q_grid, (np.sqrt(2*np.log(n_load)/n_load)+np.sqrt(q_grid))**2, color = 'red', linestyle='dashed') # optimal information theoretic threshold for vanilla SBM\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Nonhomogeneous SBM -- varying p_bar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prg = np.random.default_rng(SEED) # fresh PRG seeded with SEED\n",
    "\n",
    "# number of vertices in the graphs to generate\n",
    "n = 2000 #@param\n",
    "\n",
    "# edge probabilities: p inside a cluster, q across clusters\n",
    "p = 24 * np.log(n)/n #@param\n",
    "q = 8 * np.log(n)/n #@param\n",
    "\n",
    "assert p/q >= 2  # the gap between p and q must be large enough\n",
    "\n",
    "# p_bar value beyond which other matrices should break\n",
    "p_bar_threshold = (3*p/q)*p\n",
    "assert p_bar_threshold <= 1  # it has to be a valid probability\n",
    "\n",
    "# maximum p_bar for which the Laplacian is supposed to work (as per theory)\n",
    "p_bar_max = (n*(p-q)-np.log(n))**2/(n*np.log(n))\n",
    "assert p_bar_max >= p_bar_threshold  # the theoretical maximum must lie above the point at which the other matrices should break\n",
    "\n",
    "_, clusters_planted = sbm_exp(n, p, q) # planted bisection\n",
    "\n",
    "# number of values for p_bar in the interval [p,1) to be considered\n",
    "num_p_bar_vals = 12 #@param\n",
    "\n",
    "width = (1-p)/num_p_bar_vals # spacing between consecutive grid values of p_bar\n",
    "\n",
    "# values of p_bar to test: the regular grid, minus the grid points that fall too close to p_bar_threshold or p_bar_max, plus the three special values 1, p_bar_threshold and p_bar_max\n",
    "min_distance = (p_bar_max-p_bar_threshold)/3\n",
    "p_bar_grid = [p + i*width for i in range(num_p_bar_vals)]\n",
    "p_bar_grid = [val for val in p_bar_grid if np.absolute(val - p_bar_threshold) > min_distance and np.absolute(val - p_bar_max) > min_distance]\n",
    "p_bar_vals = sorted(set(p_bar_grid + [1, p_bar_threshold, p_bar_max]))\n",
    "\n",
    "# per-matrix result lists, each with one entry per value of p_bar\n",
    "agreement_vals = {matrix: [] for matrix in MATRICES} # agreement of the 0-cut with the planted bisection\n",
    "sweep_vals = {matrix: [] for matrix in MATRICES} # agreement of the sweep bisection with the planted bisection\n",
    "variance_vals = {matrix: [] for matrix in MATRICES} # variance of the embedding wrt the planted bisection\n",
    "\n",
    "# number of independent repetitions for each value of p_bar\n",
    "NUM_TRIALS = 10 #@param\n",
    "for i, p_bar in enumerate(p_bar_vals):\n",
    "    print(f'Value number {i+1}/{len(p_bar_vals)}')\n",
    "\n",
    "    # the base SBM graph already samples intra-cluster edges w.p. p, so the second round only has to add the remainder\n",
    "    p_prime = (p_bar-p)/(1-p)\n",
    "\n",
    "    # running sums over the trials, one per measure of interest\n",
    "    sum_agreement = dict.fromkeys(MATRICES, 0)\n",
    "    sum_sweep = dict.fromkeys(MATRICES, 0)\n",
    "    sum_variance = dict.fromkeys(MATRICES, 0)\n",
    "\n",
    "    for _ in range(NUM_TRIALS):\n",
    "        G_sbm = sbm(n, p, q, prg) # base SBM sample\n",
    "        G_adv, _ = perturb(G = G_sbm, G_exp = None, adversary_type = 1, params = (p_prime, prg)) # oversample edges according to p_bar\n",
    "        classifiers_adv = get_classifiers(G_adv, dict.fromkeys(MATRICES, True)) # classifiers for all matrices\n",
    "\n",
    "        for matrix in MATRICES:\n",
    "            u_2 = classifiers_adv[matrix][0][1] # second eigenvector\n",
    "\n",
    "            sum_agreement[matrix] += agreement(u_2, clusters_planted)\n",
    "            sum_sweep[matrix] += best_sweep_cut(u_2, clusters_planted)\n",
    "            sum_variance[matrix] += variance(u_2, clusters_planted)\n",
    "\n",
    "    # record the average over the trials of each measure for this value of p_bar\n",
    "    for vals, sums in ((agreement_vals, sum_agreement), (sweep_vals, sum_sweep), (variance_vals, sum_variance)):\n",
    "        for matrix in MATRICES:\n",
    "            vals[matrix].append(sums[matrix]/NUM_TRIALS)\n",
    "\n",
    "# store experiment setting and results to file\n",
    "experiment = {\n",
    "    'n': n,\n",
    "    'p': p,\n",
    "    'q': q,\n",
    "    'p_bar_threshold': p_bar_threshold,\n",
    "    'p_bar_max': p_bar_max,\n",
    "    'p_bar_vals': p_bar_vals,\n",
    "    'NUM_TRIALS': NUM_TRIALS,\n",
    "    'agreement_vals': agreement_vals,\n",
    "    'sweep_vals': sweep_vals,\n",
    "    'variance_vals': variance_vals\n",
    "}\n",
    "\n",
    "with open(f'experiment_NSSBM_pbar_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}.res', 'wb') as output_file:\n",
    "    pkl.dump(experiment, output_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load experiment file and plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_file_name = f'experiment_NSSBM_pbar_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}.res'\n",
    "with open(experiment_file_name, 'rb') as input_file:\n",
    "    loaded_experiment = pkl.load(input_file)\n",
    "\n",
    "    p_bar_axis = loaded_experiment['p_bar_vals'] # x-coordinates shared by all the plots below\n",
    "\n",
    "    # the two agreement plots (0-cut first, sweep bisection second) share the same layout and differ only in title and data\n",
    "    for title, key in [(r'$0$-cut', 'agreement_vals'), ('Sweep bisection', 'sweep_vals')]:\n",
    "        fig, ax = plt.subplots(figsize=(4,3))\n",
    "\n",
    "        # plot values in the [0.45,1.05] range\n",
    "        custom_ylim = (0.45, 1.05)\n",
    "        plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "        ax.set_title(title)\n",
    "\n",
    "        plt.ylabel(r'Agreement with planted bisection')\n",
    "        plt.xlabel(r'Largest Intra-cluster edge probability $\\bar{p}$')\n",
    "\n",
    "        # one curve (markers plus connecting line) per matrix\n",
    "        for matrix in MATRICES:\n",
    "            curve = loaded_experiment[key][matrix]\n",
    "            ax.scatter(p_bar_axis, curve, color = MATRICES_COLORS[matrix])\n",
    "            ax.plot(p_bar_axis, curve, color = MATRICES_COLORS[matrix], label = MATRICES_LABELS[matrix])\n",
    "\n",
    "        # vertical markers: solid for p_bar_threshold, dashed for p_bar_max\n",
    "        plt.axvline(x = loaded_experiment['p_bar_threshold'], color = 'red')\n",
    "        plt.axvline(x = loaded_experiment['p_bar_max'], color = 'red', linestyle = 'dashed')\n",
    "\n",
    "        plt.legend(loc='center left')\n",
    "        plt.show()\n",
    "\n",
    "    # variance of the embedding\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(4,3))\n",
    "\n",
    "    plt.ylabel(r'Variance of $\\mathbf{u}_2$ w.r.t. $\\mathbf{u}^*_2$')\n",
    "    plt.xlabel(r'Largest Intra-cluster edge probability $\\bar{p}$')\n",
    "\n",
    "    # only the unnormalized Laplacian is shown in this plot\n",
    "    to_plot = {'UL'}\n",
    "\n",
    "    for matrix in MATRICES:\n",
    "        if matrix not in to_plot:\n",
    "            continue\n",
    "        curve = loaded_experiment['variance_vals'][matrix]\n",
    "        ax.scatter(p_bar_axis, curve, color = MATRICES_COLORS[matrix])\n",
    "        ax.plot(p_bar_axis, curve, color = MATRICES_COLORS[matrix], label = MATRICES_LABELS[matrix])\n",
    "\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Nonhomogeneous SBM -- plot embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prg = np.random.default_rng(SEED) # initialize the PRG\n",
    "\n",
    "# number of vertices in the graph to generate\n",
    "n = 2000 #@param\n",
    "\n",
    "# probability of intra-cluster edges\n",
    "p = 24 * np.log(n)/n #@param\n",
    "# probability of inter-cluster edges\n",
    "q = 8 * np.log(n)/n #@param\n",
    "\n",
    "_, clusters_planted = sbm_exp(n, p, q) # get the planted bisection\n",
    "\n",
    "# choose the type of adversarial perturbation\n",
    "perturb_type = 1 #@param\n",
    "\n",
    "# set appropriate parameters for the perturbation; each type expects its own kind of params value\n",
    "if perturb_type == 0:\n",
    "    parameters = None\n",
    "elif perturb_type == 1:\n",
    "    p_bar = 3*(p/q)*p #@param\n",
    "    p_prime = (p_bar-p)/(1-p) # reduce it to take into account for the first round of sampling in the base SBM graph\n",
    "    parameters = p_prime, prg\n",
    "elif perturb_type == 2:\n",
    "    clique_size = n//6 #@param\n",
    "    parameters = clique_size\n",
    "elif perturb_type == 3:\n",
    "    clique_sizes = (n//4, n//4) #@param\n",
    "    parameters = clique_sizes\n",
    "elif perturb_type == 4:\n",
    "    parameters = None\n",
    "elif perturb_type == 5:\n",
    "    num_stars = (5, 5) #@param\n",
    "    parameters = num_stars\n",
    "else:\n",
    "    # fail loudly even when assertions are disabled (python -O), instead of continuing with an undefined parameters\n",
    "    raise ValueError(f'unknown perturb_type: {perturb_type}')\n",
    "\n",
    "G_base = sbm(n, p, q, prg) # draw a sample from SBM graph\n",
    "G_adv, _ = perturb(G = G_base, G_exp = None, adversary_type = perturb_type, params = parameters) # perturb the graph\n",
    "\n",
    "classifiers_base = get_classifiers(G_base, {matrix: True for matrix in MATRICES}) # classifiers for the base SBM graph\n",
    "classifiers_adv = get_classifiers(G_adv, {matrix: True for matrix in MATRICES}) # classifiers for the nonhomogeneous SBM graph\n",
    "\n",
    "# store experiment setting and results to file\n",
    "experiment = {\n",
    "    'n': n,\n",
    "    'p': p,\n",
    "    'q': q,\n",
    "    'perturb_type': perturb_type,\n",
    "    'parameters': parameters,\n",
    "    'classifiers_base': classifiers_base,\n",
    "    'classifiers_adv': classifiers_adv,\n",
    "    'clusters_planted': clusters_planted\n",
    "}\n",
    "\n",
    "with open(f'experiment_embedding_NSSBM_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}_{perturb_type}.res', 'wb') as output_file:\n",
    "    pkl.dump(experiment, output_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load experiment file and plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_file_name = f'experiment_embedding_NSSBM_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}_{perturb_type}.res'\n",
    "with open(experiment_file_name, 'rb') as input_file:\n",
    "    loaded_experiment = pkl.load(input_file)\n",
    "\n",
    "    margin_y_axis = 2.0\n",
    "\n",
    "    # first the embeddings of the base SBM graph (before the perturbation), then those of the perturbed graph; one plot per matrix in each pass\n",
    "    for classifiers_key in ('classifiers_base', 'classifiers_adv'):\n",
    "        for matrix in MATRICES:\n",
    "            u_2 = loaded_experiment[classifiers_key][matrix][0][1] # second eigenvector\n",
    "            long_label = MATRICES_LABELS_LONG[matrix]\n",
    "            plot_embedding(u_2, loaded_experiment['clusters_planted'], margin = margin_y_axis, legend_position = 'lower right', heading = long_label if long_label != None else None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deterministic clusters -- varying size of planted clique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prg = np.random.default_rng(SEED) # fresh PRG seeded with SEED\n",
    "\n",
    "# number of vertices in the graph to generate\n",
    "n = 2000 #@param\n",
    "\n",
    "# edge probabilities: p inside a cluster, q across clusters\n",
    "p = 9 * 1/np.sqrt(n) #@param\n",
    "q = 1 * 1/np.sqrt(n) #@param\n",
    "\n",
    "# number of clique sizes to test (equally distributed in [n/2])\n",
    "num_clique_sizes = 10 #@param\n",
    "\n",
    "clique_sizes = [int(n//2 * 1/num_clique_sizes * k) for k in range(1, num_clique_sizes+1)]\n",
    "\n",
    "# per-matrix result lists, each with one entry per size of the planted clique\n",
    "agreement_vals = {matrix: [] for matrix in MATRICES} # agreement of the 0-cut with the planted bisection\n",
    "sweep_vals = {matrix: [] for matrix in MATRICES} # agreement of the sweep bisection with the planted bisection\n",
    "variance_vals = {matrix: [] for matrix in MATRICES} # variance of the embedding wrt the planted bisection\n",
    "\n",
    "# statistics returned by dcm_exp, one entry per size of the planted clique\n",
    "gap_vals = [] # spectral gap of the expected graph\n",
    "d_in_vals = [] # minimum in-cluster degree\n",
    "\n",
    "# number of independent repetitions for each clique size\n",
    "NUM_TRIALS = 10 #@param\n",
    "for i, size in enumerate(clique_sizes):\n",
    "    print(f'Size number {i+1}/{num_clique_sizes}')\n",
    "\n",
    "    # deterministic clusters with a planted clique of the current size, plus the planted bisection and the statistics\n",
    "    G_det, clusters_planted, d_in, lambda2_star, lambda3_star = dcm_exp(n, p, q, adversary_type = 2, params = size, prg = prg)\n",
    "\n",
    "    # running sums over the trials, one per measure of interest\n",
    "    sum_agreement = dict.fromkeys(MATRICES, 0)\n",
    "    sum_sweep = dict.fromkeys(MATRICES, 0)\n",
    "    sum_variance = dict.fromkeys(MATRICES, 0)\n",
    "\n",
    "    for _ in range(NUM_TRIALS):\n",
    "        G_sample = dcm(G_det, q, prg) # draw a sample\n",
    "        classifiers = get_classifiers(G_sample, dict.fromkeys(MATRICES, True)) # classifiers for the DCM sampled graph\n",
    "\n",
    "        for matrix in MATRICES:\n",
    "            u_2 = classifiers[matrix][0][1] # second eigenvector\n",
    "\n",
    "            sum_agreement[matrix] += agreement(u_2, clusters_planted)\n",
    "            sum_sweep[matrix] += best_sweep_cut(u_2, clusters_planted)\n",
    "            sum_variance[matrix] += variance(u_2, clusters_planted)\n",
    "\n",
    "    # record the average over the trials of each measure for this clique size\n",
    "    for vals, sums in ((agreement_vals, sum_agreement), (sweep_vals, sum_sweep), (variance_vals, sum_variance)):\n",
    "        for matrix in MATRICES:\n",
    "            vals[matrix].append(sums[matrix]/NUM_TRIALS)\n",
    "\n",
    "    d_in_vals.append(d_in)\n",
    "    gap_vals.append(lambda3_star-lambda2_star)\n",
    "\n",
    "# store experiment setting and results to file\n",
    "experiment = {\n",
    "    'n': n,\n",
    "    'p': p,\n",
    "    'q': q,\n",
    "    'clique_sizes': clique_sizes,\n",
    "    'NUM_TRIALS': NUM_TRIALS,\n",
    "    'agreement_vals': agreement_vals,\n",
    "    'sweep_vals': sweep_vals,\n",
    "    'variance_vals': variance_vals,\n",
    "    'gap_vals': gap_vals,\n",
    "    'd_in_vals': d_in_vals\n",
    "}\n",
    "\n",
    "with open(f'experiment_DCM_clique_sizes_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}.res', 'wb') as output_file:\n",
    "    pkl.dump(experiment, output_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load experiment file and plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_file_name = f'experiment_DCM_clique_sizes_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}.res'\n",
    "with open(experiment_file_name, 'rb') as input_file:\n",
    "    loaded_experiment = pkl.load(input_file)\n",
    "\n",
    "    n_load = loaded_experiment['n']\n",
    "    q_load = loaded_experiment['q']\n",
    "    clique_axis = loaded_experiment['clique_sizes'] # x-coordinates shared by all the plots below\n",
    "\n",
    "    # the two agreement plots (0-cut first, sweep bisection second) share the same layout and differ only in title and data\n",
    "    for title, key in [(r'$0$-cut', 'agreement_vals'), ('Sweep bisection', 'sweep_vals')]:\n",
    "        fig, ax = plt.subplots(figsize=(4,3))\n",
    "\n",
    "        # plot values in the [0.45,1.05] range\n",
    "        custom_ylim = (0.45, 1.05)\n",
    "        plt.setp(ax, ylim=custom_ylim)\n",
    "\n",
    "        ax.set_title(title)\n",
    "\n",
    "        plt.ylabel(r'Agreement with planted bisection')\n",
    "        plt.xlabel(r'Size of planted clique')\n",
    "\n",
    "        # one curve (markers plus connecting line) per matrix\n",
    "        for matrix in MATRICES:\n",
    "            curve = loaded_experiment[key][matrix]\n",
    "            ax.scatter(clique_axis, curve, color = MATRICES_COLORS[matrix])\n",
    "            ax.plot(clique_axis, curve, color = MATRICES_COLORS[matrix], label = MATRICES_LABELS[matrix])\n",
    "\n",
    "        plt.legend(loc='center left')\n",
    "        plt.show()\n",
    "\n",
    "    # minimum in-cluster degree (blue) against the constant n*q + sqrt(n) (red)\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(4,3))\n",
    "\n",
    "    plt.ylabel(r'$d_{\\mathsf{in}}$')\n",
    "    plt.xlabel(r'Size of planted clique')\n",
    "\n",
    "    d_in_reference = n_load*q_load+np.sqrt(n_load)\n",
    "    ax.plot(clique_axis, loaded_experiment['d_in_vals'], color = 'blue')\n",
    "    ax.plot(clique_axis, [d_in_reference] * len(clique_axis), color = 'red')\n",
    "\n",
    "    plt.show()\n",
    "\n",
    "    # spectral gap (blue) against the constant n*q + sqrt(n) + sqrt(n*q*log(n)) + log(n) (red)\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(4,3))\n",
    "    plt.ylabel(r'$\\lambda_3(\\hat{\\mathbf{L}})-\\lambda_2(\\hat{\\mathbf{L}})$')\n",
    "    plt.xlabel(r'Size of planted clique')\n",
    "\n",
    "    gap_reference = n_load*q_load+np.sqrt(n_load)+np.sqrt(n_load*q_load*np.log(n_load))+np.log(n_load)\n",
    "    ax.plot(clique_axis, loaded_experiment['gap_vals'], color = 'blue')\n",
    "    ax.plot(clique_axis, [gap_reference] * len(clique_axis), color = 'red')\n",
    "\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deterministic clusters -- plot embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prg = np.random.default_rng(SEED) # initialize the PRG, so that this cell does not depend on a generator left over from an earlier cell\n",
    "\n",
    "# number of vertices in the graph to generate\n",
    "n = 2000 #@param\n",
    "\n",
    "# probability of intra-cluster edges\n",
    "p = 9 * 1/np.sqrt(n) #@param\n",
    "# probability of inter-cluster edges\n",
    "q = 1 * 1/np.sqrt(n) #@param\n",
    "\n",
    "# choose the type of adversarial perturbation\n",
    "perturb_type = 2 #@param\n",
    "\n",
    "# set appropriate parameters for the perturbation; each type expects its own kind of params value\n",
    "if perturb_type == 0:\n",
    "    parameters = None\n",
    "elif perturb_type == 1:\n",
    "    p_bar = 3*(p/q)*p #@param\n",
    "    p_prime = (p_bar-p)/(1-p) # reduce it to take into account for the first round of sampling in the base SBM graph\n",
    "    parameters = p_prime, prg\n",
    "elif perturb_type == 2:\n",
    "    clique_size = int(2/5*n) #@param\n",
    "    parameters = clique_size\n",
    "elif perturb_type == 3:\n",
    "    clique_sizes = (n//4, n//4) #@param\n",
    "    parameters = clique_sizes\n",
    "elif perturb_type == 4:\n",
    "    parameters = None\n",
    "elif perturb_type == 5:\n",
    "    num_stars = (5, 5) #@param\n",
    "    parameters = num_stars # pass the star counts defined on the line above (not clique_sizes, which belongs to type 3)\n",
    "else:\n",
    "    # fail loudly even when assertions are disabled (python -O), instead of continuing with an undefined parameters\n",
    "    raise ValueError(f'unknown perturb_type: {perturb_type}')\n",
    "\n",
    "G_det, clusters_planted, d_in, lambda2_star, lambda3_star = dcm_exp(n, p, q, perturb_type, parameters, prg) # get the deterministic clusters, the planted bisection, and some statistics of the expected graph\n",
    "\n",
    "G_sample = dcm(G_det, q, prg) # draw a sample\n",
    "\n",
    "classifiers = get_classifiers(G_sample, {matrix: True for matrix in MATRICES}) # classifiers for the DCM sampled graph\n",
    "\n",
    "# store experiment setting and results to file\n",
    "experiment = {\n",
    "    'n': n,\n",
    "    'p': p,\n",
    "    'q': q,\n",
    "    'perturb_type': perturb_type,\n",
    "    'parameters': parameters,\n",
    "    'd_in': d_in,\n",
    "    'lambda2_star': lambda2_star,\n",
    "    'lambda3_star': lambda3_star,\n",
    "    'classifiers': classifiers,\n",
    "    'clusters_planted': clusters_planted\n",
    "}\n",
    "\n",
    "with open(f'experiment_embedding_DCM_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}_{perturb_type}_{parameters}.res', 'wb') as output_file:\n",
    "    pkl.dump(experiment, output_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load experiment file and plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_file_name = f'experiment_embedding_DCM_n_{n}_p_{str(p).replace(\".\",\"\")}_q_{str(q).replace(\".\",\"\")}_{perturb_type}_{parameters}.res'\n",
    "with open(experiment_file_name, 'rb') as input_file:\n",
    "    loaded_experiment = pkl.load(input_file)\n",
    "\n",
    "    margin_y_axis = 2.0\n",
    "\n",
    "    # one embedding plot per matrix, each built from the second eigenvector of the sampled graph\n",
    "    for matrix in MATRICES:\n",
    "        u_2 = loaded_experiment['classifiers'][matrix][0][1]\n",
    "        long_label = MATRICES_LABELS_LONG[matrix]\n",
    "        plot_embedding(u_2, loaded_experiment['clusters_planted'], margin = margin_y_axis, legend_position = 'center right', heading = long_label if long_label != None else None)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
