{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "X__H-7mThqc8",
        "Dz5N9LCypEOm",
        "990eZAv0p0TS",
        "za-kqK6T4Vmi",
        "vxwr-JE3p4KK",
        "amYWMX3clvAL",
        "36V_xHeWpgzE"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Import Libraries"
      ],
      "metadata": {
        "id": "TYqqrdFndOj_"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Import Graph-tool"
      ],
      "metadata": {
        "id": "X__H-7mThqc8"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "b61_iWWSgzWT"
      },
      "outputs": [],
      "source": [
        "!pip install -q condacolab\n",
        "import condacolab\n",
        "condacolab.install()\n",
        "!conda install -c conda-forge graph-tool"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!conda install scikit-learn"
      ],
      "metadata": {
        "id": "mL3HcsfsxTyx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# test graph-tool\n",
        "from graph_tool.all import *"
      ],
      "metadata": {
        "id": "rBzPMIuah10u"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Import GraphWorld"
      ],
      "metadata": {
        "id": "Dz5N9LCypEOm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!git clone https://github.com/google-research/graphworld.git"
      ],
      "metadata": {
        "id": "fzCCnPf-pAzy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%cd graphworld/src\n",
        "!pip install -r requirements.txt"
      ],
      "metadata": {
        "id": "TXSKoXpxpIVG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%matplotlib inline\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import graph_tool.all as gt\n",
        "import sklearn"
      ],
      "metadata": {
        "id": "2nQKCMiGpMbc"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%cd /content/graphworld/src"
      ],
      "metadata": {
        "id": "Lfj9bePapS96"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from graph_world.beam.generator_config_sampler import ParamSamplerSpec\n",
        "from graph_world.beam.generator_beam_handler import GeneratorBeamHandler\n",
        "import graph_world.generators.sbm_simulator\n",
        "from graph_world.generators.sbm_simulator import GenerateStochasticBlockModelWithFeatures, MatchType, MakePi, MakeDegrees, MakePropMat\n",
        "from graph_world.metrics.graph_metrics import graph_metrics, graph_metrics_nx\n",
        "from graph_world.metrics.node_label_metrics import NodeLabelMetrics"
      ],
      "metadata": {
        "id": "5VkGz1bRpfaW"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Import GLI module"
      ],
      "metadata": {
        "id": "990eZAv0p0TS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%cd /content\n",
        "!git clone https://github.com/Graph-Learning-Benchmarks/gli.git"
      ],
      "metadata": {
        "id": "nbsF3FC8p2fh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%cd /content/gli\n",
        "!pip install -e ."
      ],
      "metadata": {
        "id": "aj7hRVISqFrA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Install torch-sparse / pyyaml"
      ],
      "metadata": {
        "id": "za-kqK6T4Vmi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+${CUDA}.html\n",
        "!pip install pyyaml"
      ],
      "metadata": {
        "id": "n0qpiRSkspkt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Controlled Experiments"
      ],
      "metadata": {
        "id": "nJKH7WWjdmA5"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Varying Gini-Degree"
      ],
      "metadata": {
        "id": "UPVhlgOkpx2j"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# the \n",
        "def gini(array):\n",
        "    # Values cannot be 0:\n",
        "    array = array + 0.000000001\n",
        "    # Values must be sorted:\n",
        "    array = np.sort(array)\n",
        "    # Index per array element:\n",
        "    index = np.arange(1,array.shape[0]+1)\n",
        "    # Number of array elements:\n",
        "    n = array.shape[0]\n",
        "    # Gini coefficient:\n",
        "    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array)))"
      ],
      "metadata": {
        "id": "kpyjrF4uen_b"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import random\n",
        "import pandas as pd\n",
        "import networkx as nx\n",
        "import os\n",
        "import numpy as np\n",
        "import graph_tool as gt\n",
        "import torch\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "parent_dir = \"/content/gli/datasets/\"\n",
        "partition = {\"train_FOLD\":0.6, \"val_FOLD\":0.2, \"test_FOLD\":0.2}\n",
        "num_of_split = 1\n",
        "# transform gini coefficient \n",
        "\n",
        "# set GraphWorld parameters\n",
        "NVERTEX = 5000\n",
        "FEATURE_CENTER_DISTANCE = 0.05\n",
        "P2Q = 3.0\n",
        "CLUSTER_SIZE_SLOPE = 0.0\n",
        "\n",
        "avg_degree = 20\n",
        "feature_dim = 16\n",
        "edge_center_distance = 2.0\n",
        "edge_feature_dim = 2\n",
        "feature_cluster_variance = 0.25\n",
        "\n",
        "NUM_CLUSTERS = 4\n",
        "\n",
        "gini_arr = [1.5, 2, 2.5, 3, 5]\n",
        "repeat = 1\n",
        "for i in range(len(gini_arr)):\n",
        "  # power exponent!\n",
        "  POWER_EXPONENT = gini_arr[i]\n",
        "  # to build additional datasets\n",
        "  \n",
        "  for j in range(repeat):\n",
        "\n",
        "    # create directory \n",
        "    directory = \"gw_gini_\"+str(i)+\"_\"+str(j)\n",
        "    # Path\n",
        "    # if we want to train, then add this line\n",
        "    directory = os.path.join(parent_dir, directory)\n",
        "    if not os.path.exists(directory):\n",
        "      os.mkdir(directory)\n",
        "\n",
        "    pi = graph_world.generators.sbm_simulator.MakePi(num_communities=NUM_CLUSTERS, community_size_slope = CLUSTER_SIZE_SLOPE)\n",
        "    prop_mat = graph_world.generators.sbm_simulator.MakePropMat(num_communities=NUM_CLUSTERS, p_to_q_ratio=P2Q)\n",
        "    out_degrees = graph_world.generators.sbm_simulator.MakeDegrees(POWER_EXPONENT, 1, NVERTEX)\n",
        "\n",
        "    sampler_out = graph_world.generators.sbm_simulator.GenerateStochasticBlockModelWithFeatures(\n",
        "      num_vertices=NVERTEX,\n",
        "      num_edges=NVERTEX*avg_degree,\n",
        "      pi=pi,\n",
        "      prop_mat=prop_mat,\n",
        "      out_degs=out_degrees,\n",
        "      feature_center_distance=FEATURE_CENTER_DISTANCE,\n",
        "      feature_dim=feature_dim,\n",
        "      num_feature_groups=NUM_CLUSTERS,\n",
        "      feature_group_match_type=MatchType.GROUPED,\n",
        "      feature_cluster_variance=feature_cluster_variance,\n",
        "      edge_feature_dim=edge_feature_dim,\n",
        "      edge_center_distance=edge_center_distance,\n",
        "      edge_cluster_variance=1,\n",
        "      normalize_features=True)\n",
        "    \n",
        "    graph = sampler_out.graph\n",
        "    memberships = sampler_out.graph_memberships\n",
        "    feature_memberships = sampler_out.feature_memberships\n",
        "    features = sampler_out.node_features\n",
        "    degrees = graph.get_out_degrees(graph.get_vertices())\n",
        "    num_removed = 0\n",
        "    for z, d in enumerate(degrees):\n",
        "      if d == 0:\n",
        "          graph.remove_vertex(z - num_removed)\n",
        "          memberships = np.delete(memberships, [z - num_removed])\n",
        "          features = np.delete(features, [z - num_removed], axis=0)\n",
        "          num_removed += 1\n",
        "    # gt.remove_self_loops(graph)\n",
        "\n",
        "\n",
        "    # for printing out coreness gini value\n",
        "    # out = {}                                    \n",
        "    # nx_graph = nx.Graph()\n",
        "    # edge_list = [(int(e.source()), int(e.target())) for e in graph.edges()]\n",
        "    # nx_graph.add_edges_from(edge_list)\n",
        "    \n",
        "    # degree_sequence = [d for n, d in nx_graph.degree()]\n",
        "    # degree_sequence = np.sort(degree_sequence)\n",
        "    # # print(degree_sequence)\n",
        "    # # fit = powerlaw.Fit(degree_sequence, verbose=False)\n",
        "    # # print(fit.power_law.alpha)\n",
        "\n",
        "    # out['metrics'] = graph_metrics_nx(nx_graph)\n",
        "    print(\"degree_gini: \", gini(out_degrees))\n",
        "\n",
        "    # for dataset.npz\n",
        "    output = {}\n",
        "    output[\"node_feats\"] = torch.from_numpy(features.astype(\"float32\"))\n",
        "    output[\"node_class\"] = torch.from_numpy(memberships)\n",
        "\n",
        "    output[\"edge\"] = torch.from_numpy(graph.get_edges())\n",
        "    output[\"edge_list\"] = torch.from_numpy(np.ones(len(list(graph.edges()))))\n",
        "    output[\"node_list\"] = torch.from_numpy(np.ones(len(list(graph.vertices()))))\n",
        "    print(output[\"node_class\"].shape)\n",
        "    np.savez(\"/content/gli/datasets/gw_gini_\"+str(i)+\"_\"+str(j)+\"/gw_gini_\"+str(i)+\"_\"+str(j)+\".npz\", **output, allow_pickle=True)\n",
        "\n",
        "\n",
        "    # for dataset_task.npz\n",
        "    output_task = {}\n",
        "    node_ids = list(range(len(list(graph.vertices()))))\n",
        "    train_str = \"train_\"\n",
        "    val_str = \"val_\"\n",
        "    test_str = 'test_'\n",
        "    for z in range(num_of_split):\n",
        "      random.shuffle(node_ids)\n",
        "      train_len = int(len(node_ids) * partition[\"train_FOLD\"])\n",
        "      val_len = int(len(node_ids) * partition[\"val_FOLD\"])\n",
        "      test_len = int(len(node_ids) * partition[\"test_FOLD\"])\n",
        "      output_task[\"train\"] = node_ids[:train_len]\n",
        "      output_task[\"val\"] = node_ids[train_len:train_len+val_len]\n",
        "      output_task[\"test\"] = node_ids[train_len+val_len:]\n",
        "\n",
        "      np.savez(\"/content/gli/datasets/gw_gini_\"+str(i)+\"_\"+str(j)+\"/gw_gini_\"+str(i)+\"_\"+str(j)+\"_task.npz\", **output_task, allow_pickle=True)\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "CKA9oqZmsA9u",
        "outputId": "f9c81565-7399-4077-a1b4-5f0ca4efc4d9"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "degree_gini:  0.9068595083485667\n",
            "torch.Size([3111])\n",
            "degree_gini:  0.8048813622687628\n",
            "torch.Size([4741])\n",
            "degree_gini:  0.5000619029505455\n",
            "torch.Size([4999])\n",
            "degree_gini:  0.3609721190090517\n",
            "torch.Size([5000])\n",
            "degree_gini:  0.06935995544182426\n",
            "torch.Size([5000])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Build GLI Format and Store Dataset"
      ],
      "metadata": {
        "id": "fHZfr9GUlXpi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# output json files\n",
        "import json\n",
        "\n",
        "# 4 values of power expo\n",
        "for i in range(len(gini_arr)):\n",
        "  # to build additional datasets\n",
        "  # 5 independent datasets\n",
        "  for j in range(repeat):\n",
        "\n",
        "    # for metadata.json files\n",
        "    dataset_str = \"gw_gini_\"+str(i)+\"_\"+str(j)+\".npz\"\n",
        "    print(\"dataset_str: \", dataset_str)\n",
        "    metadata_json = {}\n",
        "    metadata_json[\"description\"] = \"Random Generated Dataset.\"\n",
        "    metadata_json[\"data\"] = {}\n",
        "\n",
        "    metadata_json[\"data\"][\"Node\"] = {}\n",
        "    metadata_json[\"data\"][\"Edge\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"] = {}\n",
        "    metadata_json[\"data\"][\"Node\"][\"NodeFeature\"] = {\"description\": \"Node features of random generated dataset, real-valued vectors.\",\n",
        "                            \"type\": \"double\",\n",
        "                            \"format\": \"Tensor\",\n",
        "                            \"file\": dataset_str,\n",
        "                            \"key\": \"node_feats\"}\n",
        "\n",
        "\n",
        "    metadata_json[\"data\"][\"Node\"][\"NodeLabel\"] = {\"description\": \"Node labels of random generated dataset, int ranged from 1 to 4.\",\n",
        "                                          \"type\": \"int\",\n",
        "                                          \"format\": \"Tensor\",\n",
        "                                          \"file\": dataset_str,\n",
        "                                          \"key\": \"node_class\"\n",
        "                                          }\n",
        "\n",
        "    metadata_json[\"data\"][\"Edge\"][\"_Edge\"] = {\"file\":dataset_str, \"key\":\"edge\"}\n",
        "\n",
        "    metadata_json[\"data\"][\"Graph\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"][\"file\"] = dataset_str\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"][\"key\"] = \"node_list\"\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"][\"file\"] = dataset_str\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"][\"key\"] = \"edge_list\"\n",
        "\n",
        "\n",
        "    metadata_json[\"citation\"]=\"@inproceedings{10.1145/3534678.3539203,\\\n",
        "    author = {Palowitch, John and Tsitsulin, Anton and Mayer, Brandon and Perozzi, Bryan},\\\n",
        "    title = {GraphWorld: Fake Graphs Bring Real Insights for GNNs},\\\n",
        "    year = {2022},\\\n",
        "    isbn = {9781450393850},\\\n",
        "    publisher = {Association for Computing Machinery},\\\n",
        "    url = {https://doi.org/10.1145/3534678.3539203},\\\n",
        "    doi = {10.1145/3534678.3539203},\\\n",
        "    booktitle = {Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},\\\n",
        "    pages = {3691–3701},\\\n",
        "    series = {KDD '22}}\"\n",
        "\n",
        "    metadata_json[\"is_heterogeneous\"] = False\n",
        "\n",
        "\n",
        "    # for task_node_classification_1.json file\n",
        "    task_json = {}\n",
        "    task_str = \"gw_gini_\"+str(i)+\"_\"+str(j)+\"_task\"+\".npz\"\n",
        "    print(\"task_str: \", task_str)\n",
        "    task_json[\"description\"] = \"Node classification on random generated dataset.\"\n",
        "    task_json[\"type\"] = \"NodeClassification\"\n",
        "    task_json[\"feature\"] = [\"Node/NodeFeature\"]\n",
        "    task_json[\"target\"] = \"Node/NodeLabel\"\n",
        "    task_json[\"num_classes\"] = NUM_CLUSTERS\n",
        "    task_json[\"train_set\"] = {}\n",
        "    task_json[\"train_set\"][\"file\"] = task_str\n",
        "    task_json[\"train_set\"][\"key\"] = \"train\"\n",
        "\n",
        "    task_json[\"val_set\"] = {}\n",
        "    task_json[\"val_set\"][\"file\"] = task_str\n",
        "    task_json[\"val_set\"][\"key\"] = \"val\"\n",
        "\n",
        "    task_json[\"test_set\"] = {}\n",
        "    task_json[\"test_set\"][\"file\"] = task_str\n",
        "    task_json[\"test_set\"][\"key\"] = \"test\"\n",
        "\n",
        "    # for urls.json files\n",
        "    url_json = {}\n",
        "    url_json[dataset_str] = \"\"\n",
        "    url_json[task_str] = \"\"\n",
        "\n",
        "    # output to each directory\n",
        "    dir_str = \"gw_gini_\"+str(i)+\"_\"+str(j)\n",
        "    with open(parent_dir+dir_str+'/metadata.json', 'w') as fp:\n",
        "      json.dump(metadata_json, fp, indent=4)\n",
        "    with open(parent_dir+dir_str+'/task_node_classification_1.json', 'w') as fp:\n",
        "      json.dump(task_json, fp, indent=4)\n",
        "    with open(parent_dir+dir_str+'/urls.json', 'w') as fp:\n",
        "      json.dump(url_json, fp, indent=4)\n",
        " "
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "U1wisVAulQKs",
        "outputId": "6e902804-9060-4162-f33c-652cf99319ea"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "dataset_str:  gw_gini_0_0.npz\n",
            "task_str:  gw_gini_0_0_task.npz\n",
            "dataset_str:  gw_gini_1_0.npz\n",
            "task_str:  gw_gini_1_0_task.npz\n",
            "dataset_str:  gw_gini_2_0.npz\n",
            "task_str:  gw_gini_2_0_task.npz\n",
            "dataset_str:  gw_gini_3_0.npz\n",
            "task_str:  gw_gini_3_0_task.npz\n",
            "dataset_str:  gw_gini_4_0.npz\n",
            "task_str:  gw_gini_4_0_task.npz\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!zip -r /content/all_data_new.zip /content/gli/datasets/gw_gini_*\n",
        "from google.colab import files\n",
        "files.download(\"/content/all_data_new.zip\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 538
        },
        "id": "I67xFIyElboR",
        "outputId": "fafb7301-3812-4bb7-e707-a0881c8c29b3"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  adding: content/gli/datasets/gw_gini_0_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_gini_0_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_gini_0_0/gw_gini_0_0_task.npz (deflated 70%)\n",
            "  adding: content/gli/datasets/gw_gini_0_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_gini_0_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_gini_0_0/gw_gini_0_0.npz (deflated 73%)\n",
            "  adding: content/gli/datasets/gw_gini_1_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_gini_1_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_gini_1_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_gini_1_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_gini_1_0/gw_gini_1_0.npz (deflated 67%)\n",
            "  adding: content/gli/datasets/gw_gini_1_0/gw_gini_1_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_gini_2_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_gini_2_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_gini_2_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_gini_2_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_gini_2_0/gw_gini_2_0.npz (deflated 71%)\n",
            "  adding: content/gli/datasets/gw_gini_2_0/gw_gini_2_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_gini_3_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_gini_3_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_gini_3_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_gini_3_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_gini_3_0/gw_gini_3_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_gini_3_0/gw_gini_3_0.npz (deflated 72%)\n",
            "  adding: content/gli/datasets/gw_gini_4_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_gini_4_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_gini_4_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_gini_4_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_gini_4_0/gw_gini_4_0.npz (deflated 72%)\n",
            "  adding: content/gli/datasets/gw_gini_4_0/gw_gini_4_0_task.npz (deflated 69%)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_2ff363c9-b778-4f57-90ea-ab233b545729\", \"all_data_new.zip\", 2033065)"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Varying Avg Degree"
      ],
      "metadata": {
        "id": "vxwr-JE3p4KK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import random\n",
        "import pandas as pd\n",
        "import networkx as nx\n",
        "import os\n",
        "import numpy as np\n",
        "import graph_tool as gt\n",
        "import torch\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "parent_dir = \"/content/gli/datasets/\"\n",
        "partition = {\"train_FOLD\":0.6, \"val_FOLD\":0.2, \"test_FOLD\":0.2}\n",
        "num_of_split = 1\n",
        "# transform gini coefficient \n",
        "\n",
        "# set GraphWorld parameters\n",
        "NVERTEX = 5000\n",
        "FEATURE_CENTER_DISTANCE = 0.05\n",
        "P2Q = 3.0\n",
        "CLUSTER_SIZE_SLOPE = 0.0\n",
        "\n",
        "# avg_degree = 20\n",
        "feature_dim = 16\n",
        "edge_center_distance = 2.0\n",
        "edge_feature_dim = 2\n",
        "feature_cluster_variance = 0.25\n",
        "# POWER_EXPONENT = 2\n",
        "\n",
        "\n",
        "NUM_CLUSTERS = 4\n",
        "repeat = 1\n",
        "POWER_EXPONENT = 2\n",
        "\n",
        "\n",
        "\n",
        "deg_arr = [10, 20, 30, 40, 50]\n",
        "for i in range(len(deg_arr)):\n",
        "  \n",
        "  avg_degree = deg_arr[i]\n",
        "  \n",
        "  # to build additional datasets\n",
        "  \n",
        "  for j in range(repeat):\n",
        "\n",
        "    # create directory \n",
        "    directory = \"gw_deg_\"+str(i)+\"_\"+str(j)\n",
        "    # Path\n",
        "    # if we want to train, then add this line\n",
        "    directory = os.path.join(parent_dir, directory)\n",
        "    if not os.path.exists(directory):\n",
        "      os.mkdir(directory)\n",
        "\n",
        "    pi = graph_world.generators.sbm_simulator.MakePi(num_communities=NUM_CLUSTERS, community_size_slope = CLUSTER_SIZE_SLOPE)\n",
        "    prop_mat = graph_world.generators.sbm_simulator.MakePropMat(num_communities=NUM_CLUSTERS, p_to_q_ratio=P2Q)\n",
        "    out_degrees = graph_world.generators.sbm_simulator.MakeDegrees(POWER_EXPONENT, 1, NVERTEX)\n",
        "\n",
        "    sampler_out = graph_world.generators.sbm_simulator.GenerateStochasticBlockModelWithFeatures(\n",
        "      num_vertices=NVERTEX,\n",
        "      num_edges=NVERTEX*avg_degree,\n",
        "      pi=pi,\n",
        "      prop_mat=prop_mat,\n",
        "      out_degs=out_degrees,\n",
        "      feature_center_distance=FEATURE_CENTER_DISTANCE,\n",
        "      feature_dim=feature_dim,\n",
        "      num_feature_groups=NUM_CLUSTERS,\n",
        "      feature_group_match_type=MatchType.GROUPED,\n",
        "      feature_cluster_variance=feature_cluster_variance,\n",
        "      edge_feature_dim=edge_feature_dim,\n",
        "      edge_center_distance=edge_center_distance,\n",
        "      edge_cluster_variance=1,\n",
        "      normalize_features=True)\n",
        "    \n",
        "    graph = sampler_out.graph\n",
        "    memberships = sampler_out.graph_memberships\n",
        "    feature_memberships = sampler_out.feature_memberships\n",
        "    features = sampler_out.node_features\n",
        "    degrees = graph.get_out_degrees(graph.get_vertices())\n",
        "    num_removed = 0\n",
        "    for z, d in enumerate(degrees):\n",
        "      if d == 0:\n",
        "          graph.remove_vertex(z - num_removed)\n",
        "          memberships = np.delete(memberships, [z - num_removed])\n",
        "          features = np.delete(features, [z - num_removed], axis=0)\n",
        "          num_removed += 1\n",
        "    # gt.remove_self_loops(graph)\n",
        "\n",
        "\n",
        "    # for printing out coreness gini value\n",
        "    # out = {}                                    \n",
        "    # nx_graph = nx.Graph()\n",
        "    # edge_list = [(int(e.source()), int(e.target())) for e in graph.edges()]\n",
        "    # nx_graph.add_edges_from(edge_list)\n",
        "    \n",
        "    # degree_sequence = [d for n, d in nx_graph.degree()]\n",
        "    # degree_sequence = np.sort(degree_sequence)\n",
        "    # # print(degree_sequence)\n",
        "    # # fit = powerlaw.Fit(degree_sequence, verbose=False)\n",
        "    # # print(fit.power_law.alpha)\n",
        "\n",
        "    # out['metrics'] = graph_metrics_nx(nx_graph)\n",
        "    # print(\"psu_diameter: \", _diameter(nx_graph))\n",
        "\n",
        "    # for dataset.npz\n",
        "    output = {}\n",
        "    output[\"node_feats\"] = torch.from_numpy(features.astype(\"float32\"))\n",
        "    output[\"node_class\"] = torch.from_numpy(memberships)\n",
        "\n",
        "    output[\"edge\"] = torch.from_numpy(graph.get_edges())\n",
        "    output[\"edge_list\"] = torch.from_numpy(np.ones(len(list(graph.edges()))))\n",
        "    output[\"node_list\"] = torch.from_numpy(np.ones(len(list(graph.vertices()))))\n",
        "    print(output[\"node_class\"].shape)\n",
        "    np.savez(\"/content/gli/datasets/gw_deg_\"+str(i)+\"_\"+str(j)+\"/gw_deg_\"+str(i)+\"_\"+str(j)+\".npz\", **output, allow_pickle=True)\n",
        "\n",
        "\n",
        "    # for dataset_task.npz\n",
        "    output_task = {}\n",
        "    node_ids = list(range(len(list(graph.vertices()))))\n",
        "    train_str = \"train_\"\n",
        "    val_str = \"val_\"\n",
        "    test_str = 'test_'\n",
        "    for z in range(num_of_split):\n",
        "      random.shuffle(node_ids)\n",
        "      train_len = int(len(node_ids) * partition[\"train_FOLD\"])\n",
        "      val_len = int(len(node_ids) * partition[\"val_FOLD\"])\n",
        "      test_len = int(len(node_ids) * partition[\"test_FOLD\"])\n",
        "      output_task[\"train\"] = node_ids[:train_len]\n",
        "      output_task[\"val\"] = node_ids[train_len:train_len+val_len]\n",
        "      output_task[\"test\"] = node_ids[train_len+val_len:]\n",
        "\n",
        "      np.savez(\"/content/gli/datasets/gw_deg_\"+str(i)+\"_\"+str(j)+\"/gw_deg_\"+str(i)+\"_\"+str(j)+\"_task.npz\", **output_task, allow_pickle=True)\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "SXL16mZnp7zA",
        "outputId": "2ed70463-6c29-4db5-dd27-bcce60217b30"
      },
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "torch.Size([3996])\n",
            "torch.Size([4851])\n",
            "torch.Size([4868])\n",
            "torch.Size([4992])\n",
            "torch.Size([4994])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Build GLI Format and Store Dataset"
      ],
      "metadata": {
        "id": "_DXP1MiUmKIA"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# output json files\n",
        "import json\n",
        "\n",
        "# 4 values of power expo\n",
        "for i in range(len(deg_arr)):\n",
        "  # to build additional datasets\n",
        "  # 5 independent datasets\n",
        "  for j in range(repeat):\n",
        "\n",
        "    # for metadata.json files\n",
        "    dataset_str = \"gw_deg_\"+str(i)+\"_\"+str(j)+\".npz\"\n",
        "    print(\"dataset_str: \", dataset_str)\n",
        "    metadata_json = {}\n",
        "    metadata_json[\"description\"] = \"Random Generated Dataset.\"\n",
        "    metadata_json[\"data\"] = {}\n",
        "\n",
        "    metadata_json[\"data\"][\"Node\"] = {}\n",
        "    metadata_json[\"data\"][\"Edge\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"] = {}\n",
        "    metadata_json[\"data\"][\"Node\"][\"NodeFeature\"] = {\"description\": \"Node features of random generated dataset, real-valued vectors.\",\n",
        "                            \"type\": \"double\",\n",
        "                            \"format\": \"Tensor\",\n",
        "                            \"file\": dataset_str,\n",
        "                            \"key\": \"node_feats\"}\n",
        "\n",
        "\n",
        "    metadata_json[\"data\"][\"Node\"][\"NodeLabel\"] = {\"description\": \"Node labels of random generated dataset, int ranged from 1 to 4.\",\n",
        "                                          \"type\": \"int\",\n",
        "                                          \"format\": \"Tensor\",\n",
        "                                          \"file\": dataset_str,\n",
        "                                          \"key\": \"node_class\"\n",
        "                                          }\n",
        "\n",
        "    metadata_json[\"data\"][\"Edge\"][\"_Edge\"] = {\"file\":dataset_str, \"key\":\"edge\"}\n",
        "\n",
        "    metadata_json[\"data\"][\"Graph\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"][\"file\"] = dataset_str\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"][\"key\"] = \"node_list\"\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"][\"file\"] = dataset_str\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"][\"key\"] = \"edge_list\"\n",
        "\n",
        "\n",
        "    metadata_json[\"citation\"]=\"@inproceedings{10.1145/3534678.3539203,\\\n",
        "    author = {Palowitch, John and Tsitsulin, Anton and Mayer, Brandon and Perozzi, Bryan},\\\n",
        "    title = {GraphWorld: Fake Graphs Bring Real Insights for GNNs},\\\n",
        "    year = {2022},\\\n",
        "    isbn = {9781450393850},\\\n",
        "    publisher = {Association for Computing Machinery},\\\n",
        "    url = {https://doi.org/10.1145/3534678.3539203},\\\n",
        "    doi = {10.1145/3534678.3539203},\\\n",
        "    booktitle = {Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},\\\n",
        "    pages = {3691–3701},\\\n",
        "    series = {KDD '22}}\"\n",
        "\n",
        "    metadata_json[\"is_heterogeneous\"] = False\n",
        "\n",
        "\n",
        "    # for task_node_classification_1.json file\n",
        "    task_json = {}\n",
        "    task_str = \"gw_deg_\"+str(i)+\"_\"+str(j)+\"_task\"+\".npz\"\n",
        "    print(\"task_str: \", task_str)\n",
        "    task_json[\"description\"] = \"Node classification on random generated dataset.\"\n",
        "    task_json[\"type\"] = \"NodeClassification\"\n",
        "    task_json[\"feature\"] = [\"Node/NodeFeature\"]\n",
        "    task_json[\"target\"] = \"Node/NodeLabel\"\n",
        "    task_json[\"num_classes\"] = NUM_CLUSTERS\n",
        "    task_json[\"train_set\"] = {}\n",
        "    task_json[\"train_set\"][\"file\"] = task_str\n",
        "    task_json[\"train_set\"][\"key\"] = \"train\"\n",
        "\n",
        "    task_json[\"val_set\"] = {}\n",
        "    task_json[\"val_set\"][\"file\"] = task_str\n",
        "    task_json[\"val_set\"][\"key\"] = \"val\"\n",
        "\n",
        "    task_json[\"test_set\"] = {}\n",
        "    task_json[\"test_set\"][\"file\"] = task_str\n",
        "    task_json[\"test_set\"][\"key\"] = \"test\"\n",
        "\n",
        "    # for urls.json files\n",
        "    url_json = {}\n",
        "    url_json[dataset_str] = \"\"\n",
        "    url_json[task_str] = \"\"\n",
        "\n",
        "    # output to each directory\n",
        "    dir_str = \"gw_deg_\"+str(i)+\"_\"+str(j)\n",
        "    with open(parent_dir+dir_str+'/metadata.json', 'w') as fp:\n",
        "      json.dump(metadata_json, fp, indent=4)\n",
        "    with open(parent_dir+dir_str+'/task_node_classification_1.json', 'w') as fp:\n",
        "      json.dump(task_json, fp, indent=4)\n",
        "    with open(parent_dir+dir_str+'/urls.json', 'w') as fp:\n",
        "      json.dump(url_json, fp, indent=4)\n",
        " "
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "87GlZYCvlwG_",
        "outputId": "92c25a93-5334-47cf-e704-758d2e2d4e12"
      },
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "dataset_str:  gw_deg_0_0.npz\n",
            "task_str:  gw_deg_0_0_task.npz\n",
            "dataset_str:  gw_deg_1_0.npz\n",
            "task_str:  gw_deg_1_0_task.npz\n",
            "dataset_str:  gw_deg_2_0.npz\n",
            "task_str:  gw_deg_2_0_task.npz\n",
            "dataset_str:  gw_deg_3_0.npz\n",
            "task_str:  gw_deg_3_0_task.npz\n",
            "dataset_str:  gw_deg_4_0.npz\n",
            "task_str:  gw_deg_4_0_task.npz\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!zip -r /content/all_data_deg.zip /content/gli/datasets/gw_deg_*\n",
        "from google.colab import files\n",
        "files.download(\"/content/all_data_deg.zip\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 625
        },
        "id": "ZFMUC0IBmsmk",
        "outputId": "6d47ac72-ed1a-467a-ca21-2c0b0b71c159"
      },
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "updating: content/gli/datasets/gw_deg_0_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_deg_0_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_deg_0_0/gw_gini_0_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_deg_0_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_deg_0_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_deg_0_0/gw_deg_0_0.npz (deflated 59%)\n",
            "updating: content/gli/datasets/gw_deg_1_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_deg_1_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_deg_1_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_deg_1_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_deg_1_0/gw_deg_1_0.npz (deflated 68%)\n",
            "updating: content/gli/datasets/gw_deg_1_0/gw_gini_1_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_deg_2_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_deg_2_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_deg_2_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_deg_2_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_deg_2_0/gw_deg_2_0.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_deg_2_0/gw_gini_2_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_deg_3_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_deg_3_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_deg_3_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_deg_3_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_deg_3_0/gw_gini_3_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_deg_3_0/gw_deg_3_0.npz (deflated 73%)\n",
            "updating: content/gli/datasets/gw_deg_4_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_deg_4_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_deg_4_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_deg_4_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_deg_4_0/gw_gini_4_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_deg_4_0/gw_deg_4_0.npz (deflated 77%)\n",
            "  adding: content/gli/datasets/gw_deg_0_0/gw_deg_0_0_task.npz (deflated 70%)\n",
            "  adding: content/gli/datasets/gw_deg_1_0/gw_deg_1_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_deg_2_0/gw_deg_2_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_deg_3_0/gw_deg_3_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_deg_4_0/gw_deg_4_0_task.npz (deflated 69%)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_c3af80c2-0757-4745-a035-30b8431893ab\", \"all_data_deg.zip\", 2153374)"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Varying Edge Homo"
      ],
      "metadata": {
        "id": "amYWMX3clvAL"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import random\n",
        "import pandas as pd\n",
        "import networkx as nx\n",
        "import os\n",
        "import numpy as np\n",
        "import graph_tool as gt\n",
        "import torch\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "parent_dir = \"/content/gli/datasets/\"\n",
        "partition = {\"train_FOLD\":0.6, \"val_FOLD\":0.2, \"test_FOLD\":0.2}\n",
        "num_of_split = 1\n",
        "# transform gini coefficient \n",
        "\n",
        "# set GraphWorld parameters\n",
        "NVERTEX = 5000\n",
        "FEATURE_CENTER_DISTANCE = 0.05\n",
        "# P2Q = 4.0\n",
        "CLUSTER_SIZE_SLOPE = 0.0\n",
        "\n",
        "avg_degree = 20\n",
        "feature_dim = 16\n",
        "edge_center_distance = 2.0\n",
        "edge_feature_dim = 2\n",
        "feature_cluster_variance = 0.1\n",
        "# POWER_EXPONENT = 2\n",
        "\n",
        "\n",
        "NUM_CLUSTERS = 4\n",
        "repeat = 1\n",
        "POWER_EXPONENT = 2.0\n",
        "\n",
        "\n",
        "\n",
        "p2q_arr = [1,2,3,5,10]\n",
        "for i in range(len(p2q_arr)):\n",
        "  \n",
        "\n",
        "  P2Q = p2q_arr[i]\n",
        "  # to build additional datasets\n",
        "  \n",
        "  for j in range(repeat):\n",
        "\n",
        "    # create directory \n",
        "    directory = \"gw_homo_\"+str(i)+\"_\"+str(j)\n",
        "    # Path\n",
        "    # if we want to train, then add this line\n",
        "    directory = os.path.join(parent_dir, directory)\n",
        "    if not os.path.exists(directory):\n",
        "      os.mkdir(directory)\n",
        "\n",
        "    pi = graph_world.generators.sbm_simulator.MakePi(num_communities=NUM_CLUSTERS, community_size_slope = CLUSTER_SIZE_SLOPE)\n",
        "    prop_mat = graph_world.generators.sbm_simulator.MakePropMat(num_communities=NUM_CLUSTERS, p_to_q_ratio=P2Q)\n",
        "    out_degrees = graph_world.generators.sbm_simulator.MakeDegrees(POWER_EXPONENT, 1, NVERTEX)\n",
        "\n",
        "    sampler_out = graph_world.generators.sbm_simulator.GenerateStochasticBlockModelWithFeatures(\n",
        "      num_vertices=NVERTEX,\n",
        "      num_edges=NVERTEX*avg_degree,\n",
        "      pi=pi,\n",
        "      prop_mat=prop_mat,\n",
        "      out_degs=out_degrees,\n",
        "      feature_center_distance=FEATURE_CENTER_DISTANCE,\n",
        "      feature_dim=feature_dim,\n",
        "      num_feature_groups=NUM_CLUSTERS,\n",
        "      feature_group_match_type=MatchType.GROUPED,\n",
        "      feature_cluster_variance=feature_cluster_variance,\n",
        "      edge_feature_dim=edge_feature_dim,\n",
        "      edge_center_distance=edge_center_distance,\n",
        "      edge_cluster_variance=1,\n",
        "      normalize_features=True)\n",
        "    \n",
        "    graph = sampler_out.graph\n",
        "    memberships = sampler_out.graph_memberships\n",
        "    feature_memberships = sampler_out.feature_memberships\n",
        "    features = sampler_out.node_features\n",
        "    degrees = graph.get_out_degrees(graph.get_vertices())\n",
        "    num_removed = 0\n",
        "    for z, d in enumerate(degrees):\n",
        "      if d == 0:\n",
        "          graph.remove_vertex(z - num_removed)\n",
        "          memberships = np.delete(memberships, [z - num_removed])\n",
        "          features = np.delete(features, [z - num_removed], axis=0)\n",
        "          num_removed += 1\n",
        "    # gt.remove_self_loops(graph)\n",
        "\n",
        "\n",
        "    # for printing out coreness gini value\n",
        "    # out = {}                                    \n",
        "    # nx_graph = nx.Graph()\n",
        "    # edge_list = [(int(e.source()), int(e.target())) for e in graph.edges()]\n",
        "    # nx_graph.add_edges_from(edge_list)\n",
        "    \n",
        "    # degree_sequence = [d for n, d in nx_graph.degree()]\n",
        "    # degree_sequence = np.sort(degree_sequence)\n",
        "    # # print(degree_sequence)\n",
        "    # # fit = powerlaw.Fit(degree_sequence, verbose=False)\n",
        "    # # print(fit.power_law.alpha)\n",
        "\n",
        "    # out['metrics'] = graph_metrics_nx(nx_graph)\n",
        "    # print(\"psu_diameter: \", _diameter(nx_graph))\n",
        "\n",
        "\n",
        "    out = {}\n",
        "    out['metrics'] = NodeLabelMetrics(graph,\n",
        "                                  memberships,\n",
        "                                  features)\n",
        "\n",
        "    print(\"edge_homogeneity: \", out['metrics'][\"edge_homogeneity\"])\n",
        "    # print(\"avg_in_feature_angular_distance: \", out['metrics'][\"avg_in_feature_angular_distance\"])\n",
        "    # print(\"feature_angular_snr: \", out['metrics'][\"feature_angular_snr\"])\n",
        "\n",
        "    # for dataset.npz\n",
        "    output = {}\n",
        "    output[\"node_feats\"] = torch.from_numpy(features.astype(\"float32\"))\n",
        "    output[\"node_class\"] = torch.from_numpy(memberships)\n",
        "\n",
        "    output[\"edge\"] = torch.from_numpy(graph.get_edges())\n",
        "    output[\"edge_list\"] = torch.from_numpy(np.ones(len(list(graph.edges()))))\n",
        "    output[\"node_list\"] = torch.from_numpy(np.ones(len(list(graph.vertices()))))\n",
        "    print(output[\"node_class\"].shape)\n",
        "    np.savez(\"/content/gli/datasets/gw_homo_\"+str(i)+\"_\"+str(j)+\"/gw_homo_\"+str(i)+\"_\"+str(j)+\".npz\", **output, allow_pickle=True)\n",
        "\n",
        "\n",
        "    # for dataset_task.npz\n",
        "    output_task = {}\n",
        "    node_ids = list(range(len(list(graph.vertices()))))\n",
        "    train_str = \"train_\"\n",
        "    val_str = \"val_\"\n",
        "    test_str = 'test_'\n",
        "    for z in range(num_of_split):\n",
        "      random.shuffle(node_ids)\n",
        "      train_len = int(len(node_ids) * partition[\"train_FOLD\"])\n",
        "      val_len = int(len(node_ids) * partition[\"val_FOLD\"])\n",
        "      test_len = int(len(node_ids) * partition[\"test_FOLD\"])\n",
        "      output_task[\"train\"] = node_ids[:train_len]\n",
        "      output_task[\"val\"] = node_ids[train_len:train_len+val_len]\n",
        "      output_task[\"test\"] = node_ids[train_len+val_len:]\n",
        "\n",
        "      np.savez(\"/content/gli/datasets/gw_homo_\"+str(i)+\"_\"+str(j)+\"/gw_homo_\"+str(i)+\"_\"+str(j)+\"_task.npz\", **output_task, allow_pickle=True)\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fkaY5ojamDfL",
        "outputId": "b55e7019-90cd-407a-916b-db8609ee28d1"
      },
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "edge_homogeneity:  0.2510181116707748\n",
            "torch.Size([4872])\n",
            "edge_homogeneity:  0.376492808368444\n",
            "torch.Size([4703])\n",
            "edge_homogeneity:  0.4584767897423721\n",
            "torch.Size([4810])\n",
            "edge_homogeneity:  0.5668411330049261\n",
            "torch.Size([4811])\n",
            "edge_homogeneity:  0.7035974172389055\n",
            "torch.Size([4746])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Build GLI Format and Store Dataset"
      ],
      "metadata": {
        "id": "I4jiaB_5no4b"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# output json files\n",
        "import json\n",
        "\n",
        "# 4 values of power expo\n",
        "for i in range(len(p2q_arr)):\n",
        "  # to build additional datasets\n",
        "  # 5 independent datasets\n",
        "  for j in range(repeat):\n",
        "\n",
        "    # for metadata.json files\n",
        "    dataset_str = \"gw_homo_\"+str(i)+\"_\"+str(j)+\".npz\"\n",
        "    print(\"dataset_str: \", dataset_str)\n",
        "    metadata_json = {}\n",
        "    metadata_json[\"description\"] = \"Random Generated Dataset.\"\n",
        "    metadata_json[\"data\"] = {}\n",
        "\n",
        "    metadata_json[\"data\"][\"Node\"] = {}\n",
        "    metadata_json[\"data\"][\"Edge\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"] = {}\n",
        "    metadata_json[\"data\"][\"Node\"][\"NodeFeature\"] = {\"description\": \"Node features of random generated dataset, real-valued vectors.\",\n",
        "                            \"type\": \"double\",\n",
        "                            \"format\": \"Tensor\",\n",
        "                            \"file\": dataset_str,\n",
        "                            \"key\": \"node_feats\"}\n",
        "\n",
        "\n",
        "    metadata_json[\"data\"][\"Node\"][\"NodeLabel\"] = {\"description\": \"Node labels of random generated dataset, int ranged from 1 to 4.\",\n",
        "                                          \"type\": \"int\",\n",
        "                                          \"format\": \"Tensor\",\n",
        "                                          \"file\": dataset_str,\n",
        "                                          \"key\": \"node_class\"\n",
        "                                          }\n",
        "\n",
        "    metadata_json[\"data\"][\"Edge\"][\"_Edge\"] = {\"file\":dataset_str, \"key\":\"edge\"}\n",
        "\n",
        "    metadata_json[\"data\"][\"Graph\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"][\"file\"] = dataset_str\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_NodeList\"][\"key\"] = \"node_list\"\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"] = {}\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"][\"file\"] = dataset_str\n",
        "    metadata_json[\"data\"][\"Graph\"][\"_EdgeList\"][\"key\"] = \"edge_list\"\n",
        "\n",
        "\n",
        "    metadata_json[\"citation\"]=\"@inproceedings{10.1145/3534678.3539203,\\\n",
        "    author = {Palowitch, John and Tsitsulin, Anton and Mayer, Brandon and Perozzi, Bryan},\\\n",
        "    title = {GraphWorld: Fake Graphs Bring Real Insights for GNNs},\\\n",
        "    year = {2022},\\\n",
        "    isbn = {9781450393850},\\\n",
        "    publisher = {Association for Computing Machinery},\\\n",
        "    url = {https://doi.org/10.1145/3534678.3539203},\\\n",
        "    doi = {10.1145/3534678.3539203},\\\n",
        "    booktitle = {Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},\\\n",
        "    pages = {3691–3701},\\\n",
        "    series = {KDD '22}}\"\n",
        "\n",
        "    metadata_json[\"is_heterogeneous\"] = False\n",
        "\n",
        "\n",
        "    # for task_node_classification_1.json file\n",
        "    task_json = {}\n",
        "    task_str = \"gw_homo_\"+str(i)+\"_\"+str(j)+\"_task\"+\".npz\"\n",
        "    print(\"task_str: \", task_str)\n",
        "    task_json[\"description\"] = \"Node classification on random generated dataset.\"\n",
        "    task_json[\"type\"] = \"NodeClassification\"\n",
        "    task_json[\"feature\"] = [\"Node/NodeFeature\"]\n",
        "    task_json[\"target\"] = \"Node/NodeLabel\"\n",
        "    task_json[\"num_classes\"] = NUM_CLUSTERS\n",
        "    task_json[\"train_set\"] = {}\n",
        "    task_json[\"train_set\"][\"file\"] = task_str\n",
        "    task_json[\"train_set\"][\"key\"] = \"train\"\n",
        "\n",
        "    task_json[\"val_set\"] = {}\n",
        "    task_json[\"val_set\"][\"file\"] = task_str\n",
        "    task_json[\"val_set\"][\"key\"] = \"val\"\n",
        "\n",
        "    task_json[\"test_set\"] = {}\n",
        "    task_json[\"test_set\"][\"file\"] = task_str\n",
        "    task_json[\"test_set\"][\"key\"] = \"test\"\n",
        "\n",
        "    # for urls.json files\n",
        "    url_json = {}\n",
        "    url_json[dataset_str] = \"\"\n",
        "    url_json[task_str] = \"\"\n",
        "\n",
        "    # output to each directory\n",
        "    dir_str = \"gw_homo_\"+str(i)+\"_\"+str(j)\n",
        "    with open(parent_dir+dir_str+'/metadata.json', 'w') as fp:\n",
        "      json.dump(metadata_json, fp, indent=4)\n",
        "    with open(parent_dir+dir_str+'/task_node_classification_1.json', 'w') as fp:\n",
        "      json.dump(task_json, fp, indent=4)\n",
        "    with open(parent_dir+dir_str+'/urls.json', 'w') as fp:\n",
        "      json.dump(url_json, fp, indent=4)\n",
        " "
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fs1q4Sixnvek",
        "outputId": "1d64fb16-61b5-42e1-e74a-021b25fd6e83"
      },
      "execution_count": 31,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "dataset_str:  gw_homo_0_0.npz\n",
            "task_str:  gw_homo_0_0_task.npz\n",
            "dataset_str:  gw_homo_1_0.npz\n",
            "task_str:  gw_homo_1_0_task.npz\n",
            "dataset_str:  gw_homo_2_0.npz\n",
            "task_str:  gw_homo_2_0_task.npz\n",
            "dataset_str:  gw_homo_3_0.npz\n",
            "task_str:  gw_homo_3_0_task.npz\n",
            "dataset_str:  gw_homo_4_0.npz\n",
            "task_str:  gw_homo_4_0_task.npz\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!zip -r /content/all_data_homo.zip /content/gli/datasets/gw_homo_*\n",
        "from google.colab import files\n",
        "files.download(\"/content/all_data_homo.zip\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 538
        },
        "id": "Tk7DsE7Kn4y_",
        "outputId": "e14ac20b-9b13-4a50-a7c5-e0264ad38e7b"
      },
      "execution_count": 32,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "updating: content/gli/datasets/gw_homo_0_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_homo_0_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_homo_0_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_homo_0_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_homo_0_0/gw_homo_0_0.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_homo_0_0/gw_homo_0_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_homo_1_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_homo_1_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_homo_1_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_homo_1_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_homo_1_0/gw_homo_1_0.npz (deflated 68%)\n",
            "updating: content/gli/datasets/gw_homo_1_0/gw_homo_1_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_homo_2_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_homo_2_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_homo_2_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_homo_2_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_homo_2_0/gw_homo_2_0.npz (deflated 68%)\n",
            "updating: content/gli/datasets/gw_homo_2_0/gw_homo_2_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_homo_3_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_homo_3_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_homo_3_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_homo_3_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_homo_3_0/gw_homo_3_0.npz (deflated 67%)\n",
            "updating: content/gli/datasets/gw_homo_3_0/gw_homo_3_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_homo_4_0/ (stored 0%)\n",
            "updating: content/gli/datasets/gw_homo_4_0/metadata.json (deflated 63%)\n",
            "updating: content/gli/datasets/gw_homo_4_0/task_node_classification_1.json (deflated 59%)\n",
            "updating: content/gli/datasets/gw_homo_4_0/urls.json (deflated 36%)\n",
            "updating: content/gli/datasets/gw_homo_4_0/gw_homo_4_0_task.npz (deflated 69%)\n",
            "updating: content/gli/datasets/gw_homo_4_0/gw_homo_4_0.npz (deflated 66%)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_4430fe72-35fe-4ce4-b21f-4376e32b5982\", \"all_data_homo.zip\", 1978629)"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Varying Feature SNR"
      ],
      "metadata": {
        "id": "36V_xHeWpgzE"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import random\n",
        "import pandas as pd\n",
        "import networkx as nx\n",
        "import os\n",
        "import numpy as np\n",
        "import graph_tool as gt\n",
        "import torch\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Root directory; the generation loop creates one gw_var_<i>_<j> sub-directory per dataset in it.\n",
        "parent_dir = \"/content/gli/datasets/\"\n",
        "# Fractions of the node ids used for the train / validation / test splits.\n",
        "partition = {\"train_FOLD\":0.6, \"val_FOLD\":0.2, \"test_FOLD\":0.2}\n",
        "# Number of random splits drawn per generated dataset.\n",
        "num_of_split = 1\n",
        "# This section sweeps the feature cluster variance (see var_arr at the end of this block).\n",
        "\n",
        "# set GraphWorld parameters\n",
        "# NVERTEX is passed as num_vertices to the SBM sampler and as the last argument of MakeDegrees.\n",
        "NVERTEX = 5000\n",
        "# Passed as feature_center_distance to the SBM sampler.\n",
        "FEATURE_CENTER_DISTANCE = 0.05\n",
        "# Passed as p_to_q_ratio to MakePropMat.\n",
        "P2Q = 2.0\n",
        "# Passed as community_size_slope to MakePi.\n",
        "CLUSTER_SIZE_SLOPE = 0.0\n",
        "\n",
        "# The sampler is asked for num_edges = NVERTEX * avg_degree.\n",
        "avg_degree = 20\n",
        "# Node / edge feature settings, forwarded unchanged to the SBM sampler.\n",
        "feature_dim = 16\n",
        "edge_center_distance = 2.0\n",
        "edge_feature_dim = 2\n",
        "# feature_cluster_variance is not fixed here: it takes each value of var_arr in turn.\n",
        "# POWER_EXPONENT is assigned a few lines below.\n",
        "\n",
        "\n",
        "# Used both as num_communities (MakePi / MakePropMat) and as num_feature_groups.\n",
        "NUM_CLUSTERS = 4\n",
        "# Number of datasets generated per variance value.\n",
        "repeat = 1\n",
        "# First argument of MakeDegrees.\n",
        "POWER_EXPONENT = 2.0\n",
        "\n",
        "\n",
        "\n",
        "# Values of feature_cluster_variance to sweep; index i of this list appears in the dataset name gw_var_<i>_<j>.\n",
        "var_arr = [0.1, 0.2, 0.5, 1, 2]\n",
        "# Sweep the node-feature cluster variance: for every value in var_arr, generate\n",
        "# `repeat` GraphWorld SBM graphs and store each one, together with a random\n",
        "# train/val/test split, under parent_dir/gw_var_<variance index>_<repeat index>/.\n",
        "# The loop iterates over var_arr itself, so it needs no variable from another section.\n",
        "for i, feature_cluster_variance in enumerate(var_arr):\n",
        "\n",
        "  # to build additional datasets\n",
        "  for j in range(repeat):\n",
        "\n",
        "    # One directory per dataset; its name is also the stem of the .npz files.\n",
        "    dataset_name = \"gw_var_\"+str(i)+\"_\"+str(j)\n",
        "    directory = os.path.join(parent_dir, dataset_name)\n",
        "    os.makedirs(directory, exist_ok=True)\n",
        "\n",
        "    pi = graph_world.generators.sbm_simulator.MakePi(num_communities=NUM_CLUSTERS, community_size_slope = CLUSTER_SIZE_SLOPE)\n",
        "    prop_mat = graph_world.generators.sbm_simulator.MakePropMat(num_communities=NUM_CLUSTERS, p_to_q_ratio=P2Q)\n",
        "    out_degrees = graph_world.generators.sbm_simulator.MakeDegrees(POWER_EXPONENT, 1, NVERTEX)\n",
        "\n",
        "    sampler_out = graph_world.generators.sbm_simulator.GenerateStochasticBlockModelWithFeatures(\n",
        "      num_vertices=NVERTEX,\n",
        "      num_edges=NVERTEX*avg_degree,\n",
        "      pi=pi,\n",
        "      prop_mat=prop_mat,\n",
        "      out_degs=out_degrees,\n",
        "      feature_center_distance=FEATURE_CENTER_DISTANCE,\n",
        "      feature_dim=feature_dim,\n",
        "      num_feature_groups=NUM_CLUSTERS,\n",
        "      feature_group_match_type=MatchType.GROUPED,\n",
        "      feature_cluster_variance=feature_cluster_variance,\n",
        "      edge_feature_dim=edge_feature_dim,\n",
        "      edge_center_distance=edge_center_distance,\n",
        "      edge_cluster_variance=1,\n",
        "      normalize_features=True)\n",
        "\n",
        "    graph = sampler_out.graph\n",
        "    memberships = sampler_out.graph_memberships\n",
        "    features = sampler_out.node_features\n",
        "\n",
        "    # Drop vertices with out-degree 0 and the matching label / feature rows.\n",
        "    # `degrees` is computed once, before any removal. The z - num_removed offset\n",
        "    # assumes graph.remove_vertex() renumbers the vertices that follow the removed\n",
        "    # one (graph-tool's default, order-preserving removal) -- TODO confirm.\n",
        "    degrees = graph.get_out_degrees(graph.get_vertices())\n",
        "    num_removed = 0\n",
        "    for z, d in enumerate(degrees):\n",
        "      if d == 0:\n",
        "        graph.remove_vertex(z - num_removed)\n",
        "        memberships = np.delete(memberships, [z - num_removed])\n",
        "        features = np.delete(features, [z - num_removed], axis=0)\n",
        "        num_removed += 1\n",
        "\n",
        "    out = {}\n",
        "    out['metrics'] = NodeLabelMetrics(graph,\n",
        "                                  memberships,\n",
        "                                  features)\n",
        "\n",
        "    print(\"avg_in_feature_angular_distance: \", out['metrics'][\"avg_in_feature_angular_distance\"])\n",
        "    print(\"feature_angular_snr: \", out['metrics'][\"feature_angular_snr\"])\n",
        "\n",
        "    # for dataset.npz\n",
        "    output = {}\n",
        "    output[\"node_feats\"] = torch.from_numpy(features.astype(\"float32\"))\n",
        "    output[\"node_class\"] = torch.from_numpy(memberships)\n",
        "\n",
        "    output[\"edge\"] = torch.from_numpy(graph.get_edges())\n",
        "    # All-ones vectors with one entry per edge / per vertex.\n",
        "    output[\"edge_list\"] = torch.from_numpy(np.ones(graph.num_edges()))\n",
        "    output[\"node_list\"] = torch.from_numpy(np.ones(graph.num_vertices()))\n",
        "    print(output[\"node_class\"].shape)\n",
        "    # np.savez stores every unrecognised keyword argument as an array, so only the\n",
        "    # real arrays are passed (no allow_pickle keyword, which NumPy versions without\n",
        "    # that parameter would write into the archive as an extra entry).\n",
        "    np.savez(os.path.join(directory, dataset_name+\".npz\"), **output)\n",
        "\n",
        "\n",
        "    # for dataset_task.npz\n",
        "    output_task = {}\n",
        "    node_ids = list(range(graph.num_vertices()))\n",
        "    # NOTE(review): every split is written to the same file, so with\n",
        "    # num_of_split > 1 only the last split is kept.\n",
        "    for z in range(num_of_split):\n",
        "      random.shuffle(node_ids)\n",
        "      train_len = int(len(node_ids) * partition[\"train_FOLD\"])\n",
        "      val_len = int(len(node_ids) * partition[\"val_FOLD\"])\n",
        "      output_task[\"train\"] = node_ids[:train_len]\n",
        "      output_task[\"val\"] = node_ids[train_len:train_len+val_len]\n",
        "      # The test split takes every remaining node id.\n",
        "      output_task[\"test\"] = node_ids[train_len+val_len:]\n",
        "\n",
        "      np.savez(os.path.join(directory, dataset_name+\"_task.npz\"), **output_task)\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "iJqDmU8cpmAN",
        "outputId": "86788ae8-533e-40e0-9e07-c206c6f5733f"
      },
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "avg_in_feature_angular_distance:  0.6029061441157513\n",
            "feature_angular_snr:  1.1667956881756547\n",
            "torch.Size([4675])\n",
            "avg_in_feature_angular_distance:  0.5544326045810665\n",
            "feature_angular_snr:  1.0924070771527385\n",
            "torch.Size([4835])\n",
            "avg_in_feature_angular_distance:  0.5318325634886577\n",
            "feature_angular_snr:  1.0439227233726742\n",
            "torch.Size([4761])\n",
            "avg_in_feature_angular_distance:  0.5209335711680749\n",
            "feature_angular_snr:  1.030478392690934\n",
            "torch.Size([4742])\n",
            "avg_in_feature_angular_distance:  0.5064364307804582\n",
            "feature_angular_snr:  1.009111061620942\n",
            "torch.Size([4680])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Build GLI Format and Store Dataset"
      ],
      "metadata": {
        "id": "zQpyy8RNoT1B"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# output json files\n",
        "import json\n",
        "\n",
        "# Emit the GLI description files (metadata.json, task_node_classification_1.json\n",
        "# and urls.json) into the directory of every gw_var_<i>_<j> dataset.\n",
        "for i in range(len(var_arr)):\n",
        "  # one pass per repeated dataset of this variance value\n",
        "  for j in range(repeat):\n",
        "    dir_str = f\"gw_var_{i}_{j}\"\n",
        "\n",
        "    # metadata.json: maps each node / edge / graph attribute to a key of the data .npz\n",
        "    dataset_str = dir_str + \".npz\"\n",
        "    print(\"dataset_str: \", dataset_str)\n",
        "\n",
        "    # BibTeX entry stored as one line; each field is preceded by four spaces.\n",
        "    citation = (\n",
        "        \"@inproceedings{10.1145/3534678.3539203,\"\n",
        "        \"    author = {Palowitch, John and Tsitsulin, Anton and Mayer, Brandon and Perozzi, Bryan},\"\n",
        "        \"    title = {GraphWorld: Fake Graphs Bring Real Insights for GNNs},\"\n",
        "        \"    year = {2022},\"\n",
        "        \"    isbn = {9781450393850},\"\n",
        "        \"    publisher = {Association for Computing Machinery},\"\n",
        "        \"    url = {https://doi.org/10.1145/3534678.3539203},\"\n",
        "        \"    doi = {10.1145/3534678.3539203},\"\n",
        "        \"    booktitle = {Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},\"\n",
        "        \"    pages = {3691–3701},\"\n",
        "        \"    series = {KDD '22}}\"\n",
        "    )\n",
        "\n",
        "    metadata_json = {\n",
        "        \"description\": \"Random Generated Dataset.\",\n",
        "        \"data\": {\n",
        "            \"Node\": {\n",
        "                \"NodeFeature\": {\n",
        "                    \"description\": \"Node features of random generated dataset, real-valued vectors.\",\n",
        "                    \"type\": \"double\",\n",
        "                    \"format\": \"Tensor\",\n",
        "                    \"file\": dataset_str,\n",
        "                    \"key\": \"node_feats\",\n",
        "                },\n",
        "                \"NodeLabel\": {\n",
        "                    \"description\": \"Node labels of random generated dataset, int ranged from 1 to 4.\",\n",
        "                    \"type\": \"int\",\n",
        "                    \"format\": \"Tensor\",\n",
        "                    \"file\": dataset_str,\n",
        "                    \"key\": \"node_class\",\n",
        "                },\n",
        "            },\n",
        "            \"Edge\": {\n",
        "                \"_Edge\": {\"file\": dataset_str, \"key\": \"edge\"},\n",
        "            },\n",
        "            \"Graph\": {\n",
        "                \"_NodeList\": {\"file\": dataset_str, \"key\": \"node_list\"},\n",
        "                \"_EdgeList\": {\"file\": dataset_str, \"key\": \"edge_list\"},\n",
        "            },\n",
        "        },\n",
        "        \"citation\": citation,\n",
        "        \"is_heterogeneous\": False,\n",
        "    }\n",
        "\n",
        "    # task_node_classification_1.json: task definition pointing at the split .npz\n",
        "    task_str = dir_str + \"_task.npz\"\n",
        "    print(\"task_str: \", task_str)\n",
        "    task_json = {\n",
        "        \"description\": \"Node classification on random generated dataset.\",\n",
        "        \"type\": \"NodeClassification\",\n",
        "        \"feature\": [\"Node/NodeFeature\"],\n",
        "        \"target\": \"Node/NodeLabel\",\n",
        "        \"num_classes\": NUM_CLUSTERS,\n",
        "        \"train_set\": {\"file\": task_str, \"key\": \"train\"},\n",
        "        \"val_set\": {\"file\": task_str, \"key\": \"val\"},\n",
        "        \"test_set\": {\"file\": task_str, \"key\": \"test\"},\n",
        "    }\n",
        "\n",
        "    # urls.json: one (empty) URL entry per .npz file\n",
        "    url_json = {dataset_str: \"\", task_str: \"\"}\n",
        "\n",
        "    # write the three files into the dataset's directory\n",
        "    for file_name, payload in (\n",
        "        (\"metadata.json\", metadata_json),\n",
        "        (\"task_node_classification_1.json\", task_json),\n",
        "        (\"urls.json\", url_json),\n",
        "    ):\n",
        "      with open(parent_dir+dir_str+'/'+file_name, 'w') as fp:\n",
        "        json.dump(payload, fp, indent=4)\n",
        " "
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "yil46F9WoQoM",
        "outputId": "dfc1ae18-c082-45b4-dee5-4470f45dcb6d"
      },
      "execution_count": 30,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "dataset_str:  gw_var_0_0.npz\n",
            "task_str:  gw_var_0_0_task.npz\n",
            "dataset_str:  gw_var_1_0.npz\n",
            "task_str:  gw_var_1_0_task.npz\n",
            "dataset_str:  gw_var_2_0.npz\n",
            "task_str:  gw_var_2_0_task.npz\n",
            "dataset_str:  gw_var_3_0.npz\n",
            "task_str:  gw_var_3_0_task.npz\n",
            "dataset_str:  gw_var_4_0.npz\n",
            "task_str:  gw_var_4_0_task.npz\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Recursively add every gw_var_* dataset directory to /content/all_data_var.zip.\n",
        "!zip -r /content/all_data_var.zip /content/gli/datasets/gw_var_*\n",
        "# google.colab is only importable inside a Colab runtime.\n",
        "from google.colab import files\n",
        "# Hand the archive to the Colab download helper.\n",
        "files.download(\"/content/all_data_var.zip\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 538
        },
        "id": "PUn0FhObokRj",
        "outputId": "028a6508-298e-479b-9bb2-7c8e2ef7b33c"
      },
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  adding: content/gli/datasets/gw_var_0_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_var_0_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_var_0_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_var_0_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_var_0_0/gw_var_0_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_var_0_0/gw_var_0_0.npz (deflated 66%)\n",
            "  adding: content/gli/datasets/gw_var_1_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_var_1_0/gw_var_1_0.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_var_1_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_var_1_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_var_1_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_var_1_0/gw_var_1_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_var_2_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_var_2_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_var_2_0/gw_var_2_0.npz (deflated 67%)\n",
            "  adding: content/gli/datasets/gw_var_2_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_var_2_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_var_2_0/gw_var_2_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_var_3_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_var_3_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_var_3_0/gw_var_3_0.npz (deflated 66%)\n",
            "  adding: content/gli/datasets/gw_var_3_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_var_3_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_var_3_0/gw_var_3_0_task.npz (deflated 69%)\n",
            "  adding: content/gli/datasets/gw_var_4_0/ (stored 0%)\n",
            "  adding: content/gli/datasets/gw_var_4_0/metadata.json (deflated 63%)\n",
            "  adding: content/gli/datasets/gw_var_4_0/task_node_classification_1.json (deflated 59%)\n",
            "  adding: content/gli/datasets/gw_var_4_0/urls.json (deflated 36%)\n",
            "  adding: content/gli/datasets/gw_var_4_0/gw_var_4_0.npz (deflated 68%)\n",
            "  adding: content/gli/datasets/gw_var_4_0/gw_var_4_0_task.npz (deflated 69%)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_93754b9e-cf25-4f1a-92f3-a7506c32c59c\", \"all_data_var.zip\", 1947617)"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Train Models via GLI module"
      ],
      "metadata": {
        "id": "XsCmFKl0vq8T"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# The training cells below invoke train.py by relative path, so make the benchmark directory the working directory.\n",
        "%cd /content/gli/benchmarks/NodeClassification"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1A7z0jkrvtrX",
        "outputId": "b36d67fb-50a3-46be-ea43-25a6b91304ba"
      },
      "execution_count": 236,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/gli/benchmarks/NodeClassification\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Train GCN (no extra config flags) on each of the five gw_gini_<k>_0 datasets.\n",
        "# NOTE(review): the dataset prefix here is gw_gini, not gw_var -- confirm this is the intended sweep.\n",
        "for k in range(5):\n",
        "  !python train.py --dataset gw_gini_{k}_0 --model GCN"
      ],
      "metadata": {
        "id": "PpJttN_cvzSJ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train GAT with configs/GAT.yaml on each of the five gw_gini_<k>_0 datasets.\n",
        "for k in range(5):\n",
        "  !python train.py --dataset gw_gini_{k}_0 --model GAT --model-cfg configs/GAT.yaml"
      ],
      "metadata": {
        "id": "b7Qv3sdw3R5I"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train GraphSAGE with configs/GraphSAGE.yaml on each of the five gw_gini_<k>_0 datasets.\n",
        "for k in range(5):\n",
        "  !python train.py --dataset gw_gini_{k}_0 --model GraphSAGE --model-cfg configs/GraphSAGE.yaml"
      ],
      "metadata": {
        "id": "wpmjMsV74HgR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train MoNet with configs/MoNet.yaml on each of the five gw_gini_<k>_0 datasets.\n",
        "for k in range(5):\n",
        "  !python train.py --dataset gw_gini_{k}_0 --model MoNet --model-cfg configs/MoNet.yaml"
      ],
      "metadata": {
        "id": "fHrlzbclOSrg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train MixHop with configs/MixHop.yaml on each of the five gw_gini_<k>_0 datasets.\n",
        "for k in range(5):\n",
        "  !python train.py --dataset gw_gini_{k}_0 --model MixHop --model-cfg configs/MixHop.yaml"
      ],
      "metadata": {
        "id": "fH12D9hW4WRR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train LINKX with its model and training configs on each of the five gw_gini_<k>_0 datasets.\n",
        "for k in range(5):\n",
        "  !python train.py --dataset gw_gini_{k}_0 --model LINKX --model-cfg configs/LINKX.yaml --train-cfg configs/LINKX_train.yaml"
      ],
      "metadata": {
        "id": "ZKuqBuZd4cs1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train MLP (no extra config flags) on each of the five gw_gini_<k>_0 datasets.\n",
        "for k in range(5):\n",
        "  !python train.py --dataset gw_gini_{k}_0 --model MLP"
      ],
      "metadata": {
        "id": "U_oehrfpv_oE"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}