{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generated Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import scipy as sp\n",
    "import numpy as np\n",
    "from graph_generation.data import synthetic_graphs\n",
    "import torch as th\n",
    "from pathlib import Path\n",
    "import pickle\n",
    "\n",
    "\n",
    "data_path = Path(\"../data\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Spectre Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_dataset(graphs):\n",
    "    \"\"\"Randomly partition `graphs` into train, validation and test lists.\n",
    "\n",
    "    The split sizes follow the snippet from \"github.com/KarolisMart/SPECTRE\":\n",
    "    20% of the graphs (rounded) form the test set, 80% of the remainder\n",
    "    (rounded) the training set, and whatever is left the validation set.\n",
    "    The shuffle is driven by a torch generator seeded with 1234.\n",
    "\n",
    "    Returns:\n",
    "        A `(train, val, test)` tuple of lists holding the original objects.\n",
    "    \"\"\"\n",
    "    total = len(graphs)\n",
    "    n_test = int(round(total * 0.2))\n",
    "    n_train = int(round((total - n_test) * 0.8))\n",
    "    n_val = total - n_train - n_test\n",
    "\n",
    "    rng = th.Generator().manual_seed(1234)\n",
    "    subsets = th.utils.data.random_split(graphs, [n_train, n_val, n_test], generator=rng)\n",
    "\n",
    "    # Turn each Subset into a plain list by looking up its shuffled indices.\n",
    "    return tuple([graphs[idx] for idx in subset.indices] for subset in subsets)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Synthetic Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# File inside `data_path / \"spectre\"` to convert; use \"sbm_200.pt\" for the SBM dataset.\n",
    "name = \"planar_64_200.pt\"\n",
    "# Name the output after the dataset prefix (\"planar\", \"sbm\") so that switching\n",
    "# `name` cannot silently overwrite planar.pkl with graphs of another dataset.\n",
    "out_path = data_path / f\"{name.split('_')[0]}.pkl\"\n",
    "\n",
    "# Element 0 of the loaded object is iterated as the collection of adjacency tensors.\n",
    "adjs = th.load(data_path / \"spectre/\" / name)[0]\n",
    "graphs = [nx.from_numpy_array(adj.numpy().astype(bool)) for adj in adjs]\n",
    "\n",
    "train, val, test = split_dataset(graphs)\n",
    "dataset = {\n",
    "    \"train\": train,\n",
    "    \"val\": val,\n",
    "    \"test\": test,\n",
    "}\n",
    "\n",
    "# save the dataset\n",
    "with open(out_path, \"wb\") as f:\n",
    "    pickle.dump(dataset, f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Protein and Point Cloud\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load(min_size, max_size, name, largest_cc=False):\n",
    "    \"\"\"Read the graph collection `name` from disk and split it into train/val/test.\n",
    "\n",
    "    Two comma-separated text files under `data_path / name` are read:\n",
    "    `{name}_A.txt` (one edge per row, as a pair of node ids) and\n",
    "    `{name}_graph_indicator.txt` (row k holds the graph id of node k, both\n",
    "    counted from 1).\n",
    "\n",
    "    Only nodes that occur in the edge list end up in a graph, so the size\n",
    "    filter counts those nodes. Summary statistics of the kept graphs are printed.\n",
    "\n",
    "    Args:\n",
    "        min_size: Smallest number of nodes (inclusive) a graph may have to be kept.\n",
    "        max_size: Largest number of nodes (inclusive) a graph may have to be kept.\n",
    "        name: Dataset directory and file prefix, e.g. \"DD\".\n",
    "        largest_cc: If True, a disconnected graph is replaced by its largest\n",
    "            connected component. The size filter is applied before this step.\n",
    "\n",
    "    Returns:\n",
    "        Dict with the keys \"train\", \"val\" and \"test\", each a list of networkx\n",
    "        graphs, as produced by `split_dataset`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If no graph passes the size filter.\n",
    "    \"\"\"\n",
    "    with open(data_path / f\"{name}/{name}_A.txt\", \"rb\") as f:\n",
    "        data_adj = np.loadtxt(f, delimiter=',').astype(int)\n",
    "\n",
    "    with open(data_path / f\"{name}/{name}_graph_indicator.txt\", \"rb\") as f:\n",
    "        data_graph_indicator = np.loadtxt(f, delimiter=',').astype(int)\n",
    "\n",
    "    # One graph holding the edges of the whole collection, without self-loops.\n",
    "    G = nx.Graph()\n",
    "    G.add_edges_from(map(tuple, data_adj))\n",
    "    # Materialise the self-loops first so the graph is not modified while iterating over it.\n",
    "    G.remove_edges_from(list(nx.selfloop_edges(G)))\n",
    "\n",
    "    # Split into graphs: node ids and graph ids both start at 1.\n",
    "    graph_num = int(data_graph_indicator.max())\n",
    "    node_list = np.arange(data_graph_indicator.shape[0]) + 1\n",
    "\n",
    "    graphs = []\n",
    "    for graph_id in range(1, graph_num + 1):\n",
    "        nodes = node_list[data_graph_indicator == graph_id]\n",
    "        G_sub = G.subgraph(nodes)\n",
    "        num_nodes = G_sub.number_of_nodes()\n",
    "        # An empty subgraph would pass the filter when min_size is 0, but networkx\n",
    "        # can neither test its connectivity nor convert it to a sparse array.\n",
    "        if num_nodes == 0 or not (min_size <= num_nodes <= max_size):\n",
    "            continue\n",
    "        if largest_cc and not nx.is_connected(G_sub):\n",
    "            G_sub = G_sub.subgraph(max(nx.connected_components(G_sub), key=len))\n",
    "        # Rebuild the graph from its boolean adjacency matrix.\n",
    "        adj = nx.to_scipy_sparse_array(G_sub).astype(bool)\n",
    "        graphs.append(nx.from_scipy_sparse_array(adj))\n",
    "\n",
    "    if not graphs:\n",
    "        raise ValueError(\n",
    "            f\"no graph in '{name}' has between {min_size} and {max_size} nodes\"\n",
    "        )\n",
    "\n",
    "    size = [g.number_of_nodes() for g in graphs]\n",
    "    num_edges = [g.number_of_edges() for g in graphs]\n",
    "    for label, values in ((\"nodes\", size), (\"edges\", num_edges)):\n",
    "        print(f\"max {label}: {max(values)}\")\n",
    "        print(f\"min {label}: {min(values)}\")\n",
    "        print(f\"avg {label}: {np.mean(values)}\")\n",
    "        print(f\"std {label}: {np.std(values)}\")\n",
    "\n",
    "    train, val, test = split_dataset(graphs)\n",
    "    return {\n",
    "        \"train\": train,\n",
    "        \"val\": val,\n",
    "        \"test\": test,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Protein: graphs of the \"DD\" collection with 100 to 500 nodes, components left untouched.\n",
    "min_size, max_size = 100, 500\n",
    "dataset = load(min_size, max_size, \"DD\", largest_cc=False)\n",
    "\n",
    "# Store the train/val/test split.\n",
    "with (data_path / \"protein.pkl\").open(\"wb\") as f:\n",
    "    pickle.dump(dataset, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max nodes: 5037\n",
      "min nodes: 56\n",
      "avg nodes: 1332.2682926829268\n",
      "std nodes: 1188.2261635833115\n",
      "max edges: 10886\n",
      "min edges: 117\n",
      "avg edges: 2970.7073170731705\n",
      "std edges: 2612.1797667558985\n"
     ]
    }
   ],
   "source": [
    "# Point Cloud: \"FIRSTMM_DB\" graphs with up to 10000 nodes; disconnected graphs\n",
    "# are reduced to their largest connected component.\n",
    "min_size, max_size = 0, 10000\n",
    "dataset = load(min_size, max_size, \"FIRSTMM_DB\", largest_cc=True)\n",
    "\n",
    "# Store the train/val/test split.\n",
    "with (data_path / \"point_cloud.pkl\").open(\"wb\") as f:\n",
    "    pickle.dump(dataset, f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Our Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of graphs requested from the generators in the cells below\n",
    "# (128 + 32 + 40 = 200 when each split is generated once).\n",
    "train_len = 128\n",
    "test_len = 40\n",
    "val_len = 32"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each split requests trees with min_size == max_size == 64 and uses its own seed.\n",
    "split_specs = {\"train\": (train_len, 0), \"val\": (val_len, 1), \"test\": (test_len, 2)}\n",
    "\n",
    "dataset = {\n",
    "    split: synthetic_graphs.generate_tree_graphs(num_graphs=count, min_size=64, max_size=64, seed=seed)\n",
    "    for split, (count, seed) in split_specs.items()\n",
    "}\n",
    "train_graphs, val_graphs, test_graphs = dataset[\"train\"], dataset[\"val\"], dataset[\"test\"]\n",
    "\n",
    "# save the dataset\n",
    "with (data_path / \"tree.pkl\").open(\"wb\") as f:\n",
    "    pickle.dump(dataset, f)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extrapolation & Interpolation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (min_size, max_size) pairs handed to the graph generator for the training set.\n",
    "# NOTE(review): the extrapolation list is not referenced by the cell below as written;\n",
    "# presumably it is swapped in by hand for an extrapolation variant - confirm.\n",
    "train_intervals_extrapolation = [(32, 64)]\n",
    "train_intervals_interpolation = [(32, 64), (128, 160)]\n",
    "# Fixed sizes for which validation and test graphs are generated.\n",
    "test_graph_sizes = [48, 64, 80, 96, 112, 128, 144]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "generator = synthetic_graphs.generate_tree_graphs\n",
    "\n",
    "# Bind the interval list explicitly: the loop below needs `train_intervals`, which no\n",
    "# other cell defines, so without this line the cell raises NameError on a fresh kernel.\n",
    "train_intervals = train_intervals_interpolation\n",
    "# The training budget is shared equally between the intervals.\n",
    "graphs_per_interval = train_len // len(train_intervals)\n",
    "\n",
    "train_graphs = []\n",
    "for min_nodes, max_nodes in train_intervals:\n",
    "    train_graphs += generator(num_graphs=graphs_per_interval, min_size=min_nodes, max_size=max_nodes, seed=0)\n",
    "\n",
    "# Validation and test sets: val_len / test_len graphs for every size in test_graph_sizes.\n",
    "val_graphs = []\n",
    "for size in test_graph_sizes:\n",
    "    val_graphs += generator(num_graphs=val_len, min_size=size, max_size=size, seed=1)\n",
    "\n",
    "test_graphs = []\n",
    "for size in test_graph_sizes:\n",
    "    test_graphs += generator(num_graphs=test_len, min_size=size, max_size=size, seed=2)\n",
    "\n",
    "dataset = {\n",
    "    \"train\": train_graphs,\n",
    "    \"val\": val_graphs,\n",
    "    \"test\": test_graphs,\n",
    "}\n",
    "\n",
    "# save the dataset\n",
    "with open(data_path / \"tree_interpolation.pkl\", \"wb\") as f:\n",
    "    pickle.dump(dataset, f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Minnesota"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the road network matrix and build a graph from its boolean version.\n",
    "adjacency = sp.io.mmread(data_path / \"road-minnesota.mtx\")\n",
    "road_graph = nx.from_scipy_sparse_array(adjacency.astype(bool))\n",
    "\n",
    "# only keep the largest connected component\n",
    "largest_component = max(nx.connected_components(road_graph), key=len)\n",
    "g = road_graph.subgraph(largest_component)\n",
    "graphs = [g]\n",
    "\n",
    "# A single graph: it is the whole training set, and val/test each repeat it four times.\n",
    "dataset = dict(train=graphs, val=graphs * 4, test=graphs * 4)\n",
    "\n",
    "# save the dataset\n",
    "with (data_path / \"minnesota.pkl\").open(\"wb\") as f:\n",
    "    pickle.dump(dataset, f)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training Set Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Deg.            -0.0000247601\n",
      "Clus.           0.1768286339\n",
      "Orbit           0.0048895833\n",
      "Spectral        0.0064234471\n",
      "Wavelet         0.0024285456\n",
      "Uniqueness      1.0000000000\n"
     ]
    }
   ],
   "source": [
    "from graph_generation import metrics\n",
    "\n",
    "# Metrics evaluated on the training set, with the test set as reference.\n",
    "validation_metrics = [\n",
    "    metrics.NodeDegree(),\n",
    "    metrics.ClusteringCoefficient(),\n",
    "    metrics.OrbitCount(),\n",
    "    metrics.Spectral(),\n",
    "    metrics.Wavelet(),\n",
    "    metrics.Uniqueness(),\n",
    "    metrics.Novelty(),\n",
    "]\n",
    "\n",
    "with (data_path / \"point_cloud.pkl\").open(\"rb\") as f:\n",
    "    dataset = pickle.load(f)\n",
    "\n",
    "train_graphs = dataset[\"train\"]\n",
    "test_graphs = dataset[\"test\"]\n",
    "\n",
    "# The training graphs are passed as the \"predicted\" set, so each score compares train against test.\n",
    "for metric in validation_metrics:\n",
    "    score = metric(\n",
    "        reference_graphs=test_graphs,\n",
    "        predicted_graphs=train_graphs,\n",
    "        train_graphs=train_graphs,\n",
    "    )\n",
    "    print(f\"{metric!s:<15} {score:.10f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "max nodes: 64\n",
      "avg nodes: 64.0\n",
      "max edges: 181\n",
      "avg edges: 177.835\n"
     ]
    }
   ],
   "source": [
    "with (data_path / \"planar.pkl\").open(\"rb\") as f:\n",
    "    dataset = pickle.load(f)\n",
    "\n",
    "# Statistics are computed over all three splits together.\n",
    "full_dataset = dataset[\"train\"] + dataset[\"val\"] + dataset[\"test\"]\n",
    "\n",
    "n, m = [], []\n",
    "for graph in full_dataset:\n",
    "    n.append(graph.number_of_nodes())\n",
    "    m.append(graph.number_of_edges())\n",
    "\n",
    "for label, counts in ((\"nodes\", n), (\"edges\", m)):\n",
    "    print(f\"max {label}: {max(counts)}\")\n",
    "    print(f\"avg {label}: {np.mean(counts)}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "graph-generation",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
